# Feature Extraction & Train Test Validation Split

Bag of Words and Sequence Vectors Feature Extraction

In [7]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline

In [8]:
path_df = "./Pickles/all_articles_processed.pickle"

with open(path_df, 'rb') as data:
    articles = pickle.load(data)

In [9]:
articles.head()

Unnamed: 0,source,title,article,category,category_code
0,The Straits Times,Body found in garbage chute area of Woodlands ...,singapore man body find grind floor rubbish c...,Singapore,1
1,The Straits Times,Formula One: Thai Alexander Albon given chance...,london afp thai formula one driver alexander ...,Sports,2
2,The Straits Times,The Straits Times bags 8 wins at Asian Digital...,singapore straits time bag eight award 8th as...,Singapore,1
3,The Straits Times,Games,ready challenge try daily sudoku crossword puz...,Lifestyle,3
4,The Straits Times,Hong Kong cancels all remaining Monday flights...,hong kong bloomberg hong kong airport authori...,World,4


# Bag of Words

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Vectorization parameters
# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (1, 2)

# Limit on the number of features. We use the top 20K features.
TOP_K = 20000

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'

# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

def ngram_vectorize(train_texts, train_labels, val_texts):
    """Vectorizes texts as n-gram vectors.

    1 text = 1 tf-idf vector the length of vocabulary of unigrams + bigrams.

    # Arguments
        train_texts: list, training text strings.
        train_labels: np.ndarray, training labels.
        val_texts: list, validation text strings.

    # Returns
        x_train, x_val: vectorized training and validation texts
    """
    # Create keyword arguments to pass to the 'tf-idf' vectorizer.
    kwargs = {
            'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
            'dtype': 'int32',
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': TOKEN_MODE,  # Split text into word tokens.
            'min_df': MIN_DOCUMENT_FREQUENCY,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    # Learn vocabulary from training texts and vectorize training texts.
    x_train = vectorizer.fit_transform(train_texts)

    # Vectorize validation texts.
    x_val = vectorizer.transform(val_texts)

    # Select top 'k' of the vectorized features.
    selector = SelectKBest(f_classif, k=min(TOP_K, x_train.shape[1]))
    selector.fit(x_train, train_labels)
    x_train = selector.transform(x_train).astype('float32')
    x_val = selector.transform(x_val).astype('float32')
    return x_train, x_val

In [11]:
bow_transformer = CountVectorizer(analyzer=articles).fit(articles['article'])
print(len(bow_transformer.vocabulary_))

  result = method(y)


ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
message4 = article_text['article'][3]
print(message4)

In [None]:
bow4 = bow_transformer.transform([message4])
print(bow4)
print(bow4.shape)

In [None]:
print(bow_transformer.get_feature_names()[4073])
print(bow_transformer.get_feature_names()[9570])

In [None]:
messages_bow = bow_transformer.transform(article_text['article'])

In [None]:
print('Shape of Sparse Matrix: ', messages_bow.shape)
print('Amount of Non-Zero occurences: ', messages_bow.nnz)

Bag of Words methods will be used

## TD-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(messages_bow)
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

In [None]:
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['university']])

In [None]:
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

In [None]:
messages_tfidf

# TfidfVectorizer

In [None]:
# settings that you use for count vectorizer will go here
tfidf_vectorizer=TfidfVectorizer(use_idf=True)
 
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(article_text['article'])

In [None]:
# get the first vector out (for the first document)
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0]
 
# place tf-idf values in a pandas data frame
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

In [None]:
# TF-IDF object
with open('Pickles/tfidf.pickle', 'wb') as output:
    pickle.dump(messages_tfidf, output)

In [None]:
proceessedData

In [None]:
X_train, X_test, y_train, y_test = train_test_split(proceessedData['article'], proceessedData['Category_Code'], test_size=0.3, random_state=8)

In [None]:
# Parameter election
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 300

In [None]:
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range=ngram_range,
                        stop_words=None,
                        lowercase=False,
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
training_features = tfidf.fit_transform(X_train).toarray()
training_labels = y_train
print(training_features.shape)

test_features = tfidf.transform(X_test).toarray()
test_labels = y_test
print(test_features.shape)

In [None]:
#Training Features
with open('Pickles/training_features.pickle', 'wb') as output:
    pickle.dump(training_features, output)
    
#Training Labels
with open('Pickles/training_labels.pickle', 'wb') as output:
    pickle.dump(training_labels, output)
    
#Test Features
with open('Pickles/test_features.pickle', 'wb') as output:
    pickle.dump(test_features, output)
    
#Test Labels
with open('Pickles/test_labels.pickle', 'wb') as output:
    pickle.dump(test_labels, output)