# Feature Extraction & Train Test Validation Split

Bag of Words and Sequence Vectors Feature Extraction

In [1]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text

import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline

In [2]:
path_df = "./Pickles/all_articles_processed.pickle"

with open(path_df, 'rb') as data:
    articles = pickle.load(data)

In [3]:
articles.head()

Unnamed: 0,source,title,article,category,category_code
0,The Straits Times,Body found in garbage chute area of Woodlands ...,singapore man body find grind floor rubbish c...,Singapore,1
1,The Straits Times,Formula One: Thai Alexander Albon given chance...,london afp thai formula one driver alexander ...,Sports,2
2,The Straits Times,The Straits Times bags 8 wins at Asian Digital...,singapore straits time bag eight award 8th as...,Singapore,1
3,The Straits Times,Games,ready challenge try daily sudoku crossword puz...,Lifestyle,3
4,The Straits Times,Hong Kong cancels all remaining Monday flights...,hong kong bloomberg hong kong airport authori...,World,4


## Train Test Validation Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(articles['article'], articles['category_code'], test_size=0.2, random_state=1)
X_test, X_validation, y_test, y_validation = train_test_split(X_test, y_test, test_size=0.5, random_state=1)

In [5]:
all_article_count = len(X_train)+len(X_test)+len(X_validation)
print('Articles in Training Dataset:',len(X_train),round(len(X_train)/all_article_count,2)*100,"%")
print('Articles in Testing Dataset:',len(X_test),round(len(X_test)/all_article_count,2)*100,"%")
print('Articles in Validation Dataset:',len(X_validation),round(len(X_validation)/all_article_count,2)*100,"%")
print('Total Number of Articles:',all_article_count)

Articles in Training Dataset: 5373 80.0 %
Articles in Testing Dataset: 672 10.0 %
Articles in Validation Dataset: 672 10.0 %
Total Number of Articles: 6717


# Tf-idf Encoding (Bag of Words)

In [6]:
# Vectorization parameters
# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (1, 2)

# Limit on the number of features. We use the top 20K features.
TOP_K = 20000

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'

# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

# Create keyword arguments to pass to the 'tf-idf' vectorizer.
kwargs = {
        'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
        'dtype': 'int32',
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': TOKEN_MODE,  # Split text into word tokens.
        'min_df': MIN_DOCUMENT_FREQUENCY,
}

In [7]:
vectorizer = TfidfVectorizer(**kwargs)

In [8]:
# Learn vocabulary from training texts and vectorize training texts.
tdidf_features_train = vectorizer.fit_transform(X_train)

# Vectorize testing texts.
tdidf_features_test = vectorizer.transform(X_test)

# Vectorize validation texts.
tdidf_features_validation = vectorizer.transform(X_validation)



In [9]:
# Select top 'k' of the vectorized features.
selector = SelectKBest(f_classif, k=min(TOP_K, tdidf_features_train.shape[1]))
selector.fit(tdidf_features_train, y_train)

tdidf_features_train = selector.transform(tdidf_features_train).astype('float32')
print('Training Matrix:',tdidf_features_train.shape)

tdidf_features_test = selector.transform(tdidf_features_test).astype('float32')
print('Test Matrix:',tdidf_features_test.shape)

tdidf_features_validation = selector.transform(tdidf_features_validation).astype('float32')
print('Validation Matrix:',tdidf_features_validation.shape)

Training Matrix: (5373, 20000)
Test Matrix: (672, 20000)
Validation Matrix: (672, 20000)


In [10]:
#Training Features
with open('Pickles/tdidf_training_features.pickle', 'wb') as output:
    pickle.dump(tdidf_features_train, output)
    
#Training Labels
with open('Pickles/tdidf_training_labels.pickle', 'wb') as output:
    pickle.dump(y_train, output)
    
#Test Features
with open('Pickles/tdidf_test_features.pickle', 'wb') as output:
    pickle.dump(tdidf_features_test, output)
    
#Test Labels
with open('Pickles/tdidf_test_labels.pickle', 'wb') as output:
    pickle.dump(y_test, output)
    
#Test Features
with open('Pickles/tdidf_validation_features.pickle', 'wb') as output:
    pickle.dump(tdidf_features_validation, output)
    
#Test Labels
with open('Pickles/tdidf_validation_labels.pickle', 'wb') as output:
    pickle.dump(y_validation, output)

# Sequence Vectors

In [11]:
# Vectorization parameters
# Limit on the number of features. We use the top 20K features.
TOP_K = 20000

# Limit on the length of text sequences. Sequences longer than this
# will be truncated.
MAX_SEQUENCE_LENGTH = 500

In [12]:
# Create vocabulary with training texts.
tokenizer = text.Tokenizer(num_words=TOP_K)
tokenizer.fit_on_texts(X_train)

In [13]:
tokenizer.texts_to_sequences("Hello World")

[[11648], [4566], [17898], [17898], [], [], [9060], [], [8210], [17898], []]

In [14]:
# Learn vocabulary from training texts and vectorize training texts.
sv_features_train = tokenizer.texts_to_sequences(X_train)

# Vectorize testing texts.
sv_features_test = tokenizer.texts_to_sequences(X_test)

# Vectorize validation texts.
sv_features_validation = tokenizer.texts_to_sequences(X_validation)

In [15]:
# Get max sequence length.
max_length = len(max(sv_features_train, key=len))
if max_length > MAX_SEQUENCE_LENGTH:
    max_length = MAX_SEQUENCE_LENGTH

In [16]:
# Fix sequence length to max value. Sequences shorter than the length are
# padded in the beginning and sequences longer are truncated
# at the beginning.
sv_features_train = sequence.pad_sequences(sv_features_train, maxlen=max_length)
sv_features_test = sequence.pad_sequences(sv_features_test, maxlen=max_length)
sv_features_validation = sequence.pad_sequences(sv_features_validation, maxlen=max_length)

In [17]:
print('Sequence Vector Training Matrix:',sv_features_train.shape)
print('Sequence Vector Testing Matrix:',sv_features_test.shape)
print('Sequence Vector Validation Matrix:',sv_features_validation.shape)

Sequence Vector Training Matrix: (5373, 500)
Sequence Vector Testing Matrix: (672, 500)
Sequence Vector Validation Matrix: (672, 500)


In [18]:
#Training Features
with open('Pickles/sv_training_features.pickle', 'wb') as output:
    pickle.dump(sv_features_train, output)
    
#Training Labels
with open('Pickles/sv_training_labels.pickle', 'wb') as output:
    pickle.dump(y_train, output)
    
#Test Features
with open('Pickles/sv_test_features.pickle', 'wb') as output:
    pickle.dump(sv_features_test, output)
    
#Test Labels
with open('Pickles/sv_test_labels.pickle', 'wb') as output:
    pickle.dump(y_test, output)
    
#Test Features
with open('Pickles/sv_validation_features.pickle', 'wb') as output:
    pickle.dump(sv_features_validation, output)
    
#Test Labels
with open('Pickles/sv_validation_labels.pickle', 'wb') as output:
    pickle.dump(y_validation, output)