# Feature Extraction & Train Test Validation Split

Bag of Words and Sequence Vectors Feature Extraction

In [21]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text

import matplotlib.pyplot as plt
import seaborn as sns
import pickle

%matplotlib inline

In [22]:
path_df = "./Pickles/all_articles_processed.pickle"

with open(path_df, 'rb') as data:
    articles = pickle.load(data)

In [23]:
articles.head()

Unnamed: 0,source,title,article,category,length_characters,length_words,category_code
0,AsiaOne,7 factors to consider when looking for an HDB ...,whether youre buy hdb resale flat firsttimer l...,Lifestyle,9771,1769,3
1,The Straits Times,Jung Joon-young first to be charged in K-pop s...,seoul first arrest kpop scandal singer jung ...,Lifestyle,2162,324,3
2,The Straits Times,Music mogul Dr Dre gets flak over boast that h...,los angeles look talk lack class netizens im...,Lifestyle,1108,182,3
3,Channel News Asia,,think colonel sanders quirky enough apparentl...,Lifestyle,1193,207,3
4,Channel News Asia,,mexican government question louis vuittons use...,Lifestyle,1484,231,3


## Train Test Validation Split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(articles['article'], articles['category_code'], test_size=0.2, random_state=1)
X_test, X_validation, y_test, y_validation = train_test_split(X_test, y_test, test_size=0.5, random_state=1)

In [25]:
all_article_count = len(X_train)+len(X_test)+len(X_validation)
print('Articles in Training Dataset:',len(X_train),round(len(X_train)/all_article_count,2)*100,"%")
print('Articles in Testing Dataset:',len(X_test),round(len(X_test)/all_article_count,2)*100,"%")
print('Articles in Validation Dataset:',len(X_validation),round(len(X_validation)/all_article_count,2)*100,"%")
print('Total Number of Articles:',all_article_count)

Articles in Training Dataset: 5760 80.0 %
Articles in Testing Dataset: 720 10.0 %
Articles in Validation Dataset: 720 10.0 %
Total Number of Articles: 7200


# Tf-idf Encoding (Bag of Words)

In [26]:
# Vectorization parameters
# Range (inclusive) of n-gram sizes for tokenizing text.
NGRAM_RANGE = (1, 2)

# Limit on the number of features. We use the top 20K features.
TOP_K = 1000

# Whether text should be split into word or character n-grams.
# One of 'word', 'char'.
TOKEN_MODE = 'word'

# Minimum document/corpus frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY = 2

# Create keyword arguments to pass to the 'tf-idf' vectorizer.
kwargs = {
        'ngram_range': NGRAM_RANGE,  # Use 1-grams + 2-grams.
        'dtype': 'int32',
        'strip_accents': 'unicode',
        'decode_error': 'replace',
        'analyzer': TOKEN_MODE,  # Split text into word tokens.
        'max_features': TOP_K,
        'min_df': MIN_DOCUMENT_FREQUENCY,
}

In [27]:
vectorizer = TfidfVectorizer(**kwargs)

In [28]:
# Learn vocabulary from training texts and vectorize training texts.
tdidf_features_train = vectorizer.fit_transform(X_train).toarray()
print(tdidf_features_train.shape)

# Vectorize testing texts.
tdidf_features_test = vectorizer.transform(X_test).toarray()
print(tdidf_features_test.shape)

# Vectorize validation texts.
tdidf_features_validation = vectorizer.transform(X_validation).toarray()
print(tdidf_features_validation.shape)



(5760, 1000)
(720, 1000)
(720, 1000)


In [29]:
print(y_train.shape)

(5760,)


# Select top 'k' of the vectorized features.
selector = SelectKBest(f_classif, k=min(TOP_K, tdidf_features_train.shape[1]))
selector.fit(tdidf_features_train, y_train)

tdidf_features_train = selector.transform(tdidf_features_train).astype('float32')
print('Training Matrix:',tdidf_features_train.shape)

tdidf_features_test = selector.transform(tdidf_features_test).astype('float32')
print('Test Matrix:',tdidf_features_test.shape)

tdidf_features_validation = selector.transform(tdidf_features_validation).astype('float32')
print('Validation Matrix:',tdidf_features_validation.shape)

## N-Grams

In [30]:
category_mapping = {
    'Singapore': 1,
    'Sports': 2,
    'Lifestyle': 3,
    'World': 4,
    'Business': 5,
    'Technology': 6
}

In [31]:
from sklearn.feature_selection import chi2
import numpy as np

for Product, category_id in sorted(category_mapping.items()):
    features_chi2 = chi2(tdidf_features_train, y_train == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(vectorizer.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}' category:".format(Product))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-5:])))
    print("")

# 'Business' category:
  . Most correlated unigrams:
. billion
. tariff
. trade
. per
. cent
  . Most correlated bigrams:
. chief executive
. president donald
. trade war
. us billion
. per cent

# 'Lifestyle' category:
  . Most correlated unigrams:
. story
. actor
. movie
. singer
. film
  . Most correlated bigrams:
. us million
. post share
. view post
. post instagram
. relate story

# 'Singapore' category:
  . Most correlated unigrams:
. man
. singaporeans
. mr
. jail
. singapore
  . Most correlated bigrams:
. world cup
. new york
. us billion
. say mr
. straits time

# 'Sports' category:
  . Most correlated unigrams:
. champion
. match
. cup
. league
. win
  . Most correlated bigrams:
. hong kong
. us billion
. per cent
. us open
. world cup

# 'Technology' category:
  . Most correlated unigrams:
. apple
. facebook
. network
. huawei
. percent
  . Most correlated bigrams:
. relate stories
. social media
. world cup
. relate story
. us billion

# 'World' category:
  . Most correlat

In [32]:
#Training Features
with open('Pickles/tdidf_training_features.pickle', 'wb') as output:
    pickle.dump(tdidf_features_train, output, protocol=4)
    
#Training Labels
with open('Pickles/tdidf_training_labels.pickle', 'wb') as output:
    pickle.dump(y_train, output, protocol=4)
    
#Test Features
with open('Pickles/tdidf_test_features.pickle', 'wb') as output:
    pickle.dump(tdidf_features_test, output, protocol=4)
    
#Test Labels
with open('Pickles/tdidf_test_labels.pickle', 'wb') as output:
    pickle.dump(y_test, output, protocol=4)
    
#Test Features
with open('Pickles/tdidf_validation_features.pickle', 'wb') as output:
    pickle.dump(tdidf_features_validation, output, protocol=4)
    
#Test Labels
with open('Pickles/tdidf_validation_labels.pickle', 'wb') as output:
    pickle.dump(y_validation, output, protocol=4)

# Sequence Vectors

In [33]:
# Vectorization parameters
# Limit on the number of features. We use the top 20K features.
TOP_K = 1000

# Limit on the length of text sequences. Sequences longer than this
# will be truncated.
MAX_SEQUENCE_LENGTH = 1000

In [34]:
# Create vocabulary with training texts.
tokenizer = text.Tokenizer(num_words=TOP_K)
tokenizer.fit_on_texts(X_train)

In [35]:
tokenizer.texts_to_sequences("Hello World")

[[], [], [], [], [], [], [], [], [], [], []]

In [36]:
# Learn vocabulary from training texts and vectorize training texts.
sv_features_train = tokenizer.texts_to_sequences(X_train)

# Vectorize testing texts.
sv_features_test = tokenizer.texts_to_sequences(X_test)

# Vectorize validation texts.
sv_features_validation = tokenizer.texts_to_sequences(X_validation)

In [37]:
# Get max sequence length.
max_length = len(max(sv_features_train, key=len))
if max_length > MAX_SEQUENCE_LENGTH:
    max_length = MAX_SEQUENCE_LENGTH

In [38]:
# Fix sequence length to max value. Sequences shorter than the length are
# padded in the beginning and sequences longer are truncated
# at the beginning.
sv_features_train = sequence.pad_sequences(sv_features_train, maxlen=max_length)
sv_features_test = sequence.pad_sequences(sv_features_test, maxlen=max_length)
sv_features_validation = sequence.pad_sequences(sv_features_validation, maxlen=max_length)

In [39]:
print('Sequence Vector Training Matrix:',sv_features_train.shape)
print('Sequence Vector Testing Matrix:',sv_features_test.shape)
print('Sequence Vector Validation Matrix:',sv_features_validation.shape)

Sequence Vector Training Matrix: (5760, 1000)
Sequence Vector Testing Matrix: (720, 1000)
Sequence Vector Validation Matrix: (720, 1000)


In [40]:
#Training Features
with open('Pickles/sv_training_features.pickle', 'wb') as output:
    pickle.dump(sv_features_train, output, protocol=4)
    
#Training Labels
with open('Pickles/sv_training_labels.pickle', 'wb') as output:
    pickle.dump(y_train, output, protocol=4)
    
#Test Features
with open('Pickles/sv_test_features.pickle', 'wb') as output:
    pickle.dump(sv_features_test, output, protocol=4)
    
#Test Labels
with open('Pickles/sv_test_labels.pickle', 'wb') as output:
    pickle.dump(y_test, output, protocol=4)
    
#Test Features
with open('Pickles/sv_validation_features.pickle', 'wb') as output:
    pickle.dump(sv_features_validation, output, protocol=4)
    
#Test Labels
with open('Pickles/sv_validation_labels.pickle', 'wb') as output:
    pickle.dump(y_validation, output, protocol=4)