<center> <h2> Benchmark Models

In this section, we train and evaluate a benchmark. We will fit a simple Naive Bayes Estimator to provide a baseline for comparison. Due to SageMaker's notebook limit instances, parts of this notebook had to be run on a local machine.

In [1]:
import os, random, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('./data/cleandata/data.csv')

In [3]:
data.head()

Unnamed: 0,title,text,label,processed
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,True,budget fight loom republican flip fiscal scrip...
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,True,military accept transgender recruit monday pen...
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,True,senior republican senator let mr mueller job w...
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,True,fbi russia probe helped australian diplomat ti...
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,True,trump want postal service charge much amazon s...


In [4]:
# Transform label to numeric
# Just in case the word 'label' appears in the dataset & see below
data['y_label'] = data['label'].apply(lambda x : ('True'==x)*1)
del data['label']

## I. Preprocessing

In [5]:
from collections import Counter
from utils.utils_metadata import get_structure
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [101]:
def get_frequencies(data):
    """
    Returns a Counter Object where key = word and value = count
    
    Input
    -----
    data (pandas.DataFrame) must have column processed where each cell are tokens joint in single string
    
    Output
    ------
    count (Counter)
    """
    count = Counter()
    for sentence in data['processed']:
        words = sentence.split()
        count.update(words)
    return count

def extract_vocabulary(data, size = 5000, shift = 0):
    """
    Returns a dictionary which maps a word to an identification integer
    
    Input
    -----
    data (pandas.DataFrame) must have column processed where each cell are tokens joint in single string
    size (int) size of vocabulary. Default = 5000
    shift (int) start. Default = 0
    
    Output
    ------
    vocabulary (dict)
    """    
    vocabulary = {}
    count = get_frequencies(data)
    for i,word in enumerate([key for key,value in count.most_common(size)]):
        vocabulary[word] = i + shift
    return vocabulary

def get_indices(N, proportion = [0.8,0.1,0.1], seed = 1):
    """
    Returns three lists of indices
    
    Input
    -----
    N (int) the size of the dataset
    proportion (list) list of three values which should sum to 1
    seed (int) the seed for random selection
    
    Output
    ------
    three lists indtrain, indval and indtest
    """
    np.random.seed(seed)
    indices = np.random.choice(np.arange(N),replace=False,size=N)
    ind1 = int(proportion[0]*N)
    ind2 = int(proportion[0]*N) + int(proportion[1]*N)
    return indices[:ind1], indices[ind1:ind2], indices[ind2:]

def normalize(data, fixed, estimator = None):
    """
    Returns either the normalized data if the estimator != None. Otherwise, the normalization estimator.
    
    Input
    -----
    data (pandas.DataFrame) a dataframe to fit or to transform 
    fixed (list) list of columns to not transform
    estimator (trained estimator) if None, the function will fit an estimator and return it
    
    Output
    ------
    Either a transformed dataframe or a trained estimator
    """
    features = [column for column in data.columns if column not in fixed]
    if estimator == None:
        # fit and return estimator
        norm = MinMaxScaler()
        norm.fit(data[features].values)
        return norm
    else:
        data = data.copy()
        data[features] = estimator.transform(data[features].values)
        return data

def build_benchmark_data(data, size = 5000):
    """
    Returns the training, validation and test sets and a set of model artifacts
    
    Input
    -----
    data (pandas.DataFrame) the data with the processed column
    size (int) size of the vocabulary
    
    Output
    ------
    (x_train,y_train) , (x_val,y_val) , (x_test,y_test) : (tuple) features,labels
    
    AND
    
    artifacts : (dict) with the training, validation and test indices ; tf-idf estimator ; vocabulary
    """
    indtrain, indval, indtest = get_indices(len(data))
    Xtrain = data.loc[indtrain] ; Xvalidation = data.loc[indval] ; Xtest = data.loc[indtest]
    vocabulary = extract_vocabulary(Xtrain, size)
    tfidf = TfidfVectorizer(preprocessor = lambda x:x, tokenizer = lambda x:x.split(), vocabulary = vocabulary)
    
    # Fit on train and transform for validation and test
    wordtrain = pd.DataFrame(tfidf.fit_transform(Xtrain['processed']).toarray(),columns =tfidf.vocabulary_)
    wordval = pd.DataFrame(tfidf.transform(Xvalidation['processed']).toarray(),columns = tfidf.vocabulary_)
    wordtest = pd.DataFrame(tfidf.transform(Xtest['processed']).toarray(),columns = tfidf.vocabulary_)
    
    # get_structure
    Xtrain = get_structure(Xtrain).reset_index(drop=True)
    Xvalidation = get_structure(Xvalidation).reset_index(drop=True)
    Xtest = get_structure(Xtest).reset_index(drop=True)
    
    # normalize structure by fitting on train 
    fixed = ['title','text','y_label','processed']
    estimator = normalize(Xtrain, fixed = fixed, estimator = None)
    Xtrain = normalize(Xtrain, fixed = fixed, estimator = estimator).reset_index(drop=True)
    Xvalidation = normalize(Xvalidation, fixed = fixed, estimator = estimator).reset_index(drop=True)
    Xtest = normalize(Xtest, fixed = fixed, estimator = estimator).reset_index(drop=True)
    
    # dataframes
    train = pd.concat([Xtrain,wordtrain],axis =1)
    val = pd.concat([Xvalidation,wordval], axis = 1)
    test = pd.concat([Xtest,wordtest],axis = 1)
    features = [column for column in train.columns if column not in fixed]
    
    # artifacts for future use and comparison
    artifacts = {'indtrain':indtrain,
                 'indval':indval,
                 'indtest':indtest,
                 'tfidf-estimator':tfidf,
                 'vocabulary':vocabulary}
    
    return (train[features],train['y_label']),(val[features],val['y_label']),(test[features],test['y_label']),artifacts

In [102]:
(x_train,y_train),(x_val,y_val),(x_test,y_test), artifacts = build_benchmark_data(data, size = 5000)

In [103]:
print('x_train has shape {} ; y_train has shape {}'.format(x_train.shape,y_train.shape))
print('x_validation has shape {} ; y_validation has shape {}'.format(x_val.shape,y_val.shape))
print('x_test has shape {} ; y_test has shape {}'.format(x_test.shape,y_test.shape))

x_train has shape (35918, 5008) ; y_train has shape (35918,)
x_validation has shape (4489, 5008) ; y_validation has shape (4489,)
x_test has shape (4491, 5008) ; y_test has shape (4491,)


In [104]:
x_train.head()

Unnamed: 0,title_length,title_uppercase,text_lowercase,avg_sent_length,text_uppercase_count,count_(?),count_(!),count_(#),count_(@),count_(-),...,homeless,discriminate,semi,dyer,restructuring,prohibited,coordinate,centrist,sway,espionage
0,0.158672,0.115477,0.940106,0.153331,0.009709,0.0,0.0,0.0,0.0,0.009852,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.313653,0.256744,0.96871,0.13224,0.006472,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.177122,0.053156,0.903456,0.115665,0.038835,0.010638,0.0,0.0,0.0,0.014778,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.199262,0.064712,0.910928,0.1102,0.009709,0.0,0.0,0.0,0.0,0.004926,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.553506,0.270613,0.939394,0.108379,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [105]:
zeros = sum(y_train == 0)
ones = sum(y_train == 1)
print('ratio of zeros to all : ',zeros/(ones+zeros))

ratio of zeros to all :  0.521493401637


In [106]:
zeros = sum(y_val == 0)
ones = sum(y_val == 1)
print('ratio of zeros to all : ',zeros/(ones+zeros))

ratio of zeros to all :  0.525952327913


In [107]:
zeros = sum(y_test == 0)
ones = sum(y_test == 1)
print('ratio of zeros to all : ',zeros/(ones+zeros))

ratio of zeros to all :  0.531952794478


Datasets are balanced

## II. Modelling

In [108]:
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.metrics import confusion_matrix

### Naive Bayes : NB(Struct + BOW)

In [109]:
from sklearn.naive_bayes import GaussianNB

In [110]:
clf1 = GaussianNB()
clf1.fit(x_train,y_train)

GaussianNB(priors=None)

In [111]:
def get_results(ytrue,pred):
    accuracy = accuracy_score(ytrue,pred)
    precision = precision_score(ytrue,pred)
    print('accuracy:',accuracy)
    print('precision:',precision)

In [112]:
predtrain = clf1.predict(x_train)
predval = clf1.predict(x_val)
predtest = clf1.predict(x_test)

In [113]:
get_results(y_train,predtrain)

accuracy: 0.933570911521
precision: 0.951387618176


In [114]:
get_results(y_val,predval)

accuracy: 0.931833370461
precision: 0.951437066402


In [115]:
get_results(y_test,predtest)

accuracy: 0.921843687375
precision: 0.934491315136


In [116]:
tn, fp, fn, tp = confusion_matrix(y_test,prediction).ravel()

In [117]:
print('True negative count :',tn)
print('False negative count :',fn)
print('True positive count :',tp)
print('False positive count :',fp)

True negative count : 1295
False negative count : 1100
True positive count : 1002
False positive count : 1094


### Naive Bayes : NB(BOW)

In [118]:
bow_shift = 10

In [119]:
x_train.iloc[:,bow_shift:].head()

Unnamed: 0,digit_token,trump,said,state,president,would,people,year,republican,one,...,homeless,discriminate,semi,dyer,restructuring,prohibited,coordinate,centrist,sway,espionage
0,0.040589,0.037359,0.101937,0.076857,0.054462,0.055709,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.109916,0.028563,0.037687,0.071215,0.072845,0.040068,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.125568,0.123832,0.019308,0.05095,0.0,0.098482,0.027084,0.0,0.097095,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.038559,0.050877,0.048069,0.0,0.05409,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.046693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
clf2 = GaussianNB()
clf2.fit(x_train.iloc[:,bow_shift:],y_train)

GaussianNB(priors=None)

In [121]:
predtrain2 = clf2.predict(x_train.iloc[:,bow_shift:])
predval2 = clf2.predict(x_val.iloc[:,bow_shift:])
predtest2 = clf2.predict(x_test.iloc[:,bow_shift:])

In [122]:
get_results(y_train,predtrain2)

accuracy: 0.931065204076
precision: 0.947605428102


In [123]:
get_results(y_val,predval2)

accuracy: 0.92760080196
precision: 0.943433349729


In [124]:
get_results(y_test,predtest2)

accuracy: 0.919839679359
precision: 0.932472691162


In [125]:
tn2, fp2, fn2, tp2 = confusion_matrix(y_test,predtest2).ravel()

In [126]:
print('True negative count :',tn2)
print('False negative count :',fn2)
print('True positive count :',tp2)
print('False positive count :',fp2)

True negative count : 2253
False negative count : 224
True positive count : 1878
False positive count : 136


### Naive Bayes : NB(Struct)

In [127]:
x_train.iloc[:,:bow_shift].head()

Unnamed: 0,title_length,title_uppercase,text_lowercase,avg_sent_length,text_uppercase_count,count_(?),count_(!),count_(#),count_(@),count_(-)
0,0.158672,0.115477,0.940106,0.153331,0.009709,0.0,0.0,0.0,0.0,0.009852
1,0.313653,0.256744,0.96871,0.13224,0.006472,0.0,0.0,0.0,0.0,0.0
2,0.177122,0.053156,0.903456,0.115665,0.038835,0.010638,0.0,0.0,0.0,0.014778
3,0.199262,0.064712,0.910928,0.1102,0.009709,0.0,0.0,0.0,0.0,0.004926
4,0.553506,0.270613,0.939394,0.108379,0.0,0.0,0.0,0.0,0.0,0.0


In [128]:
clf3 = GaussianNB()
clf3.fit(x_train.iloc[:,:bow_shift],y_train)

GaussianNB(priors=None)

In [129]:
predtrain3 = clf3.predict(x_train.iloc[:,:bow_shift])
predval3 = clf3.predict(x_val.iloc[:,:bow_shift])
predtest3 = clf3.predict(x_test.iloc[:,:bow_shift])

In [130]:
get_results(y_train,predtrain3)

accuracy: 0.948549473801
precision: 0.919533942345


In [131]:
get_results(y_val,predval3)

accuracy: 0.954778347071
precision: 0.926829268293


In [132]:
get_results(y_test,predtest3)

accuracy: 0.947673124026
precision: 0.916183682568


## III. Saving the data & other artifacts

In [134]:
import pickle

In [137]:
artifacts.keys()

dict_keys(['indtrain', 'indval', 'indtest', 'tfidf-estimator', 'vocabulary'])

In [153]:
# you can't pickle lambda functions...
def identity(x):
    return x

def tokenizetmp(x):
    return x.split()

In [154]:
artifacts['tfidf-estimator'].set_params(**{'preprocessor':identity,'tokenizer':tokenizetmp})

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function identity at 0x7f08d6a24598>,
        smooth_idf=True, stop_words=None, strip_accents=None,
        sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenizetmp at 0x7f08d6a24d08>, use_idf=True,
        vocabulary={'digit_token': 0, 'trump': 1, 'said': 2, 'state': 3, 'president': 4, 'would': 5, 'people': 6, 'year': 7, 'republican': 8, 'one': 9, 'new': 10, 'also': 11, 'obama': 12, 'clinton': 13, 'government': 14, 'house': 15, 'say': 16, 'reuters': 17, 'time': 18, 'donald': 19, 'reference_token': 20,...g': 4994, 'prohibited': 4995, 'coordinate': 4996, 'centrist': 4997, 'sway': 4998, 'espionage': 4999})

In [155]:
with open('./data/benchmark_artifacts.pickle', 'wb') as handle:
    pickle.dump(artifacts, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Testing pickling

In [158]:
with open('./data/benchmark_artifacts.pickle', 'rb') as handle:
    testpickle = pickle.load(handle)

In [159]:
testpickle.keys()

dict_keys(['indtrain', 'indval', 'indtest', 'tfidf-estimator', 'vocabulary'])

In [160]:
testpickle['indtrain'][:10]

array([ 4528, 31727, 10937, 13470, 40814, 12702,  2736, 31730, 42479, 26780])

In [162]:
testpickle['tfidf-estimator'].transform([data.loc[0,'processed']])

<1x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 265 stored elements in Compressed Sparse Row format>