# Trying existing models

In [2]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from string import punctuation
from sklearn import svm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
#nltk.download('perluniprops')
from nltk import ngrams
from itertools import chain

import pickle

## Fetch even data

In [3]:
with open('X_train_even.pickle', 'rb') as handle:
    X_train_even = pickle.load(handle)
    
# with open('X_test_even.pickle', 'rb') as handle:
#     X_test_even = pickle.load(handle)
    
with open('Y_train_even.pickle', 'rb') as handle:
    Y_train_even = pickle.load(handle)
    
# with open('Y_test_even.pickle', 'rb') as handle:
#     Y_test_even = pickle.load(handle)

print('Total number of even training data points: %d' %len(X_train_even))

Total number of even training data points: 199483


## Existing model on even data

#### Create a smaller data set for testing

In [4]:
lesser_number = 10000

small_Y = Y_train_even[0:lesser_number]
small_X = X_train_even[0:lesser_number]
neg_share = len(small_Y[small_Y==1])/lesser_number

print('The share of negative reviews:')
print(neg_share)
print('It is approximately even!')

The share of negative reviews:
0.4924
It is approximately even!


#### Get words from dataset

In [22]:
a = 'ZERO'
b = 'ZERO'
a is not b

False

In [9]:
with open('ix_to_word.pickle', 'rb') as handle:
    ix_to_word = pickle.load(handle)

In [33]:
small_X_text = []
for row in small_X:
    small_X_text_vec = []
    for value in row:
        tmp = ix_to_word[str(value[0])]
        if tmp != 'ZERO':
            small_X_text_vec.append(tmp)
    small_X_text.append(' '.join(small_X_text_vec))

### Create the model

In [34]:
df_X = pd.DataFrame(data={'small_X_text': small_X_text})   
print(str(small_X_text[0]))
df_X.head()

tge first bag i had was good and a nice change from chips but as i had these every day for lunch for a month i quickly tired of them the jalepeno one is the best they all leave a residue on your fingers


Unnamed: 0,small_X_text
0,tge first bag i had was good and a nice change...
1,when i first got them he gulped them down but ...
2,i've bought these at the local supermarket and...
3,this extract doesn't smell like roses at all i...
4,we used these in the hospital and several staf...


In [37]:
c = CountVectorizer(stop_words = 'english')

def text_fit(X, y, model, clf_model, coef_show=1):
    
    # Extract features for the data using 'model'
    X_c = model.fit_transform(X)
    print('# features: {}'.format(X_c.shape[1]))
    # Validation data split
    X_train, X_test, y_train, y_test = train_test_split(X_c, y, random_state=0)
    print('# train records: {}'.format(X_train.shape[0]))
    print('# test records: {}'.format(X_test.shape[0]))
    
    # Fit on the features with 'clf_model'
    clf = clf_model.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    print ('Model validation accuracy: {}'.format(acc))

    if coef_show == 1: 
        w = model.get_feature_names()
        coef = clf.coef_.tolist()[0]
        coeff_df = pd.DataFrame({'Word' : w, 'Coefficient' : coef})
        coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
        print('')
        print('-Top 20 positive-')
        print(coeff_df.head(20).to_string(index=False))
        print('')
        print('-Top 20 negative-')        
        print(coeff_df.tail(20).to_string(index=False))
    
df_X = pd.DataFrame(data={'small_X_text': small_X_text})    
X_for_fit = df_X['small_X_text']
text_fit(X_for_fit, small_Y, c, LogisticRegression())

# features: 19608
# train records: 7500
# test records: 2500
Model validation accuracy: 0.7988

-Top 20 positive-
Word  Coefficient
    perfect     1.845256
     hooked     1.837109
      yummy     1.773985
     highly     1.693832
  wonderful     1.665455
       glad     1.594868
  excellent     1.586839
    british     1.537837
         em     1.506473
        yum     1.497987
  delicious     1.491765
    pleased     1.470227
       best     1.457251
    awesome     1.442297
   enjoying     1.435469
      thank     1.394711
      great     1.323546
      table     1.280591
      loves     1.230550
complaining     1.221705

-Top 20 negative-
Word  Coefficient
          weak    -1.262285
         guess    -1.268734
disappointment    -1.287900
         hulls    -1.300982
            99    -1.305603
       expired    -1.339663
         money    -1.356984
         cheap    -1.359928
       concept    -1.379449
      thinking    -1.382627
         batch    -1.389653
          okay    -1.39

In [38]:
# Like CountVectorizer but it also applies 
tfidf = TfidfVectorizer(stop_words = 'english')

text_fit(X_for_fit, small_Y, tfidf, LogisticRegression())

# features: 19608
# train records: 7500
# test records: 2500
Model validation accuracy: 0.8136

-Top 20 positive-
Word  Coefficient
    great     6.599408
     best     4.992030
     love     4.488849
  perfect     4.289590
delicious     4.187150
    loves     3.443353
wonderful     3.419988
excellent     3.177714
   highly     3.157504
 favorite     2.991713
     nice     2.860930
    happy     2.780284
     good     2.678220
    yummy     2.637004
     glad     2.349891
    loved     2.313805
     easy     2.263957
  pleased     2.227835
    thank     2.225404
   smooth     2.197198

-Top 20 negative-
Word  Coefficient
          won    -2.077030
  description    -2.077432
       wouldn    -2.085996
        awful    -2.094545
     terrible    -2.099308
        stale    -2.158146
        taste    -2.190664
       return    -2.209147
         weak    -2.242227
         didn    -2.250201
           ok    -2.272605
         okay    -2.346166
     horrible    -2.353214
        worst    -2.