# Trying existing models

In [2]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from string import punctuation
from sklearn import svm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
#nltk.download('perluniprops')
from nltk import ngrams
from itertools import chain

import pickle

## Fetch even data

In [3]:
with open('X_train_even.pickle', 'rb') as handle:
    X_train_even = pickle.load(handle)
    
# with open('X_test_even.pickle', 'rb') as handle:
#     X_test_even = pickle.load(handle)
    
with open('Y_train_even.pickle', 'rb') as handle:
    Y_train_even = pickle.load(handle)
    
# with open('Y_test_even.pickle', 'rb') as handle:
#     Y_test_even = pickle.load(handle)

print('Total number of even training data points: %d' %len(X_train_even))

Total number of even training data points: 199483


## Existing model on even data

#### Create a smaller data set for testing

In [4]:
lesser_number = 10000

small_Y = Y_train_even[0:lesser_number]
small_X = X_train_even[0:lesser_number]
neg_share = len(small_Y[small_Y==1])/lesser_number

print('The share of negative reviews:')
print(neg_share)
print('It is approximately even!')

The share of negative reviews:
0.4924
It is approximately even!


#### Get words from dataset

In [4]:
with open('ix_to_word.pickle', 'rb') as handle:
    ix_to_word = pickle.load(handle)

In [33]:
small_X_text = []
for row in small_X:
    small_X_text_vec = []
    for value in row:
        tmp = ix_to_word[str(value[0])]
        if tmp != 'ZERO':
            small_X_text_vec.append(tmp)
    small_X_text.append(' '.join(small_X_text_vec))

In [6]:
X_text = []
for row in X_train_even:
    X_text_vec = []
    for value in row:
        tmp = ix_to_word[str(value[0])]
        if tmp != 'ZERO':
            X_text_vec.append(tmp)
    X_text.append(' '.join(X_text_vec))

### Create the model

In [9]:
#df_X = pd.DataFrame(data={'small_X_text': small_X_text})   
#print(str(small_X_text[0]))
df_X = pd.DataFrame(data={'X_text': X_text})
df_X.head()

Unnamed: 0,X_text
0,tge first bag i had was good and a nice change...
1,when i first got them he gulped them down but ...
2,i've bought these at the local supermarket and...
3,this extract doesn't smell like roses at all i...
4,my 11 month old has been eating these since sh...


In [10]:
c = CountVectorizer(stop_words = 'english')

def text_fit(X, y, model, clf_model, coef_show=1):
    
    # Extract features for the data using 'model'
    X_c = model.fit_transform(X)
    print('# features: {}'.format(X_c.shape[1]))
    # Validation data split
    X_train, X_test, y_train, y_test = train_test_split(X_c, y, random_state=0)
    print('# train records: {}'.format(X_train.shape[0]))
    print('# test records: {}'.format(X_test.shape[0]))
    
    # Fit on the features with 'clf_model'
    clf = clf_model.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    print ('Model validation accuracy: {}'.format(acc))

    if coef_show == 1: 
        w = model.get_feature_names()
        coef = clf.coef_.tolist()[0]
        coeff_df = pd.DataFrame({'Word' : w, 'Coefficient' : coef})
        coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
        print('')
        print('-Top 20 positive-')
        print(coeff_df.head(20).to_string(index=False))
        print('')
        print('-Top 20 negative-')        
        print(coeff_df.tail(20).to_string(index=False))
     
X_for_fit = df_X['X_text']
text_fit(X_for_fit, Y_train_even, c, LogisticRegression())

# features: 37967
# train records: 149612
# test records: 49871
Model validation accuracy: 0.8515570171041287

-Top 20 positive-
Word  Coefficient
    hahaha     2.624796
    tilted     2.564642
  outright     2.343050
 impatient     2.332832
 skeptical     2.280466
   blowout     2.271859
     youve     2.247146
  perruche     2.244868
   tribute     2.224237
   holster     2.220300
      team     2.191185
    hooked     2.156622
       eko     2.107698
pleasantly     2.085768
       h20     2.052638
    ticket     2.025280
 shellfish     2.021449
    eludes     2.007253
     hears     1.984752
     scare     1.957880

-Top 20 negative-
Word  Coefficient
   oversalted    -2.189888
      whitish    -2.190605
        alley    -2.203429
     feingold    -2.213227
  unimpressed    -2.218808
        eject    -2.251041
 underwhelmed    -2.258160
        schar    -2.283353
    foolproof    -2.312021
          ick    -2.331629
disappointing    -2.397969
     doughnut    -2.423711
        wors

In [12]:
# Like CountVectorizer but it also applies 
tfidf = TfidfVectorizer(stop_words = 'english')

text_fit(X_for_fit, Y_train_even, tfidf, LogisticRegression())

# features: 37967
# train records: 149612
# test records: 49871
Model validation accuracy: 0.848669567484109

-Top 20 positive-
Word  Coefficient
     great     8.928481
      best     8.630061
 delicious     8.506898
   perfect     8.328324
 excellent     7.250155
     loves     7.135029
    highly     6.803392
 wonderful     6.534695
      love     6.330726
    hooked     6.226441
   amazing     6.186373
   awesome     5.823596
      glad     5.790674
   pleased     5.666403
pleasantly     5.512839
 skeptical     5.392021
     yummy     5.326724
  favorite     5.151990
     thank     4.971193
    smooth     4.826595

-Top 20 negative-
Word  Coefficient
           bad    -4.626346
         maybe    -4.783946
         worse    -4.939055
        return    -5.143786
         stale    -5.254722
         bland    -5.309755
    disgusting    -5.504398
       thought    -5.519244
          weak    -5.520669
        hoping    -5.706549
disappointment    -6.003089
      terrible    -6.446847
 