In [1]:
import getAllData as gd
import numpy as np
import pandas as pd
import random
import json
import re

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB, ComplementNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.sentiment.util import mark_negation

In [2]:
# Load the merged review dataset; the first CSV column becomes the row index.
data = pd.read_csv('merged.csv', index_col=0)

# `token` is the bound tokenize method of a regex tokenizer whose pattern
# matches runs of ASCII letters and digits.
alnum_tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')
token = alnum_tokenizer.tokenize

In [7]:
# Peek at one raw review (row position 1) as a sanity check on the text column.
# It is kept under its own name rather than `stemmed`, which is the name used
# for the stemmed DataFrame built in a later cell, and it is left as the cell's
# last expression so the notebook actually displays it.
sample_review = data.iloc[1]['Summary and Review']
sample_review

'Five Stars Love these shoes! Great fit, very light weight.'

In [9]:
# Hand the stemming helper a copy so the original `data` frame object is not
# the one passed in, then build the bag-of-words features from the result.
# NOTE(review): gd.stemData and gd.bow are project helpers not visible here;
# presumably 'bow_stem.txt' is a file gd.bow reads or writes - confirm in
# getAllData before relying on it.
data_copy = data.copy()
stemmed = gd.stemData(data_copy)
bow_stem_data = gd.bow(stemmed, token, 'bow_stem.txt')

In [10]:
# Display the feature matrix's repr (shape, dtype, stored-element count and
# sparse format) as a quick sanity check before splitting.
bow_stem_data

<145148x49764 sparse matrix of type '<class 'numpy.int64'>'
	with 3435923 stored elements in Compressed Sparse Row format>

In [11]:
# Two-stage split with a fixed seed: hold out 20% of the rows as the test set,
# then carve 25% of the remainder off as the validation set (0.25 * 0.8 = 0.2),
# giving a 60/20/20 train/validation/test partition.
ratings = data['Rating']
X_train, X_test, y_train, y_test = train_test_split(
    bow_stem_data, ratings, test_size=0.2, random_state=1
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=1
)

In [6]:

# Candidate models, each paired with the label printed above its results.
# Estimators that draw random numbers while fitting (decision trees, LinearSVC)
# are seeded with the same value as the train/validation/test split so a
# Restart & Run All reproduces the same numbers.
classifiers = [
    (LogisticRegression(max_iter=1000), 'Logistic (max_iter:1000)'),
    (MultinomialNB(), 'Multinomial NB'),
    (LinearSVC(max_iter=15000, random_state=1), 'Linear SVC (max_iter:15000)'),
    (KNeighborsClassifier(n_neighbors=1), 'KNN K:1'),
    (KNeighborsClassifier(n_neighbors=3), 'KNN K:3'),
    (KNeighborsClassifier(n_neighbors=5), 'KNN K:5'),
    (DecisionTreeClassifier(random_state=1), 'DT (Baseline)'),
    (
        DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=1),
        'DT (criterion:entropy, max_depth:3)',
    ),
    (DecisionTreeClassifier(max_depth=5, random_state=1), 'DT (max_depth:5)'),
]

# Splits scored for every model, in the order they are reported.
# NOTE(review): the test split is scored for every candidate here; choose the
# model from the validation numbers so the test score stays an unbiased check.
eval_splits = [
    ('Train', X_train, y_train),
    ('Valid', X_val, y_val),
    ('Test', X_test, y_test),
]

for clf, name in classifiers:
    print('------------------------------------------------------')
    print(name)
    clf.fit(X_train, y_train)
    for split_name, X_split, y_split in eval_splits:
        y_pred = clf.predict(X_split)
        # scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so
        # the value is the same either way, but the documented order keeps this
        # call consistent with confusion_matrix below.
        accuracy = metrics.accuracy_score(y_split, y_pred)
        print('STEM BOW-' + split_name + ' ' + str(accuracy * 100))
        # Rows are true classes, columns are predicted classes.
        print(metrics.confusion_matrix(y_split, y_pred))

------------------------------------------------------
Logistic (max_iter:1000)
STEM BOW-Train 94.04740033069999
[[ 6934  4240]
 [  944 74970]]
STEM BOW-Valid 89.91732690320359
[[ 1695  2027]
 [  900 24408]]
STEM BOW-Test 90.11367550809507
[[ 1716  2063]
 [  807 24444]]
------------------------------------------------------
Multinomial NB
STEM BOW-Train 90.13871027007166
[[ 5215  5959]
 [ 2629 73285]]
STEM BOW-Valid 87.86427833275921
[[ 1435  2287]
 [ 1236 24072]]
STEM BOW-Test 88.34653806407164
[[ 1525  2254]
 [ 1129 24122]]
------------------------------------------------------
Linear SVC (max_iter:15000)




STEM BOW-Train 95.92595994855778
[[ 8290  2884]
 [  664 75250]]
STEM BOW-Valid 89.08026179813986
[[ 1763  1959]
 [ 1211 24097]]
STEM BOW-Test 89.14226662073716
[[ 1793  1986]
 [ 1166 24085]]
------------------------------------------------------
KNN K:1
STEM BOW-Train 99.04349623369465
[[10886   288]
 [  545 75369]]
STEM BOW-Valid 84.97760936961764
[[ 1293  2429]
 [ 1932 23376]]
STEM BOW-Test 84.8122631760248
[[ 1330  2449]
 [ 1960 23291]]
------------------------------------------------------
KNN K:3
STEM BOW-Train 91.74283483373141
[[ 5211  5963]
 [ 1228 74686]]
STEM BOW-Valid 87.7402686875646
[[  972  2750]
 [  809 24499]]
STEM BOW-Test 87.75404753703066
[[ 1005  2774]
 [  781 24470]]
------------------------------------------------------
KNN K:5
STEM BOW-Train 89.93661583685467
[[ 3517  7657]
 [ 1107 74807]]
STEM BOW-Valid 88.03651395108508
[[  810  2912]
 [  561 24747]]
STEM BOW-Test 88.03651395108508
[[  819  2960]
 [  513 24738]]
-------------------------------------------------

In [6]:
# Logistic regression on the stemmed bag-of-words features, scored on the
# validation split.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
predict = clf.predict(X_val)
# Metrics take (y_true, y_pred); accuracy is symmetric so the value is
# unchanged, but this matches the confusion_matrix call below.
accuracy_score = metrics.accuracy_score(y_val, predict)
print('STEM BOW-LOGISTIC ' + str(accuracy_score * 100))
# Rows are true classes, columns are predicted classes.
print(metrics.confusion_matrix(y_val, predict))

STEM BOW-LOGISTIC 90.23701253961693
[[ 1700  2012]
 [  822 24494]]


In [7]:
# Multinomial naive Bayes on the stemmed bag-of-words counts, scored on the
# validation split.
clfM = MultinomialNB()
clfM.fit(X_train, y_train)
predictM = clfM.predict(X_val)
# Metrics take (y_true, y_pred); accuracy is symmetric so the value is
# unchanged, but this matches the confusion_matrix call below.
accuracy_scoreM = metrics.accuracy_score(y_val, predictM)
print('STEM BOW-Multinomial ' + str(accuracy_scoreM * 100))
# Rows are true classes, columns are predicted classes.
print(metrics.confusion_matrix(y_val, predictM))

STEM BOW-Multinomial 88.35262505167425
[[ 1503  2209]
 [ 1172 24144]]


In [13]:
# SVM
# Linear SVM on the stemmed bag-of-words features, scored on the train,
# validation and test splits. Seeded with the same value as the data split so
# reruns give the same numbers.
clf = LinearSVC(max_iter=25000, random_state=1)
# Fit exactly once: calling fit again would only repeat the full training run.
clf.fit(X_train, y_train)

for split_name, X_split, y_split in [
    ('Train', X_train, y_train),
    ('Valid', X_val, y_val),
    ('Test', X_test, y_test),
]:
    y_pred = clf.predict(X_split)
    # Metrics take (y_true, y_pred); accuracy is symmetric so the value is
    # unchanged, but this matches the confusion_matrix call below.
    accuracy = metrics.accuracy_score(y_split, y_pred)
    print('STEM BOW-' + split_name + ' ' + str(accuracy * 100))
    # Rows are true classes, columns are predicted classes.
    print(metrics.confusion_matrix(y_split, y_pred))

STEM BOW-Train 95.93170126768327
[[ 8294  2880]
 [  663 75251]]
STEM BOW-Valid 89.07681708577334
[[ 1762  1960]
 [ 1211 24097]]
STEM BOW-Test 89.15949018256975
[[ 1795  1984]
 [ 1163 24088]]


In [10]:
# KNN (k = 1) on the stemmed bag-of-words features, scored on the validation
# split.
clfK = KNeighborsClassifier(n_neighbors=1)
clfK.fit(X_train, y_train)
predictK = clfK.predict(X_val)
# Metrics take (y_true, y_pred); accuracy is symmetric so the value is
# unchanged, but this matches the confusion_matrix call below.
accuracy_scoreK = metrics.accuracy_score(y_val, predictK)
print('STEM BOW-KNN-1 ' + str(accuracy_scoreK * 100))
# Rows are true classes, columns are predicted classes.
print(metrics.confusion_matrix(y_val, predictK))

STEM BOW-KNN-1 85.7895824720959
[[ 1102  2610]
 [ 1515 23801]]


In [11]:
# KNN (k = 3) on the stemmed bag-of-words features, scored on the validation
# split.
clfK = KNeighborsClassifier(n_neighbors=3)
clfK.fit(X_train, y_train)
predictK = clfK.predict(X_val)
# Metrics take (y_true, y_pred); accuracy is symmetric so the value is
# unchanged, but this matches the confusion_matrix call below.
accuracy_scoreK = metrics.accuracy_score(y_val, predictK)
print('STEM BOW-KNN-3 ' + str(accuracy_scoreK * 100))
# Rows are true classes, columns are predicted classes.
print(metrics.confusion_matrix(y_val, predictK))

STEM BOW-KNN-3 87.91856138900373
[[  825  2887]
 [  620 24696]]


In [12]:
# KNN (k = 5) on the stemmed bag-of-words features, scored on the validation
# split.
clfK = KNeighborsClassifier(n_neighbors=5)
clfK.fit(X_train, y_train)
predictK = clfK.predict(X_val)
# Metrics take (y_true, y_pred); accuracy is symmetric so the value is
# unchanged, but this matches the confusion_matrix call below.
accuracy_scoreK = metrics.accuracy_score(y_val, predictK)
print('STEM BOW-KNN-5 ' + str(accuracy_scoreK * 100))
# Rows are true classes, columns are predicted classes.
print(metrics.confusion_matrix(y_val, predictK))

STEM BOW-KNN-5 87.95301088604106
[[  667  3045]
 [  452 24864]]


In [13]:
# DECISION TREE (baseline, default hyperparameters), scored on the validation
# split. random_state is fixed (same seed as the data split) because tree
# fitting permutes features at each split, so an unseeded tree can differ
# between runs.
clfD = DecisionTreeClassifier(random_state=1)
clfD.fit(X_train, y_train)
predictD = clfD.predict(X_val)
# Metrics take (y_true, y_pred); accuracy is symmetric so the value is
# unchanged, but this matches the confusion_matrix call below.
accuracy_scoreD = metrics.accuracy_score(y_val, predictD)
print('STEM BOW-Decision Tree ' + str(accuracy_scoreD * 100))
# Rows are true classes, columns are predicted classes.
print(metrics.confusion_matrix(y_val, predictD))

STEM BOW-Decision Tree 86.62326030039962
[[ 1599  2113]
 [ 1770 23546]]


In [14]:
# DECISION TREE (entropy criterion, depth capped at 3), scored on the
# validation split. random_state is fixed (same seed as the data split) so the
# fitted tree is reproducible across runs.
clfD = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=1)
clfD.fit(X_train, y_train)
predictD = clfD.predict(X_val)
# Metrics take (y_true, y_pred); accuracy is symmetric so the value is
# unchanged, but this matches the confusion_matrix call below.
accuracy_scoreD = metrics.accuracy_score(y_val, predictD)
print('STEM BOW-Decision Tree ' + str(accuracy_scoreD * 100))
# Rows are true classes, columns are predicted classes.
print(metrics.confusion_matrix(y_val, predictD))

STEM BOW-Decision Tree 87.68430480914978
[[  311  3401]
 [  174 25142]]


In [15]:
# DECISION TREE (entropy criterion, depth capped at 3, random splitter), scored
# on the validation split. The random splitter draws split thresholds at
# random, so random_state is fixed (same seed as the data split) to make the
# reported numbers reproducible.
clfD = DecisionTreeClassifier(
    criterion="entropy", max_depth=3, splitter="random", random_state=1
)
clfD.fit(X_train, y_train)
predictD = clfD.predict(X_val)
# Metrics take (y_true, y_pred); accuracy is symmetric so the value is
# unchanged, but this matches the confusion_matrix call below.
accuracy_scoreD = metrics.accuracy_score(y_val, predictD)
print('STEM BOW-Decision Tree ' + str(accuracy_scoreD * 100))
# Rows are true classes, columns are predicted classes.
print(metrics.confusion_matrix(y_val, predictD))

STEM BOW-Decision Tree 87.4638280281108
[[  266  3446]
 [  193 25123]]


In [19]:
# DECISION TREE (default criterion, depth capped at 5), scored on the
# validation split. random_state is fixed (same seed as the data split) so the
# fitted tree is reproducible across runs.
clfD = DecisionTreeClassifier(max_depth=5, random_state=1)
clfD.fit(X_train, y_train)
predictD = clfD.predict(X_val)
# Metrics take (y_true, y_pred); accuracy is symmetric so the value is
# unchanged, but this matches the confusion_matrix call below.
accuracy_scoreD = metrics.accuracy_score(y_val, predictD)
print('STEM BOW-Decision Tree ' + str(accuracy_scoreD * 100))
# Rows are true classes, columns are predicted classes.
print(metrics.confusion_matrix(y_val, predictD))

STEM BOW-Decision Tree 88.21482706352487
[[  568  3144]
 [  277 25039]]
