In [103]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


In [104]:
# Read the three labelled review files. Each one is parsed as tab-separated
# with no header row, then given the same two column names.
column_names = ['text', 'positive y/n']

imdb_raw = pd.read_csv('imdb_labelled.txt', sep='\t', header=None)
amazon_raw = pd.read_csv('amazon_cells_labelled.txt', sep='\t', header=None)
yelp_raw = pd.read_csv('yelp_labelled.txt', sep='\t', header=None)

for frame in (imdb_raw, amazon_raw, yelp_raw):
    frame.columns = column_names

# Only the Amazon reviews feed the analysis below; the combined frame is left
# disabled. alltext_raw is a second name for amazon_raw (not a copy), so any
# column added to one is visible through the other.
#alltext_raw = pd.concat([yelp_raw, amazon_raw, imdb_raw])
alltext_raw = amazon_raw


In [105]:
# Version 1 - the full keyword set.
# NOTE: this is an alias of alltext_raw, not a copy, so the feature columns
# created here are added to the shared frame.
alltext_raw1 = alltext_raw

# NOTE: 'nice' and 'comfortable' each appear twice in this list.
pro_keywords1 = [
    'great', 'super', 'excellent', 'sturdy', 'satisfied', 'best',
    'good purchase', 'happy', 'good', 'nice', 'love', 'nice',
    'comfortable', 'tremendous', 'forever', 'awesome', 'fast', 'decent',
    'comfortable', 'flawless', 'helpful', 'wise', 'low price', 'easier',
    'well finished', 'very well', 'promptly', 'amazed', 'well', 'easy',
    'highly', 'again', 'good quality', 'high quality', 'smooth', 'pleasant',
    'ly recommend', 'i like', 'y like',
]

con_keywords1 = [
    'not happy', 'waste of', 'beware', 'disappoint', 'bad', 'worst',
    'flimsy', 'junk', 'avoid', 'poor', 't buy', 't recommend',
    'return', 'lacking', 'unhappy', 'rip off', 'a problem', 'the problem',
    'sucks', 'dead', 'break', 'mistake', 'broke', 'warning',
    'dying', 'died', 'difficult', 'not good', 'uncomfortable', 'ugly',
    'refund', 'unfortunate', 'defective', 'crap', 'cumbersome', 'hate',
    'worthless', 't work', 'complain', 'horrible', 'useless', 't like',
]

# One boolean column per keyword, named after the keyword: True where the
# review text contains it, ignoring case. Pro columns are added first, then con.
for keyword in pro_keywords1 + con_keywords1:
    alltext_raw1[keyword] = alltext_raw1['text'].str.contains(keyword, case=False)

In [106]:
# Version 2 - keeps only the most frequent features. alltext_raw.sum() was used
# to total the hits per feature and the lowest were dropped; the cutoff for
# removal was < 8 for the pro keywords and < 7 for the con keywords.
# NOTE: alias of alltext_raw, not a copy.
alltext_raw2 = alltext_raw

# NOTE: 'nice' appears twice in this list.
pro_keywords2 = [
    'great', 'excellent', 'best', 'happy', 'good', 'nice',
    'love', 'nice', 'comfortable', 'very well', 'well', 'easy',
    'highly', 'again', 'good quality', 'ly recommend',
]

con_keywords2 = [
    'disappoint', 'bad', 'worst', 'junk', 'poor', 't buy',
    'return', 'break', 'broke', 't work', 'horrible', 'useless',
]

# One boolean column per keyword: True where the review text contains the
# keyword, ignoring case. Pro columns are added first, then con.
for keyword in pro_keywords2 + con_keywords2:
    alltext_raw2[keyword] = alltext_raw2['text'].str.contains(keyword, case=False)



In [107]:
# Version 3 - Opposite design as version 2, removing the most frequent features
# (anything greater than 10 hits) to see how much of an effect it has.
#
# Work on an independent copy so that this version's feature columns are built
# from the version-3 lists below. If a keyword column is not created here, the
# later `test[pro_keywords3 + con_keywords3]` selection fails loudly instead of
# silently using a column another cell added to the shared alltext_raw frame.
alltext_raw3 = alltext_raw.copy()

pro_keywords3 = [
    'super', 'sturdy', 'satisfied', 'good purchase', 'tremendous', 'forever',
    'awesome', 'fast', 'decent', 'flawless', 'helpful', 'wise',
    'low price', 'easier', 'well finished', 'very well', 'promptly', 'amazed',
    'highly', 'good quality', 'high quality', 'smooth', 'pleasant', 'i like',
    'y like',
]

# Flag each version-3 pro keyword: True where the review text contains it,
# ignoring case. (This loop must iterate pro_keywords3, the list selected when
# the version-3 model is fitted.)
for key in pro_keywords3:
    alltext_raw3[key] = alltext_raw3['text'].str.contains(key, case=False)

con_keywords3 = [
    'not happy', 'waste of', 'beware', 't recommend', 'return', 'lacking',
    'unhappy', 'rip off', 'a problem', 'the problem', 'sucks', 'dead',
    'break', 'mistake', 'broke', 'warning', 'dying', 'died',
    'difficult', 'not good', 'uncomfortable', 'ugly', 'refund', 'unfortunate',
    'defective', 'crap', 'cumbersome', 'hate', 'worthless', 'complain',
    'horrible', 'useless', 't like', 'flimsy', 'junk', 'avoid',
]

# Same flagging for the version-3 con keywords.
for key in con_keywords3:
    alltext_raw3[key] = alltext_raw3['text'].str.contains(key, case=False)

In [108]:
# Version 4 - pro keywords only; the con list is deliberately empty.
# NOTE: alias of alltext_raw, not a copy.
alltext_raw4 = alltext_raw

# NOTE: 'nice' and 'comfortable' each appear twice in this list.
pro_keywords4 = [
    'great', 'super', 'excellent', 'sturdy', 'satisfied', 'best',
    'good purchase', 'happy', 'good', 'nice', 'love', 'nice',
    'comfortable', 'tremendous', 'forever', 'awesome', 'fast', 'decent',
    'comfortable', 'flawless', 'helpful', 'wise', 'low price', 'easier',
    'well finished', 'very well', 'promptly', 'amazed', 'well', 'easy',
    'highly', 'again', 'good quality', 'high quality', 'smooth', 'pleasant',
    'ly recommend', 'i like', 'y like',
]

con_keywords4 = []

# One boolean column per keyword: True where the review text contains the
# keyword, ignoring case. With an empty con list only pro columns are written.
for keyword in pro_keywords4 + con_keywords4:
    alltext_raw4[keyword] = alltext_raw4['text'].str.contains(keyword, case=False)

In [109]:
# Version 5 - con keywords only; the pro list is deliberately empty.
# NOTE: alias of alltext_raw, not a copy.
alltext_raw5 = alltext_raw

pro_keywords5 = []

con_keywords5 = [
    'not happy', 'waste of', 'beware', 'disappoint', 'bad', 'worst',
    'flimsy', 'junk', 'avoid', 'poor', 't buy', 't recommend',
    'return', 'lacking', 'unhappy', 'rip off', 'a problem', 'the problem',
    'sucks', 'dead', 'break', 'mistake', 'broke', 'warning',
    'dying', 'died', 'difficult', 'not good', 'uncomfortable', 'ugly',
    'refund', 'unfortunate', 'defective', 'crap', 'cumbersome', 'hate',
    'worthless', 't work', 'complain', 'horrible', 'useless', 't like',
]

# One boolean column per keyword: True where the review text contains the
# keyword, ignoring case. With an empty pro list only con columns are written.
for keyword in pro_keywords5 + con_keywords5:
    alltext_raw5[keyword] = alltext_raw5['text'].str.contains(keyword, case=False)

In [110]:
# Evaluate version 1: fit a Bernoulli naive Bayes model on the keyword columns
# and report in-sample errors, a 20% holdout score and 10-fold cross-validation.
test = alltext_raw1

feature_columns = pro_keywords1 + con_keywords1
data = test[feature_columns]
target = test['positive y/n']

# The features are boolean (keyword present / absent), which is why the
# Bernoulli variant of naive Bayes is used.
bnb = BernoulliNB()
bnb.fit(data, target)

# Predict on the same rows the model was just fitted on.
y_pred = bnb.predict(data)

# Report the in-sample error count.
print('VERSION 1')
mislabeled = (target != y_pred).sum()
print(f"Number of mislabeled points out of a total {data.shape[0]} points : {mislabeled}")
print(' ')

# Confusion matrix of true labels against the in-sample predictions.
print('Confusion matrix: ')
print(confusion_matrix(target, y_pred))
print(' ')

# Holdout: refit on 80% of the rows and score on the remaining 20%, then refit
# on every row and score on those same rows for comparison.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=20
)
print('Holdout:')
holdout_score = bnb.fit(X_train, y_train).score(X_test, y_test)
print(f'With 20% Holdout: {holdout_score}')
in_sample_score = bnb.fit(data, target).score(data, target)
print(f'Testing on Sample: {in_sample_score}')
print(' ')

# 10-fold cross-validation scores.
cv_scores = cross_val_score(bnb, data, target, cv=10)
print(f'Cross Validation: {cv_scores}')

VERSION 1
Number of mislabeled points out of a total 1000 points : 201
 
Confusion matrix: 
[[471  29]
 [172 328]]
 
Holdout:
With 20% Holdout: 0.77
Testing on Sample: 0.799
 
Cross Validation: [0.84 0.75 0.83 0.76 0.79 0.75 0.76 0.76 0.8  0.74]


In [111]:
# Evaluate version 2: fit a Bernoulli naive Bayes model on the keyword columns
# and report in-sample errors, a 20% holdout score and 10-fold cross-validation.
test = alltext_raw2

feature_columns = pro_keywords2 + con_keywords2
data = test[feature_columns]
target = test['positive y/n']

# The features are boolean (keyword present / absent), which is why the
# Bernoulli variant of naive Bayes is used.
bnb = BernoulliNB()
bnb.fit(data, target)

# Predict on the same rows the model was just fitted on.
y_pred = bnb.predict(data)

# Report the in-sample error count.
print('VERSION 2')
mislabeled = (target != y_pred).sum()
print(f"Number of mislabeled points out of a total {data.shape[0]} points : {mislabeled}")
print(' ')

# Confusion matrix of true labels against the in-sample predictions.
print('Confusion matrix: ')
print(confusion_matrix(target, y_pred))
print(' ')

# Holdout: refit on 80% of the rows and score on the remaining 20%, then refit
# on every row and score on those same rows for comparison.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=20
)
print('Holdout:')
holdout_score = bnb.fit(X_train, y_train).score(X_test, y_test)
print(f'With 20% Holdout: {holdout_score}')
in_sample_score = bnb.fit(data, target).score(data, target)
print(f'Testing on Sample: {in_sample_score}')
print(' ')

# 10-fold cross-validation scores.
cv_scores = cross_val_score(bnb, data, target, cv=10)
print(f'Cross Validation: {cv_scores}')

VERSION 2
Number of mislabeled points out of a total 1000 points : 245
 
Confusion matrix: 
[[462  38]
 [207 293]]
 
Holdout:
With 20% Holdout: 0.73
Testing on Sample: 0.755
 
Cross Validation: [0.84 0.73 0.81 0.74 0.75 0.69 0.74 0.74 0.77 0.71]


In [112]:
# Evaluate version 3: fit a Bernoulli naive Bayes model on the keyword columns
# and report in-sample errors, a 20% holdout score and 10-fold cross-validation.
test = alltext_raw3

feature_columns = pro_keywords3 + con_keywords3
data = test[feature_columns]
target = test['positive y/n']

# The features are boolean (keyword present / absent), which is why the
# Bernoulli variant of naive Bayes is used.
bnb = BernoulliNB()
bnb.fit(data, target)

# Predict on the same rows the model was just fitted on.
y_pred = bnb.predict(data)

# Report the in-sample error count.
print('VERSION 3')
mislabeled = (target != y_pred).sum()
print(f"Number of mislabeled points out of a total {data.shape[0]} points : {mislabeled}")
print(' ')

# Confusion matrix of true labels against the in-sample predictions.
print('Confusion matrix: ')
print(confusion_matrix(target, y_pred))
print(' ')

# Holdout: refit on 80% of the rows and score on the remaining 20%, then refit
# on every row and score on those same rows for comparison.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=20
)
print('Holdout:')
holdout_score = bnb.fit(X_train, y_train).score(X_test, y_test)
print(f'With 20% Holdout: {holdout_score}')
in_sample_score = bnb.fit(data, target).score(data, target)
print(f'Testing on Sample: {in_sample_score}')
print(' ')

# 10-fold cross-validation scores.
cv_scores = cross_val_score(bnb, data, target, cv=10)
print(f'Cross Validation: {cv_scores}')

VERSION 3
Number of mislabeled points out of a total 1000 points : 373
 
Confusion matrix: 
[[133 367]
 [  6 494]]
 
Holdout:
With 20% Holdout: 0.605
Testing on Sample: 0.627
 
Cross Validation: [0.58 0.56 0.63 0.65 0.72 0.63 0.59 0.6  0.64 0.6 ]


In [113]:
# Evaluate version 4: fit a Bernoulli naive Bayes model on the keyword columns
# and report in-sample errors, a 20% holdout score and 10-fold cross-validation.
test = alltext_raw4

feature_columns = pro_keywords4 + con_keywords4
data = test[feature_columns]
target = test['positive y/n']

# The features are boolean (keyword present / absent), which is why the
# Bernoulli variant of naive Bayes is used.
bnb = BernoulliNB()
bnb.fit(data, target)

# Predict on the same rows the model was just fitted on.
y_pred = bnb.predict(data)

# Report the in-sample error count.
print('VERSION 4')
mislabeled = (target != y_pred).sum()
print(f"Number of mislabeled points out of a total {data.shape[0]} points : {mislabeled}")
print(' ')

# Confusion matrix of true labels against the in-sample predictions.
print('Confusion matrix: ')
print(confusion_matrix(target, y_pred))
print(' ')

# Holdout: refit on 80% of the rows and score on the remaining 20%, then refit
# on every row and score on those same rows for comparison.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=20
)
print('Holdout:')
holdout_score = bnb.fit(X_train, y_train).score(X_test, y_test)
print(f'With 20% Holdout: {holdout_score}')
in_sample_score = bnb.fit(data, target).score(data, target)
print(f'Testing on Sample: {in_sample_score}')
print(' ')

# 10-fold cross-validation scores.
cv_scores = cross_val_score(bnb, data, target, cv=10)
print(f'Cross Validation: {cv_scores}')

VERSION 4
Number of mislabeled points out of a total 1000 points : 227
 
Confusion matrix: 
[[455  45]
 [182 318]]
 
Holdout:
With 20% Holdout: 0.75
Testing on Sample: 0.773
 
Cross Validation: [0.85 0.73 0.81 0.74 0.77 0.73 0.7  0.75 0.78 0.7 ]


In [114]:
# Evaluate version 5: fit a Bernoulli naive Bayes model on the keyword columns
# and report in-sample errors, a 20% holdout score and 10-fold cross-validation.
test = alltext_raw5

feature_columns = pro_keywords5 + con_keywords5
data = test[feature_columns]
target = test['positive y/n']

# The features are boolean (keyword present / absent), which is why the
# Bernoulli variant of naive Bayes is used.
bnb = BernoulliNB()
bnb.fit(data, target)

# Predict on the same rows the model was just fitted on.
y_pred = bnb.predict(data)

# Report the in-sample error count.
print('VERSION 5')
mislabeled = (target != y_pred).sum()
print(f"Number of mislabeled points out of a total {data.shape[0]} points : {mislabeled}")
print(' ')

# Confusion matrix of true labels against the in-sample predictions.
print('Confusion matrix: ')
print(confusion_matrix(target, y_pred))
print(' ')

# Holdout: refit on 80% of the rows and score on the remaining 20%, then refit
# on every row and score on those same rows for comparison.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=20
)
print('Holdout:')
holdout_score = bnb.fit(X_train, y_train).score(X_test, y_test)
print(f'With 20% Holdout: {holdout_score}')
in_sample_score = bnb.fit(data, target).score(data, target)
print(f'Testing on Sample: {in_sample_score}')
print(' ')

# 10-fold cross-validation scores.
cv_scores = cross_val_score(bnb, data, target, cv=10)
print(f'Cross Validation: {cv_scores}')

VERSION 5
Number of mislabeled points out of a total 1000 points : 309
 
Confusion matrix: 
[[195 305]
 [  4 496]]
 
Holdout:
With 20% Holdout: 0.695
Testing on Sample: 0.691
 
Cross Validation: [0.65 0.64 0.7  0.73 0.77 0.7  0.67 0.63 0.72 0.66]


_Do any of your classifiers seem to overfit?_

Versions 3 & 5 seem to indicate that the high-frequency pro-keyword features are causing overfitting and inaccuracies by pushing up the incorrect labeling of negative reviews as positive: in the confusion matrices for V3 & V5, only a single-digit number of positive reviews (6 and 4) are mislabeled as negative, while several hundred negative reviews (367 and 305) are mislabeled as positive.  

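No other variance in the results is large enough to indicate overfitting from other factors: for every version, the accuracy measured on the full sample is within about 0.03 of the 20% holdout score, and the cross-validation folds fall in a similar range.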

_Which seem to perform the best? Why?_

V1 is, unsurprisingly, the best performer (201 mislabeled points out of 1000), primarily because I over-engineered it in my first attempt; the other versions were then derived from it by removing features to see how the results would break when I changed them.

_Which features seemed to be most impactful to performance?_

High-frequency features, those that appeared in the most reviews, had the most significant positive impact on the model, albeit with a shift in where the errors fall: without them (V3) most errors are negative reviews mislabeled as positive, whereas with them (V1, V2) most errors are positive reviews mislabeled as negative.