In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

In [2]:
yelp_reviews = pd.read_csv('yelp_labelled.txt', delimiter="\t", header=None)
yelp_reviews.columns = ['review', 'pos']

keywords = ['good', 'best', 'wonderful', 'classic', 'gem', 'favorite', 'plus', 'yes', 'great', 'see', 'definitely', 'excellent', 'well', 'heaven', 'interesting', 'entertaining', 'lovely', 'recommend', 'again', 'loved', 'best', 'cool', 'perfect', '10', 'definitely']

for key in keywords:
    # Note that we add spacesconfusion_matrix(target, y_pred) around the key so that we're getting the word,
    # not just pattern matching.
    yelp_reviews[str(key)] = yelp_reviews.review.str.contains(
        ' ' + str(key) + ' ',
        case=True
    )

yelp_reviews['long'] = yelp_reviews.review.str.len() > 20
yelp_reviews.pos = yelp_reviews.pos.astype(bool)

data = yelp_reviews[keywords + ['long']]
target = yelp_reviews['pos']

# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 396


In [3]:
A = confusion_matrix(target, y_pred)
print(A)
print("Sensitivity of A: ", A[1,1] / (A[1,0]+A[1,1]))
print("Specificity of A: ", A[0,0] / (A[0,1]+A[0,0]))

[[478  22]
 [374 126]]
Sensitivity of A:  0.252
Specificity of A:  0.956


In [4]:
keywords = ['Good', 'Best', 'Wonderful', 'Classic', 'Gem', 'Favorite', 'Plus', 'Yes', 'Great', 'See', 'Definitely', 'Excellent', 'Well', 'Heaven', 'Interesting', 'Entertaining', 'Lovely', 'Recommend', 'Again', 'Loved', 'Best', 'Cool', 'Perfect', '10', 'Definitely']

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    yelp_reviews[str(key)] = yelp_reviews.review.str.contains(
        ' ' + str(key) + ' ',
        case=True
    )

yelp_reviews['long'] = yelp_reviews.review.str.len() > 20
yelp_reviews.pos = yelp_reviews.pos.astype(bool)

data = yelp_reviews[keywords + ['long']]
target = yelp_reviews['pos']

# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 488


In [5]:
A = confusion_matrix(target, y_pred)
print(A)
print("Sensitivity of A: ", A[1,1] / (A[1,0]+A[1,1]))
print("Specificity of A: ", A[0,0] / (A[0,1]+A[0,0]))

[[ 55 445]
 [ 43 457]]
Sensitivity of A:  0.914
Specificity of A:  0.11


In [6]:
keywords = ['Good', 'Best', 'Wonderful', 'Classic', 'Gem', 'Favorite', 'Plus', 'Yes', 'Great', 'See', 'Definitely', 'Excellent', 'Well', 'Heaven', 'Interesting', 'Entertaining', 'Lovely', 'Recommend', 'Again', 'Loved', 'Best', 'Cool', 'Perfect', '10', 'Definitely', 'good', 'best', 'wonderful', 'classic', 'gem', 'favorite', 'plus', 'yes', 'great', 'see', 'definitely', 'excellent', 'well', 'heaven', 'interesting', 'entertaining', 'lovely', 'recommend', 'again', 'loved', 'best', 'cool', 'perfect', '10', 'definitely']

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    yelp_reviews[str(key)] = yelp_reviews.review.str.contains(
        ' ' + str(key) + ' ',
        case=True
    )

yelp_reviews['long'] = yelp_reviews.review.str.len() > 20
yelp_reviews.pos = yelp_reviews.pos.astype(bool)

data = yelp_reviews[keywords + ['long']]
target = yelp_reviews['pos']

# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 391


In [7]:
A = confusion_matrix(target, y_pred)
print(A)
print("Sensitivity of A: ", A[1,1] / (A[1,0]+A[1,1]))
print("Specificity of A: ", A[0,0] / (A[0,1]+A[0,0]))

[[478  22]
 [369 131]]
Sensitivity of A:  0.262
Specificity of A:  0.956


In [8]:
amzn_reviews = pd.read_csv('amazon_cells_labelled.txt', delimiter="\t", header=None)
amzn_reviews.columns = ['review', 'pos']

imdb_reviews = pd.read_csv('imdb_labelled.txt', delimiter="\t", header=None)
imdb_reviews.columns = ['review', 'pos']

all_reviews = pd.concat([yelp_reviews,imdb_reviews,amzn_reviews])

In [9]:
keywords = ['Good', 'Best', 'Wonderful', 'Classic', 'Gem', 'Favorite', 'Plus', 'Yes', 'Great', 'See', 'Definitely', 'Excellent', 'Well', 'Heaven', 'Interesting', 'Entertaining', 'Lovely', 'Recommend', 'Again', 'Loved', 'Best', 'Cool', 'Perfect', '10', 'Definitely', 'good', 'best', 'wonderful', 'classic', 'gem', 'favorite', 'plus', 'yes', 'great', 'see', 'definitely', 'excellent', 'well', 'heaven', 'interesting', 'entertaining', 'lovely', 'recommend', 'again', 'loved', 'best', 'cool', 'perfect', '10', 'definitely']

for key in keywords:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    all_reviews[str(key)] = all_reviews.review.str.contains(
        ' ' + str(key) + ' ',
        case=True
    )

all_reviews['long'] = all_reviews.review.str.len() > 20
all_reviews.pos = all_reviews.pos.astype(bool)

data = all_reviews[keywords + ['long']]
target = all_reviews['pos']

# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 2748 points : 1090


In [10]:
A = confusion_matrix(target, y_pred)
print(A)
print("Sensitivity of A: ", A[1,1] / (A[1,0]+A[1,1]))
print("Specificity of A: ", A[0,0] / (A[0,1]+A[0,0]))

[[1287   75]
 [1015  371]]
Sensitivity of A:  0.267676767677
Specificity of A:  0.944933920705


In [11]:
X_train, X_test, y_train, y_test = train_test_split(
     data, target, test_size=0.2)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(2198, 51) (2198,)
(550, 51) (550,)


In [12]:
# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data
bnb.fit(X_train, y_train)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(X_train)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    X_train.shape[0],
    (y_train != y_pred).sum()
))

Number of mislabeled points out of a total 2198 points : 866


I made my classifiers similar, only varying the capitalization of the words. I was curious to see how capitalization alone would affect performance. I tested each classifier on the same dataset for the first three experiments to make results more comparable. I then combined the three review datasets into on to test the best classifier of the initial three on a larger set. 

For the first three experiements, I tested my keywords as uncapitalized, capitalized, and both. Of the three, the classifier that tested for both capitalizations performed the best, as expected. It was only slightly better than the classifier that only used uncapitalized words, which is also expected, since most words are not capitalized. The classifier than only used capitalized words performed the worst by a large margin. When I tested the classifier with both capitalizations on the combined dataset, the results were very similar overall, with the sensitivity being slightly higher and specificity slightly lower compared to the smaller set. Finally, I tested the model on a train-test-split, and it performed similarly well there. 