In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

from sklearn.naive_bayes import BernoulliNB

In this assignment, I will build a simple sentiment analysis classifier using a Naive Bayes model.

Dataset: From UCI(https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences). Reviews are taken from yelp with equal amounts of negative and positive reviews (500/500 and no neutral comments).

In [2]:
yelp = pd.read_table('../data/UCI_sentimentlabeled/yelp_labelled.txt', header=None, names=['Comment', 'Sentiment'])

In [3]:
yelp.head()

Unnamed: 0,Comment,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
print(yelp.describe())
print('\n')
print("Positive and negative reviews are evenly split:")
print(yelp.Sentiment.value_counts())

        Sentiment
count  1000.00000
mean      0.50000
std       0.50025
min       0.00000
25%       0.00000
50%       0.50000
75%       1.00000
max       1.00000


Positive and negative reviews are evenly split:
1    500
0    500
Name: Sentiment, dtype: int64


In [5]:
yelp[yelp['Sentiment']==0]

Unnamed: 0,Comment,Sentiment
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
11,Would not go back.,0
12,The cashier had no care what so ever on what I...,0
14,I was disgusted because I was pretty sure that...,0
15,I was shocked because no signs indicate cash o...,0
17,Waitress was a little slow in service.,0


In [6]:
negative = ['nasty', 'rubber', 'disgusted', 'slow', 'shocked', 'sucks', 
            'disgust', 'suck', 'vomited', 'overpriced', 'undercooked', 
            'avoid', 'disappointed', 'underwhelming', 'wasted']

In [7]:
for key in negative:
    yelp[key] = yelp.Comment.str.contains(' '+ key +' ',
                                              case = False)

In [8]:
yelp

Unnamed: 0,Comment,Sentiment,nasty,rubber,disgusted,slow,shocked,sucks,disgust,suck,vomited,overpriced,undercooked,avoid,disappointed,underwhelming,wasted
0,Wow... Loved this place.,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,Crust is not good.,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,Not tasty and the texture was just nasty.,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,Stopped by during the late May bank holiday of...,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,The selection on the menu was great and so wer...,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5,Now I am getting angry and I want my damn pho.,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6,Honeslty it didn't taste THAT fresh.),0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7,The potatoes were like rubber and you could te...,0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
8,The fries were great too.,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9,A great touch.,1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [9]:
X = yelp[negative]
y = yelp['Sentiment']

In [10]:
from sklearn.naive_bayes import BernoulliNB
nb = BernoulliNB()

nb.fit(X, y)

y_predict = nb.predict(X)

print("Number of mislabeled comments out of {}: {}".format(X.shape[0], sum(y_predict != y)))

Number of mislabeled comments out of 1000: 474


In [11]:
print("Accuracy is ", 1 -(sum(y_predict != y))/X.shape[0])

Accuracy is  0.526


Only slightly better than guessing 50/50

In [12]:
# check how many were inaccurate as positive or negative reviews
analysis_table = pd.DataFrame(yelp['Sentiment'])
analysis_table['Prediction'] = y_predict
analysis_table.head()

Unnamed: 0,Sentiment,Prediction
0,1,1
1,0,1
2,0,1
3,1,1
4,1,1
