In [365]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import cross_val_score

yelp_raw = pd.read_csv('data/yelp_labelled.txt', 
                       delimiter= '\t', 
                       header=None)

df = yelp_raw.copy()
df.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [366]:
# imdb_raw = pd.read_csv('data/imdb_labelled.txt', 
#                        delimiter= '\t', 
#                        header=None)

# df = imdb_raw.copy()
# df.head()

In [367]:
# amazon_raw = pd.read_csv('data/amazon_cells_labelled.txt', 
#                          delimiter= '\t', 
#                          header=None
#                         )
# df = amazon_raw.copy()
# df.head()

In [368]:
# set column names

df.columns = ['message', 'negative']

In [369]:
df['negative'] = (df['negative'] == 0)
# Note that if you run this cell a second time everything will become false.
# So... Don't.

In [370]:
# replace punctuation 
df['message'] = df.message.str.replace(r'[^a-zA-Z\d\s:]', '')
# make lower case
df['message'] = df['message'].str.lower()

In [371]:
# split negative messages and combine into one list
negative_words = df.message[df.negative].str.cat(sep=' ').split()

positive_words = df.message[df.negative == False].str.cat(sep=' ').split()

In [372]:
# Unique Words
print('negative:', len(np.unique(negative_words)), ' positive:', len(np.unique(positive_words)))

negative: 1397  positive: 1246


In [373]:
diff = np.setdiff1d(
        ar1=negative_words, 
                      ar2=positive_words
                     )


In [374]:
new = df.copy()

for key in diff:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    new[key] = df['message'].apply(lambda x: key in x)

In [375]:
new = new.iloc[:, 2:]

In [376]:
data = new.copy()
target = df['negative']

In [377]:
data = data.applymap(lambda x: 1 if x == True else x)
data = data.applymap(lambda x: 0 if x == False else x)

In [378]:
words_50 = data.describe().T.sort_values(by='mean', 
                                         ascending=False).head(500).index.tolist()

In [379]:
new = df.copy()

for key in words_50:
    # Note that we add spaces around the key so that we're getting the word,
    # not just pattern matching.
    new[key] = new.message.str.contains(
        ' ' + str(key) + ' ',
        case=False
    )

In [380]:
data = new.iloc[:, 2:]
target = df['negative']

In [381]:
# Our data is binary / boolean, so we're importing the Bernoulli classifier.
from sklearn.naive_bayes import BernoulliNB

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()

# Fit our model to the data.
bnb.fit(data, target)

# Classify, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}\nAccuracy: {}%".format(
    data.shape[0],
    (target != y_pred).sum(), round((1 - (target != y_pred).sum()/data.shape[0]) * 100, 2)
))

Number of mislabeled points out of a total 1000 points : 245
Accuracy: 75.5%


In [382]:
cross_val_score(bnb, data, target, cv=5)

array([0.66 , 0.65 , 0.59 , 0.625, 0.665])