In [51]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import BernoulliNB

# metrics provides confusion_matrix, used below to evaluate the classifier
from sklearn import metrics

In [12]:
# Load the labelled training reviews; the path is resolved relative to the
# current working directory.
movie_train_df = pd.read_csv("movie_review_train.csv")
# Report (rows, columns), then preview the first rows as the cell output.
print(movie_train_df.shape)
movie_train_df.head()

(1600, 2)


Unnamed: 0,class,text
0,Pos,a common complaint amongst film critics is ...
1,Pos,whew this film oozes energy the kind of b...
2,Pos,steven spielberg s amistad which is bas...
3,Pos,he has spent his entire life in an awful litt...
4,Pos,being that it is a foreign language film with...


### Convert `Pos` to 1 and `Neg` to 0

In [13]:
# Encode the sentiment labels as integers: "Pos" -> 1, "Neg" -> 0.
# Series.map turns every value that is missing from the dict into NaN, so
# running this cell a second time on the already-encoded column would wipe out
# all labels. Mapping only while raw string labels are still present keeps the
# cell safe to re-run.
if movie_train_df["class"].isin(["Pos", "Neg"]).any():
    movie_train_df["class"] = movie_train_df["class"].map({"Pos": 1, "Neg": 0})
# Show the class balance, then preview the frame as the cell output.
print(movie_train_df["class"].value_counts())
movie_train_df.head()

1    800
0    800
Name: class, dtype: int64


Unnamed: 0,class,text
0,1,a common complaint amongst film critics is ...
1,1,whew this film oozes energy the kind of b...
2,1,steven spielberg s amistad which is bas...
3,1,he has spent his entire life in an awful litt...
4,1,being that it is a foreign language film with...


In [14]:
# Separate the training frame into the raw review text (model input) and the
# encoded sentiment label (target).
X_axis_train, y_axis_train = movie_train_df["text"], movie_train_df["class"]

In [16]:
# Bag-of-words vectorizer that drops scikit-learn's built-in English stop words.
vect = CountVectorizer(stop_words="english")

In [18]:
# Learn the vocabulary from the training reviews and report how many distinct
# terms it holds; fit() returns the vectorizer itself, so the call is chained.
print(len(vect.fit(X_axis_train).vocabulary_))

35858


In [19]:
# Prune the vocabulary by document frequency:
#   min_df=0.03 ignores rare terms that occur in fewer than 3% of the documents,
#   max_df=0.8  ignores very common terms that occur in more than 80% of them.
# fit() returns the vectorizer itself, so construction and fitting are chained.
vect = CountVectorizer(stop_words="english", min_df=0.03, max_df=0.8).fit(X_axis_train)
print(len(vect.vocabulary_))

1643


In [37]:
# Encode the training reviews as a document-term count matrix using the
# vocabulary currently held by `vect`.
X_transform_train = vect.transform(X_axis_train)

In [21]:
# Load the held-out test reviews (path relative to the working directory) and
# show the frame's (rows, columns) as the cell output.
movie_test_df = pd.read_csv("movie_review_test.csv")
movie_test_df.shape

(400, 2)

In [40]:
# Encode the test labels as integers: "Pos" -> 1, "Neg" -> 0.
# Series.map replaces every value missing from the dict with NaN, so re-running
# this cell on the already-encoded column would erase the labels. Mapping only
# while raw string labels are still present keeps the cell safe to re-run.
if movie_test_df["class"].isin(["Pos", "Neg"]).any():
    movie_test_df["class"] = movie_test_df["class"].map({"Pos": 1, "Neg": 0})

In [41]:
# Separate the test frame into the raw review text (model input) and the
# encoded sentiment label (target).
X_axis_test, y_axis_test = movie_test_df["text"], movie_test_df["class"]

In [42]:
# Encode the test reviews with the vocabulary already fitted on the training
# data (transform only, no refit), then show how many non-zero entries the
# resulting matrix stores.
X_test_transform = vect.transform(X_axis_test)
X_test_transform.nnz

51663

In [43]:
# Make sure both label vectors carry an integer dtype before model fitting and
# evaluation.
y_axis_train, y_axis_test = y_axis_train.astype("int"), y_axis_test.astype("int")

In [55]:
# Bernoulli naive Bayes treats every vocabulary term as a present/absent
# feature; with the default binarize=0.0 any non-zero count counts as present.
bnb = BernoulliNB()

# BernoulliNB accepts scipy sparse matrices directly, so the vectorizer output
# is passed as-is instead of being expanded into dense arrays with .toarray().
bnb.fit(X_transform_train, y_axis_train)

# Hard class predictions (0 = Neg, 1 = Pos) for every test review.
y_pred_bnb = bnb.predict(X_test_transform)

# Per-class probabilities for every test review; columns follow the sorted
# class labels, i.e. column 0 is class 0 (Neg) and column 1 is class 1 (Pos).
y_pred_proba_bnb = bnb.predict_proba(X_test_transform)

In [57]:
# Class probabilities for the first test review (row 0). The values are
# multiplied by 100, so the printed numbers are percentages even though the
# message itself carries no "%" sign.
print("probability of test review belonging to class Neg is -> {:0.2f}".format(y_pred_proba_bnb[0, 0] * 100))
print("probability of test review belonging to class Pos is -> {:0.2f}".format(y_pred_proba_bnb[0, 1] * 100))

probability of test review belonging to class Neg is -> 0.95
probability of test review belonging to class Pos is -> 99.05


In [56]:
# Confusion matrix of true vs. predicted test labels: rows are the true class,
# columns the predicted class, both ordered 0 (Neg) then 1 (Pos).
confusion_bnb = metrics.confusion_matrix(y_axis_test, y_pred_bnb)
confusion_bnb

array([[177,  23],
       [ 61, 139]])

In [58]:
# Unpack the 2x2 confusion matrix row by row: [[TN, FP], [FN, TP]].
(TN_bnb, FP_bnb), (FN_bnb, TP_bnb) = confusion_bnb

In [59]:
# Derive the headline metrics from the four confusion-matrix cells.
# One operand of every ratio is cast to float so the division is always a
# true (non-integer) division.

# Accuracy: share of all test reviews that were classified correctly.
accuracy_bnb = (TP_bnb + TN_bnb) / float(TP_bnb + TN_bnb + FP_bnb + FN_bnb)
# Sensitivity (recall): share of actual positives that were predicted positive.
sensitivity_bnb = float(TP_bnb) / (FN_bnb + TP_bnb)
# Specificity: share of actual negatives that were predicted negative.
specificity_bnb = float(TN_bnb) / (TN_bnb + FP_bnb)
# Precision: share of predicted positives that are actually positive.
precision_bnb = float(TP_bnb) / (TP_bnb + FP_bnb)

# Report every metric as a percentage, in the same order as computed above.
print("Accuracy\t{:0.2f}%".format(100 * accuracy_bnb))

print("Sensitivity\t{:0.2f}%".format(100 * sensitivity_bnb))

print("Specificity\t{:0.2f}%".format(100 * specificity_bnb))

print("Precision\t{:0.2f}%".format(100 * precision_bnb))

Accuracy	79.00%
Sensitivity	69.50%
Specificity	88.50%
Precision	85.80%
