In [4]:
import pandas as pd
from sklearn.model_selection  import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix as sk_confusion_matrix
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.options.display.max_columns=200
pd.options.display.max_columns=300

In [2]:
df_train = pd.read_csv('./movie_review_train.csv')
df_test = pd.read_csv('./movie_review_test.csv')

In [12]:
df_train.shape, df_test.shape
df_train['class'].value_counts()
df_test['class'].value_counts()

((1600, 3), (400, 3))

Pos    800
Neg    800
Name: class, dtype: int64

Pos    200
Neg    200
Name: class, dtype: int64

In [10]:
df_train['label'] = df_train['class'].map({'Pos': 1, 'Neg': 0})
df_test['label'] = df_test['class'].map({'Pos': 1, 'Neg': 0})

In [11]:
df_train.head(1)
df_test.head(1)

Unnamed: 0,class,text,label
0,Pos,a common complaint amongst film critics is ...,1


Unnamed: 0,class,text,label
0,Pos,films adapted from comic books have had plent...,1


In [13]:
vect = CountVectorizer(stop_words='english')
vect.fit(df_train['text'])

In [14]:
len(vect.vocabulary_)
vect.vocabulary_

35858

{'common': 6284,
 'complaint': 6360,
 'film': 11832,
 'critics': 7378,
 'aren': 1810,
 'literate': 18603,
 'scripts': 27918,
 'available': 2319,
 'quiz': 25250,
 'gives': 13262,
 'signs': 28809,
 'hope': 15074,
 'art': 1928,
 'writing': 35521,
 'isn': 16771,
 'dead': 7904,
 'hollywood': 14963,
 'need': 21281,
 'look': 18770,
 'independent': 15988,
 'films': 11851,
 'thoughtful': 32060,
 'content': 6771,
 'paul': 23045,
 'attanasio': 2173,
 'script': 27912,
 'takes': 31486,
 'tepid': 31839,
 'thriller': 32090,
 'scandals': 27629,
 'late': 18017,
 '50s': 313,
 'delivers': 8215,
 'telling': 31765,
 'parable': 22815,
 'emptiness': 10340,
 'post': 24178,
 'war': 34707,
 'american': 1334,
 'dream': 9554,
 'golden': 13449,
 'bubble': 4285,
 'surrounds': 31156,
 'protects': 24849,
 'tv': 33056,
 'networks': 21378,
 'sponsors': 29948,
 'riddled': 26743,
 'symbols': 31370,
 '58': 328,
 'chrysler': 5614,
 'radio': 25314,
 'announcement': 1534,
 'sputnik': 30053,
 'heavy': 14532,
 'handed': 14185,

In [15]:
vect = CountVectorizer(stop_words='english', min_df=.03, max_df=.8)
vect.fit(df_train['text'])

CountVectorizer(max_df=0.8, min_df=0.03, stop_words='english')

In [16]:
len(vect.vocabulary_)
vect.vocabulary_

1643

{'common': 264,
 'critics': 323,
 'aren': 78,
 'available': 101,
 'gives': 618,
 'hope': 693,
 'art': 81,
 'writing': 1632,
 'isn': 753,
 'dead': 342,
 'hollywood': 690,
 'need': 970,
 'look': 853,
 'films': 549,
 'content': 287,
 'paul': 1037,
 'script': 1248,
 'takes': 1429,
 'thriller': 1471,
 'late': 805,
 'delivers': 357,
 'telling': 1449,
 'post': 1089,
 'war': 1571,
 'american': 59,
 'dream': 413,
 'tv': 1512,
 'radio': 1145,
 'heavy': 673,
 'direction': 386,
 'robert': 1205,
 'performances': 1043,
 'john': 766,
 'rob': 1204,
 'perfectly': 1041,
 'usually': 1537,
 'quality': 1136,
 'sets': 1272,
 'camera': 192,
 'work': 1618,
 'recent': 1165,
 'century': 213,
 'period': 1044,
 'pieces': 1056,
 'years': 1638,
 'old': 1001,
 'images': 716,
 'true': 1502,
 'era': 464,
 'generation': 605,
 'gone': 625,
 '15': 4,
 'world': 1623,
 'themes': 1461,
 'good': 626,
 'life': 831,
 'family': 508,
 'match': 894,
 'father': 521,
 'fame': 506,
 'audience': 99,
 'appear': 72,
 'familiar': 507,
 

In [17]:
vect.transform(df_test['text'])

<400x1643 sparse matrix of type '<class 'numpy.int64'>'
	with 51663 stored elements in Compressed Sparse Row format>

In [19]:
X_train_transformed = vect.transform(df_train['text'])
X_test_tranformed = vect.transform(df_test['text'])

In [20]:
# instantiate bernoulli NB object
bnb = BernoulliNB()

# fit 
bnb.fit(X_train_transformed, df_train['label'])

# predict class
y_pred_class = bnb.predict(X_test_tranformed)

# predict probability
y_pred_proba = bnb.predict_proba(X_test_tranformed)

# accuracy
from sklearn import metrics
metrics.accuracy_score(df_test['label'], y_pred_class)

BernoulliNB()

0.79

In [21]:
metrics.confusion_matrix(df_test['label'], y_pred_class)

array([[177,  23],
       [ 61, 139]])