In [1]:
import warnings
# NOTE(review): 'ignore' with no category/module filter silences every warning
# for the rest of the session, including deprecation notices from pandas and
# scikit-learn. Consider narrowing the filter once the noisy warning is known.
warnings.filterwarnings('ignore')

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics

In [3]:
# Read the pre-split train and test review files (paths are relative to the
# notebook's working directory).
imdb_train, imdb_test = (
    pd.read_csv(csv_path)
    for csv_path in ("movie_review_train.csv", "movie_review_test.csv")
)

In [4]:
# Preview the first rows of the training frame (columns: class, text).
imdb_train.head()

Unnamed: 0,class,text
0,Pos,a common complaint amongst film critics is ...
1,Pos,whew this film oozes energy the kind of b...
2,Pos,steven spielberg s amistad which is bas...
3,Pos,he has spent his entire life in an awful litt...
4,Pos,being that it is a foreign language film with...


In [5]:
# Preview the first rows of the test frame (columns: class, text).
imdb_test.head()

Unnamed: 0,class,text
0,Pos,films adapted from comic books have had plent...
1,Pos,every now and then a movie comes along from a...
2,Pos,you ve got mail works alot better than it des...
3,Pos,jaws is a rare film that grabs your atte...
4,Pos,moviemaking is a lot like being the general m...


In [6]:
# Class-balance check on the training split: count of rows per class value.
imdb_train["class"].value_counts()

class
Pos    800
Neg    800
Name: count, dtype: int64

In [7]:
# Class-balance check on the test split: count of rows per class value.
imdb_test["class"].value_counts()

class
Pos    200
Neg    200
Name: count, dtype: int64

In [8]:
# Encode the string sentiment as a binary target: Pos -> 1, Neg -> 0.
# Any class value outside this mapping becomes NaN (Series.map semantics).
imdb_train = imdb_train.assign(
    label=imdb_train["class"].map({"Pos": 1, "Neg": 0})
)
imdb_test = imdb_test.assign(
    label=imdb_test["class"].map({"Pos": 1, "Neg": 0})
)

In [9]:
# The string "class" column is redundant once "label" exists; remove it.
# Re-running this cell on its own raises KeyError because the column is gone.
imdb_train = imdb_train.drop(columns="class")
imdb_test = imdb_test.drop(columns="class")

In [10]:
# Display the training frame (pandas truncates the rendering) to confirm it
# now carries the text and label columns.
imdb_train

Unnamed: 0,text,label
0,a common complaint amongst film critics is ...,1
1,whew this film oozes energy the kind of b...,1
2,steven spielberg s amistad which is bas...,1
3,he has spent his entire life in an awful litt...,1
4,being that it is a foreign language film with...,1
...,...,...
1595,if anything stigmata should be taken as...,0
1596,john boorman s zardoz is a goofy cinemati...,0
1597,the kids in the hall are an acquired taste ...,0
1598,there was a time when john carpenter was a gr...,0


In [11]:
# Display the test frame (pandas truncates the rendering) to confirm it now
# carries the text and label columns.
imdb_test

Unnamed: 0,text,label
0,films adapted from comic books have had plent...,1
1,every now and then a movie comes along from a...,1
2,you ve got mail works alot better than it des...,1
3,jaws is a rare film that grabs your atte...,1
4,moviemaking is a lot like being the general m...,1
...,...,...
395,one of the first films of 1999 is this mtv pi...,0
396,susan granger s review of america s sweethe...,0
397,susan granger s review of jeepers creepers ...,0
398,this independent film written and directed by...,0


In [12]:
# Separate the raw review text (features) from the 0/1 label (target) for
# each split.
X_train, y_train = imdb_train["text"], imdb_train["label"]
X_test, y_test = imdb_test["text"], imdb_test["label"]

In [13]:
# Inspect the training text Series (one raw review string per row).
X_train

0        a common complaint amongst film critics is   ...
1        whew   this film oozes energy   the kind of b...
2        steven spielberg s   amistad     which is bas...
3        he has spent his entire life in an awful litt...
4        being that it is a foreign language film with...
                              ...                        
1595     if anything     stigmata   should be taken as...
1596     john boorman s   zardoz   is a goofy cinemati...
1597     the kids in the hall are an acquired taste   ...
1598     there was a time when john carpenter was a gr...
1599     two party guys bob their heads to haddaway s ...
Name: text, Length: 1600, dtype: object

In [20]:
# Inspect the test text Series (one raw review string per row).
X_test

0       films adapted from comic books have had plent...
1       every now and then a movie comes along from a...
2       you ve got mail works alot better than it des...
3          jaws   is a rare film that grabs your atte...
4       moviemaking is a lot like being the general m...
                             ...                        
395     one of the first films of 1999 is this mtv pi...
396     susan granger s review of   america s sweethe...
397     susan granger s review of   jeepers creepers ...
398     this independent film written and directed by...
399     come on hollywood   surprise me    stop givin...
Name: text, Length: 400, dtype: object

In [15]:
# Baseline vocabulary: every token in the training reviews except the
# built-in English stop words. Fitted on the training text only.
vect = CountVectorizer(stop_words="english")
vect.fit(X_train)

In [16]:
# Number of distinct terms learned by the baseline vectorizer.
len(vect.vocabulary_)

35858

In [17]:
# Pruned vocabulary: keep a term only if it appears in at least 3% and at
# most 80% of the training documents (float min_df/max_df are proportions).
vect_mod = CountVectorizer(
    stop_words="english",
    min_df=0.03,
    max_df=0.80,
)
vect_mod.fit(X_train)

In [18]:
# Number of distinct terms that survive the min_df / max_df pruning.
len(vect_mod.vocabulary_)

1643

In [22]:
# Turn both splits into sparse document-term count matrices using the
# vocabulary learned from the training text (the test text is never fitted).
X_train_transform, X_test_transform = (
    vect_mod.transform(documents) for documents in (X_train, X_test)
)

In [37]:
# Show the sparse-matrix summary (shape, dtype, stored elements) for the
# vectorized test split.
X_test_transform

<400x1643 sparse matrix of type '<class 'numpy.int64'>'
	with 51663 stored elements in Compressed Sparse Row format>

In [43]:
# Train a Bernoulli Naive Bayes classifier on the vectorized training reviews
# (fit returns the estimator itself, so construction and fitting are chained).
bnb = BernoulliNB().fit(X_train_transform, y_train)

# Hard class predictions and per-class probabilities for the test split.
y_pred_class = bnb.predict(X_test_transform)
y_pred_proba = bnb.predict_proba(X_test_transform)

# Fraction of test reviews whose predicted label matches the true label.
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class)

0.79

In [44]:
# Confusion matrix for the test split: rows are true labels, columns are
# predicted labels, both in sorted label order (0 = Neg, then 1 = Pos).
metrics.confusion_matrix(y_test, y_pred_class)

array([[177,  23],
       [ 61, 139]], dtype=int64)

In [46]:
# Pin the label order so the matrix layout is explicit: index 0 is the
# negative class (Neg -> 0) and index 1 is the positive class (Pos -> 1).
confusion = metrics.confusion_matrix(y_test, y_pred_class, labels=[0, 1])
print(confusion)
# Layout is confusion[true_label, predicted_label], i.e.
#     [[TN, FP],
#      [FN, TP]]
# The positive class sits in the LAST row/column, so true positives are at
# [1, 1] and true negatives at [0, 0] (the previous cell had these swapped).
TN = confusion[0, 0]  # true Neg, predicted Neg
FP = confusion[0, 1]  # true Neg, predicted Pos
FN = confusion[1, 0]  # true Pos, predicted Neg
TP = confusion[1, 1]  # true Pos, predicted Pos

[[177  23]
 [ 61 139]]
