In [27]:
import pandas as pd
docs = pd.read_csv('movie_review_train.csv')
docs.head()

Unnamed: 0,class,text
0,Pos,a common complaint amongst film critics is ...
1,Pos,whew this film oozes energy the kind of b...
2,Pos,steven spielberg s amistad which is bas...
3,Pos,he has spent his entire life in an awful litt...
4,Pos,being that it is a foreign language film with...


In [28]:
docs['class'].value_counts()

Pos    800
Neg    800
Name: class, dtype: int64

In [29]:
pos_neg=docs['class'].value_counts()
pos_neg

Pos    800
Neg    800
Name: class, dtype: int64

In [30]:
# mapping labels to 1 and 0
docs['label'] = docs['class'].map({'Pos':1, 'Neg':0})

In [31]:
docs.head()

Unnamed: 0,class,text,label
0,Pos,a common complaint amongst film critics is ...,1
1,Pos,whew this film oozes energy the kind of b...,1
2,Pos,steven spielberg s amistad which is bas...,1
3,Pos,he has spent his entire life in an awful litt...,1
4,Pos,being that it is a foreign language film with...,1


In [32]:
X = docs['text']
y = docs['label']
print(X.shape)
print(y.shape)

(1600,)
(1600,)


In [33]:
# splitting into test and train

from sklearn.model_selection  import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [34]:
X_train.head()

1145     this movie about two dysfunctional families n...
73       felix   sami bouajila     the siege     lives...
446      vampire lore and legend has always been a pop...
399      kevin smith is like a big kid    his humor is...
647      bruce lee was a bigger than life martial arti...
Name: text, dtype: object

In [50]:
from sklearn.feature_extraction.text import CountVectorizer

# vectorising the text
vect = CountVectorizer(stop_words='english',min_df=.03, max_df=.8)

In [51]:
vect.fit(X_train)

CountVectorizer(max_df=0.8, min_df=0.03, stop_words='english')

In [52]:
vect.vocabulary_

{'movie': 940,
 'really': 1149,
 'gets': 608,
 'ground': 633,
 'despite': 360,
 'good': 622,
 'performances': 1031,
 'basically': 116,
 'cast': 199,
 'eddie': 428,
 'sean': 1236,
 'robin': 1195,
 'married': 880,
 'couple': 295,
 'luck': 862,
 'living': 840,
 'city': 235,
 'spend': 1329,
 'little': 837,
 'local': 842,
 'best': 130,
 'friend': 586,
 'wife': 1575,
 'share': 1268,
 'odd': 982,
 'relationship': 1160,
 'return': 1179,
 'world': 1604,
 'love': 856,
 'soon': 1313,
 'lead': 807,
 'new': 965,
 'like': 827,
 'high': 673,
 'reach': 1139,
 'wants': 1548,
 'baby': 108,
 'father': 518,
 'deep': 349,
 'sense': 1250,
 'leaves': 814,
 'badly': 111,
 'fear': 522,
 'lies': 823,
 'happens': 650,
 'come': 250,
 'result': 1177,
 'inevitable': 728,
 'ends': 447,
 'believes': 128,
 'months': 933,
 'reality': 1145,
 '10': 1,
 'years': 1619,
 'decade': 343,
 'solid': 1308,
 'individual': 726,
 'small': 1300,
 'role': 1198,
 'life': 824,
 'falls': 503,
 'apart': 67,
 'released': 1164,
 'comes': 2

In [53]:
vect.get_feature_names()

['000',
 '10',
 '100',
 '13',
 '15',
 '17',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '20',
 '30',
 '90',
 'ability',
 'able',
 'absolutely',
 'academy',
 'accent',
 'accept',
 'accident',
 'act',
 'acted',
 'acting',
 'action',
 'actions',
 'actor',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'adaptation',
 'add',
 'added',
 'addition',
 'adds',
 'admit',
 'adult',
 'adults',
 'adventure',
 'affair',
 'age',
 'agent',
 'ago',
 'agree',
 'ahead',
 'air',
 'albeit',
 'alien',
 'aliens',
 'alive',
 'allen',
 'allow',
 'allowed',
 'allows',
 'amazing',
 'america',
 'american',
 'amusing',
 'angry',
 'animated',
 'animation',
 'annoying',
 'answer',
 'anthony',
 'anti',
 'apart',
 'apartment',
 'apparent',
 'apparently',
 'appeal',
 'appealing',
 'appear',
 'appearance',
 'appears',
 'appreciate',
 'approach',
 'appropriate',
 'aren',
 'army',
 'arrives',
 'art',
 'artist',
 'aside',
 'ask',
 'asked',
 'asking',
 'asks',
 'aspect',
 'aspects',
 'ass',
 'assistant',
 'atmos

In [54]:
len(vect.get_feature_names())

1624

In [55]:
# transform
X_train_transformed = vect.transform(X_train)
X_test_tranformed =vect.transform(X_test)

In [56]:
X_test_tranformed

<400x1624 sparse matrix of type '<class 'numpy.int64'>'
	with 54319 stored elements in Compressed Sparse Row format>

In [57]:
from sklearn.naive_bayes import BernoulliNB

# instantiate bernoulli NB object
bnb = BernoulliNB()

# fit 
bnb.fit(X_train_transformed,y_train)

# predict class
y_pred_class = bnb.predict(X_test_tranformed)

# predict probability
y_pred_proba =bnb.predict_proba(X_test_tranformed)

# accuracy
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)


0.7925

In [58]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[167,  37],
       [ 46, 150]], dtype=int64)