## Step1: Cleaning

In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords 
import pandas as pd
import numpy as np
import sys

#clean input file
def getCleanDocument(inputFile,outputFile):
    out=open(outputFile,'w',encoding="utf8")
    with open(inputFile,encoding="utf8") as f:
        reviews=f.readlines()
        
    for review in reviews:
        cleaned_review=getCleanText(review)
        print(cleaned_review,file=out)
    
    out.close()
    
#clean a review
def getCleanText(text):
    text=text.lower()
    text=text.replace("<br/><br/>"," ")
    
    #Init objects
    tokenizer=RegexpTokenizer(r'\w+')
    en_stopwords=set(stopwords.words('english'))
    ps=PorterStemmer()
    
    #Tokenize
    tokens=tokenizer.tokenize(text)
    new_tokens=[token for token in tokens if token not in en_stopwords]
    stemmed_tokens=[ps.stem(token) for token in new_tokens]
    cleaned_text=' '.join(stemmed_tokens)
    return cleaned_text

In [2]:
#get train dataset
getCleanDocument("../../Datasets/IMDB/imdb_trainX.txt","Xtrain.txt")

In [3]:
#get test dataset
getCleanDocument("../../Datasets/IMDB/imdb_testX.txt","Xtest.txt")

In [4]:
with open("Xtrain.txt",'r') as f:
    X_train=f.readlines()
print(type(X_train)) 
print(len(X_train))

<class 'list'>
25000


In [5]:
with open("Xtest.txt",'r') as f:
    X_test=f.readlines()
print(type(X_test)) 
print(len(X_test))

<class 'list'>
25000


In [6]:
with open("../../Datasets/IMDB/imdb_trainY.txt",'r') as f:
    Y_train=f.readlines()
print(type(Y_train)) 
Y_train=[int(i) for i in Y_train]
print(len(Y_train))

<class 'list'>
25000


In [7]:
with open("../../Datasets/IMDB/imdb_testY.txt",'r') as f:
    Y_test=f.readlines()
Y_test=[int(i) for i in Y_test]    
print(type(Y_test))
print(len(Y_test))

<class 'list'>
25000


In [8]:
print(X_train[0])
print(Y_train[0])
print(X_test[0])
print(Y_test[0])

love movi sinc 7 saw open day touch beauti strongli recommend see movi watch famili far br br mpaa rate pg 13 themat element prolong scene disastor nuditi sexual languag

10
realli sure make movi weird artsi kind movi watch compel plot charact like kind movi stop watch horrif fascin thing happen screen although first time wife watch make way disturb run bit long nonetheless worthwhil view interest dark movi

7


In [9]:
X_train=np.array(X_train)
Y_train=np.array(Y_train)
print(X_train.shape,Y_train.shape)

(25000,) (25000,)


In [10]:
X_test=np.array(X_test)
Y_test=np.array(Y_test)
print(X_test.shape,Y_test.shape)

(25000,) (25000,)


## Step2: Vectorization

In [92]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(ngram_range=(1,2))
x_vec=cv.fit_transform(X_train)
print(x_vec.shape)

(25000, 1577483)


In [93]:
xt_vec=cv.transform(X_test)
print(xt_vec.shape)

(25000, 1577483)


## Step3: Train Model

In [94]:
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
mnb=MultinomialNB(alpha=0.01)
mnb.fit(x_vec,Y_train)
#predictions
yt_pred=mnb.predict(xt_vec)

In [95]:
print(yt_pred[:10])

[ 1  8  3  7  7  1 10  9 10 10]


In [96]:
#test set accuracy
mnb.score(xt_vec,Y_test)

0.36888

In [97]:
#train set accuracy
mnb.score(x_vec,Y_train)

0.99976

In [98]:
#Generate Confusion Matrix
from sklearn.metrics import confusion_matrix
cnf_matrix=confusion_matrix(Y_test,yt_pred)
print(cnf_matrix)

[[3856  312  283  260   38   66   34  173]
 [1345  203  266  262   60   47   17  102]
 [1112  221  387  424  108  117   25  147]
 [ 808  165  397  606  201  197   54  207]
 [ 287   55  156  290  396  449  136  538]
 [ 295   60  118  227  330  647  234  939]
 [ 229   39   67  123  201  450  232 1003]
 [ 487   90  115  168  273  582  389 2895]]


In [99]:
#Generate Confusion Matrix
from sklearn.metrics import confusion_matrix
cnf_matrix=confusion_matrix(Y_train,mnb.predict(x_vec))
print(cnf_matrix)

[[5100    0    0    0    0    0    0    0]
 [   1 2283    0    0    0    0    0    0]
 [   1    0 2419    0    0    0    0    0]
 [   0    0    1 2695    0    0    0    0]
 [   0    0    0    0 2496    0    0    0]
 [   0    0    0    0    0 3009    0    0]
 [   0    0    0    0    0    0 2263    0]
 [   2    0    0    0    0    0    1 4729]]


In [100]:
bnb=BernoulliNB(alpha=0)
bnb.fit(x_vec,Y_train)
#predictions
yt_pred=bnb.predict(xt_vec)
print(yt_pred[:10])

  'setting alpha = %.1e' % _ALPHA_MIN)


[10  8 10 10  7  1 10 10 10 10]


In [101]:
bnb.score(xt_vec,Y_test)

0.35216

In [102]:
bnb.score(x_vec,Y_train)

0.99984