## Step1: Cleaning

In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords 
import pandas as pd
import numpy as np
import sys

#clean input file
def getCleanDocument(inputFile,outputFile):
    out=open(outputFile,'w',encoding="utf8")
    with open(inputFile,encoding="utf8") as f:
        reviews=f.readlines()
        
    for review in reviews:
        cleaned_review=getCleanText(review)
        print(cleaned_review,file=out)
    
    out.close()
    
#clean a review
def getCleanText(text):
    text=text.lower()
    text=text.replace("<br/><br/>"," ")
    
    #Init objects
    tokenizer=RegexpTokenizer(r'\w+')
    en_stopwords=set(stopwords.words('english'))
    ps=PorterStemmer()
    
    #Tokenize
    tokens=tokenizer.tokenize(text)
    new_tokens=[token for token in tokens if token not in en_stopwords]
    stemmed_tokens=[ps.stem(token) for token in new_tokens]
    cleaned_text=' '.join(stemmed_tokens)
    return cleaned_text

In [2]:
#get train dataset
getCleanDocument("../../Datasets/IMDB/imdb_trainX.txt","Xtrain.txt")

In [3]:
#get test dataset
getCleanDocument("../../Datasets/IMDB/imdb_testX.txt","Xtest.txt")

In [10]:
with open("Xtrain.txt",'r') as f:
    X_train=f.readlines()
print(type(X_train)) 
print(len(X_train))

<class 'list'>
25000


In [11]:
with open("Xtest.txt",'r') as f:
    X_test=f.readlines()
print(type(X_test)) 
print(len(X_test))

<class 'list'>
25000


In [22]:
with open("../../Datasets/IMDB/imdb_trainY.txt",'r') as f:
    Y_train=f.readlines()
print(type(Y_train)) 
Y_train=[int(i) for i in Y_train]
print(len(Y_train))

<class 'list'>
25000


In [23]:
with open("../../Datasets/IMDB/imdb_testY.txt",'r') as f:
    Y_test=f.readlines()
Y_test=[int(i) for i in Y_test]    
print(type(Y_test))
print(len(Y_test))

<class 'list'>
25000


In [24]:
print(X_train[0])
print(Y_train[0])
print(X_test[0])
print(Y_test[0])

love movi sinc 7 saw open day touch beauti strongli recommend see movi watch famili far br br mpaa rate pg 13 themat element prolong scene disastor nuditi sexual languag

10
realli sure make movi weird artsi kind movi watch compel plot charact like kind movi stop watch horrif fascin thing happen screen although first time wife watch make way disturb run bit long nonetheless worthwhil view interest dark movi

7


In [25]:
X_train=np.array(X_train)
Y_train=np.array(Y_train)
print(X_train.shape,Y_train.shape)

(25000,) (25000,)


In [26]:
X_test=np.array(X_test)
Y_test=np.array(Y_test)
print(X_test.shape,Y_test.shape)

(25000,) (25000,)


## Step2: Vectorization

In [27]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
x_vec=cv.fit_transform(X_train)
print(x_vec.shape)

(25000, 51229)


In [28]:
xt_vec=cv.transform(X_test)
print(xt_vec.shape)

(25000, 51229)


## Step3: Train Model

In [29]:
from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB()
mnb.fit(x_vec,Y_train)
#predictions
yt_pred=mnb.predict(xt_vec)

In [30]:
print(yt_pred[:10])

[ 1  7  4 10  8  1 10 10  8 10]


In [34]:
#test set accuracy
mnb.score(xt_vec,Y_test)

0.38348

In [35]:
#train set accuracy
mnb.score(x_vec,Y_train)

0.6744