## Step1: Cleaning

In [1]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
import pandas as pd
import numpy as np
import sys

#clean input file
def getCleanDocument(inputFile,outputFile):
    out=open(outputFile,'w',encoding="utf8")
    with open(inputFile,encoding="utf8") as f:
        reviews=f.readlines()
        
    for review in reviews:
        cleaned_review=getCleanText(review)
        print(cleaned_review,file=out)
    
    out.close()
    
#clean a review
def getCleanText(text):
    text=text.lower()
    text=text.replace("<br /><br />"," ")
    
    #Init objects
    tokenizer=RegexpTokenizer(r'[a-z]+')
    l=WordNetLemmatizer()
    en_stopwords=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 
                  "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself',
                  'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her',
                  'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 
                  'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom',
                  'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was',
                  'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do',
                  'does', 'did', 'doing', 'a', 'an', 'the', 'until', 'while', 'of', 'at',
                  'by', 'for', 'with', 'about', 'into', 'through', 'during', 'before',
                  'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
                  'on', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 
                  'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
                  'most', 'other', 'some', 'such', 'own', 'same', 'so', 'than', 'too', 
                  'very', 's', 't', 'can', 'will', 'just', 'should', "should've", 'now',
                  'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'ma', 'shan', "shan't"]
    
    #Tokenize
    tokens=tokenizer.tokenize(text)
    new_tokens=[token for token in tokens if token not in en_stopwords]
    stemmed_tokens=[l.lemmatize(token) for token in new_tokens]
    cleaned_text=' '.join(stemmed_tokens)
    return cleaned_text

In [2]:
#get train dataset
getCleanDocument("../../Datasets/IMDB/imdb_trainX.txt","Xtrain.txt")

In [3]:
#get test dataset
getCleanDocument("../../Datasets/IMDB/imdb_testX.txt","Xtest.txt")

In [4]:
with open("Xtrain.txt",'r') as f:
    X_train=f.readlines()
print(type(X_train)) 
print(len(X_train))

<class 'list'>
25000


In [5]:
with open("Xtest.txt",'r') as f:
    X_test=f.readlines()
print(type(X_test)) 
print(len(X_test))

<class 'list'>
25000


In [6]:
with open("../../Datasets/IMDB/imdb_trainY.txt",'r') as f:
    Y_train=f.readlines()
print(type(Y_train)) 
Y_train=[int(i) for i in Y_train]
print(len(Y_train))

<class 'list'>
25000


In [7]:
with open("../../Datasets/IMDB/imdb_testY.txt",'r') as f:
    Y_test=f.readlines()
Y_test=[int(i) for i in Y_test]    
print(type(Y_test))
print(len(Y_test))

<class 'list'>
25000


In [8]:
print(X_train[0])
print(Y_train[0])
print(X_test[0])
print(Y_test[0])

loved movie since and saw opening day touching and beautiful strongly recommend seeing movie watch family far mpaa rating pg thematic element prolonged scene disastor nudity sexuality and language

10
not really sure make movie weird artsy not kind movie watch because compelling plot or character like kind movie stop watching because horrifically fascinating thing happening screen although first time wife watched couldn make way disturbing run bit long but nonetheless worthwhile viewing interested dark movie

7


In [9]:
X_train=np.array(X_train)
Y_train=np.array(Y_train)
print(X_train.shape,Y_train.shape)

(25000,) (25000,)


In [10]:
X_test=np.array(X_test)
Y_test=np.array(Y_test)
print(X_test.shape,Y_test.shape)

(25000,) (25000,)


## Step2: Vectorization

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv=TfidfVectorizer()
x_vec=cv.fit_transform(X_train)
print(x_vec.shape)

(25000, 65525)


In [12]:
print(X_train[1515])

superbly developed character lot funny situation full spirit absurdness and serbian mentality movie great comedy enjoyable interesting unpredictable best point film character humor story and dialog humor inner development rare serbian movie consequence characterization well motivated spontaneous and cogent also sharp intelligent and lucid movie unfortunately constructed humor devise joke and put character mouth or ordinary situation comedy burlesque farce art immortality incorporated movie little masterpiece hardly reachable



In [14]:
print(cv.vocabulary_["humor"])
print(type(cv.vocabulary_))
print(cv.get_feature_names()[20283])

27450
<class 'dict'>
faultline


## Step3: Train Model using sklearn MultinomialNB

In [15]:
xt_vec=cv.transform(X_test)
print(xt_vec.shape)

(25000, 65525)


In [16]:
from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB(alpha=0.001)
mnb.fit(x_vec,Y_train)
print(mnb.score(x_vec,Y_train))
print(mnb.score(xt_vec,Y_test))

0.91688
0.32452


## Step4: Multinomial Naive Bayes from scratch

In [18]:
Y_train_list=pd.Series(Y_train).value_counts(sort=False)
ALL=Y_train_list.sum()
print(type(Y_train_list))
print(ALL)

<class 'pandas.core.series.Series'>
25000


In [19]:
prior_probs={}
for label,count in Y_train_list.items():
    prior_probs[label]=count/ALL
print(prior_probs)    

{1: 0.204, 2: 0.09136, 3: 0.0968, 4: 0.10784, 7: 0.09984, 8: 0.12036, 9: 0.09052, 10: 0.18928}


In [20]:
#have document ID ,word Id and count of tfidx with non zero value
docIdx,wordIdx=x_vec.nonzero()
count=x_vec.data

In [21]:
print(type(docIdx))
print(docIdx.shape)
print(docIdx[:100])

<class 'numpy.ndarray'>
(2533270,)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [22]:
print(type(wordIdx))
print(wordIdx.shape)
print(wordIdx[:100])

<class 'numpy.ndarray'>
(2533270,)
[32568 51588 40234 15544 50487 45267 17852 57928 43194 46671 38413 20104
 20017 63196 51113 47100 55570  4794 58939 13759 40972 50334  1945 52676
 38368 34247 23078  1495 51128 58056 47103 26431 24603 25854 16772 50746
 38713 33721 51010 17517 56488 14359 11930 63463 19762  9109 21515 35547
 38737 20311 43421 31843 55986 57617 16675 51122 63274 37005 44058 58140
 18311 33258 39519 54875 24481  5750 59166 55440 28923 10163 14605 56701
 57614 27828  4753  3111   684 55418 14824 26252  2263 46101 38068 37830
 29242 35374 16360 23205 57008  2363 63975  7490 64507 40875 64311 35010
  3432 56929 46097 23820]


In [23]:
print(type(count))
print(count.shape)
print(count[:100])

<class 'numpy.ndarray'>
(2533270,)
[0.18986371 0.24212233 0.19032227 0.40507677 0.0906802  0.29179566
 0.16442368 0.30261658 0.24453083 0.16682429 0.29623759 0.12737802
 0.13217568 0.09727404 0.13900058 0.14564698 0.22329518 0.14080475
 0.19784931 0.11836062 0.16806098 0.12436754 0.12043495 0.12826952
 0.11117379 0.15522731 0.03762899 0.02968507 0.02331223 0.02266725
 0.04527018 0.03803986 0.07651778 0.04216138 0.06684505 0.03862413
 0.04146245 0.04913631 0.05080626 0.04628782 0.0540485  0.0519067
 0.06261511 0.02010703 0.03906417 0.06100811 0.04985506 0.05293692
 0.0292413  0.04896511 0.04369516 0.04633013 0.05626634 0.05973257
 0.04922313 0.03723575 0.02148672 0.04945875 0.02798303 0.02692777
 0.04722512 0.03106947 0.02386085 0.03046108 0.06166263 0.05732949
 0.07319381 0.06883925 0.05356885 0.05175211 0.06274144 0.05740022
 0.06224622 0.06773723 0.05684911 0.04338252 0.06313126 0.07016902
 0.06382026 0.0382744  0.04701217 0.06664807 0.04792406 0.0937302
 0.02979597 0.02377734 0.0296

In [25]:
classIdx=[]
for i in docIdx:
    classIdx.append(Y_train[i])
print(len(classIdx))
print(classIdx[:100])

2533270
[10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]


In [26]:
df=pd.DataFrame()
df['docIdx']=docIdx
df['wordIdx']=wordIdx
df['count']=count
df['classIdx']=np.array(classIdx)
print(df.shape)

(2533270, 4)


In [27]:
#Alpha value for smoothing
a = 0.001
vocab_size=len(cv.vocabulary_)

#Calculate probability of each word based on class
pb_ij = df.groupby(['classIdx','wordIdx'])
pb_j = df.groupby(['classIdx'])
Pr =  (pb_ij['count'].sum() + a) / (pb_j['count'].sum() + vocab_size)    

#Unstack series
Pr = Pr.unstack()

#Replace NaN or columns with 0 as word count with a/(count+|V|+1)

for c in np.unique(Y_train):
    Pr.loc[c,:] = Pr.loc[c,:].fillna(a/(pb_j['count'].sum()[c] + vocab_size))

#Convert to dictionary for greater speed
Pr_dict = Pr.to_dict()

Pr

wordIdx,0,1,2,3,4,5,6,7,8,9,...,65515,65516,65517,65518,65519,65520,65521,65522,65523,65524
classIdx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.126451e-06,2.408257e-06,9.415102e-09,3.599132e-07,9.415102e-09,9.415102e-09,9.415102e-09,9.451691e-07,3.599132e-07,4.257889e-06,...,1.702705e-06,9.415102e-09,9.415102e-09,5.930581e-07,9.415102e-09,9.415102e-09,9.415102e-09,1.481174e-06,1.481174e-06,1.481174e-06
2,1.185038e-08,1.185038e-08,1.185038e-08,1.185038e-08,1.185038e-08,1.185038e-08,1.185038e-08,1.185038e-08,1.185038e-08,1.185038e-08,...,1.185038e-08,1.185038e-08,1.185038e-08,1.185038e-08,1.185038e-08,1.185038e-08,1.185038e-08,1.185038e-08,1.185038e-08,1.185038e-08
3,2.33395e-06,1.162248e-08,1.162248e-08,1.162248e-08,1.162248e-08,2.982074e-06,1.162248e-08,1.162248e-08,1.162248e-08,1.162248e-08,...,1.162248e-08,2.169683e-06,1.162248e-08,1.162248e-08,1.584023e-06,4.607615e-06,1.162248e-08,1.162248e-08,1.162248e-08,1.162248e-08
4,1.127529e-08,3.549597e-06,1.772279e-06,1.127529e-08,1.127529e-08,1.127529e-08,1.127529e-08,1.127529e-08,1.127529e-08,1.127529e-08,...,1.127529e-08,1.331588e-06,2.26532e-06,1.127529e-08,1.127529e-08,1.127529e-08,2.405236e-06,1.127529e-08,1.127529e-08,1.127529e-08
7,1.148263e-08,1.148263e-08,1.148263e-08,1.148263e-08,1.148263e-08,1.148263e-08,1.148263e-08,1.148263e-08,1.148263e-08,1.148263e-08,...,1.148263e-08,1.148263e-08,1.148263e-08,1.148263e-08,1.148263e-08,1.148263e-08,1.148263e-08,1.148263e-08,1.148263e-08,1.148263e-08
8,1.953427e-06,2.088005e-06,1.097868e-08,1.097868e-08,1.097868e-08,1.097868e-08,1.097868e-08,1.097868e-08,1.097868e-08,1.097868e-08,...,1.097868e-08,1.097868e-08,1.097868e-08,1.097868e-08,1.097868e-08,1.097868e-08,1.097868e-08,1.097868e-08,1.097868e-08,1.097868e-08
9,1.18571e-08,1.18571e-08,1.18571e-08,1.18571e-08,1.075838e-06,1.18571e-08,1.18571e-08,1.18571e-08,1.18571e-08,2.612285e-06,...,1.18571e-08,1.18571e-08,1.18571e-08,1.18571e-08,1.18571e-08,1.18571e-08,1.18571e-08,1.18571e-08,1.18571e-08,1.18571e-08
10,9.810896e-09,2.161605e-06,9.810896e-09,9.810896e-09,9.810896e-09,9.810896e-09,9.613449e-07,9.810896e-09,9.810896e-09,9.810896e-09,...,9.810896e-09,9.810896e-09,9.810896e-09,9.810896e-09,9.810896e-09,9.810896e-09,9.810896e-09,9.810896e-09,9.810896e-09,9.810896e-09


In [33]:
#Calculate IDF 
tot = len(df['docIdx'].unique()) 
pb_ij = df.groupby(['wordIdx']) 
IDF = np.log(tot/pb_ij['docIdx'].count()) 
IDF_dict = IDF.to_dict()
#print(IDF)

In [56]:
def MNB(df, smooth = False, IDF = False):
    '''
    Multinomial Naive Bayes classifier
    :param df [Pandas Dataframe]: Dataframe of data
    :param smooth [bool]: Apply Smoothing if True
    :param IDF [bool]: Apply Inverse Document Frequency if True
    :return predict [list]: Predicted class ID
    '''
    #Using dictionaries for greater speed
    df_dict = df.to_dict()
    new_dict = {}
    prediction = []
    print(len(df_dict['docIdx']))
    #new_dict = {docIdx : {wordIdx: count},....}
    for idx in range(len(df_dict['docIdx'])):
        docIdx = df_dict['docIdx'][idx]
        wordIdx = df_dict['wordIdx'][idx]
        count = df_dict['count'][idx]
        try: 
            new_dict[docIdx][wordIdx] = count 
        except:
            new_dict[df_dict['docIdx'][idx]] = {}
            new_dict[docIdx][wordIdx] = count

    #Calculating the scores for each doc
    for docIdx in range(len(new_dict)):
        score_dict = {}
        #Creating a probability row for each class
        for classIdx in np.unique(Y_train):
            score_dict[classIdx] = 1
            #For each word:
            for wordIdx in new_dict[docIdx]:
                #Check for frequency smoothing
                #log(1+f)*log(Pr(i|j))
                if smooth: 
                    try:
                        probability=Pr_dict[wordIdx][classIdx]         
                        power = np.log(1+ new_dict[docIdx][wordIdx])     
                        #Check for IDF
                        if IDF:
                            score_dict[classIdx]+=(
                               power*np.log(
                               probability*IDF_dict[wordIdx]))
                        else:
                            score_dict[classIdx]+=power*np.log(
                                                   probability)
                    except:
                        #Missing V will have log(1+0)*log(a/16689)=0 
                        score_dict[classIdx] += 0                        
                #f*log(Pr(i|j))
                else: 
                    try:
                        probability = Pr_dict[wordIdx][classIdx]        
                        power = new_dict[docIdx][wordIdx]               
                        score_dict[classIdx]+=power*np.log(
                                           probability) 
                        #Check for IDF
                        if IDF:
                            score_dict[classIdx]+= power*np.log(
                                   probability*IDF_dict[wordIdx]) 
                    except:
                        #Missing V will have 0*log(a/16689) = 0
                        score_dict[classIdx] += 0      
            #Multiply final with pi         
            score_dict[classIdx] +=  np.log(prior_probs[classIdx])                          

        #Get class with max probabilty for the given docIdx 
        max_score = max(score_dict, key=score_dict.get)
        prediction.append(max_score)
        
    return prediction

In [58]:
y_pred=MNB(df)
print(np.sum(Y_train==y_pred)/25000)

2533270
0.6404


In [59]:
val=0
for i in range(len(y_pred)):
    if y_pred[i]==Y_train[i]:
        val+=1
print(val/25000)        

0.6404


In [60]:
#Generate Confusion Matrix of train_data
from sklearn.metrics import confusion_matrix
cnf_matrix=confusion_matrix(Y_train,y_pred)
print(cnf_matrix)

[[5090    0    0    0    0    0    0   10]
 [1599  516    1    7    2   10    0  149]
 [1370    0  843   10    1    7    0  189]
 [1049    0    0 1446    1    5    0  195]
 [ 478    0    0    2 1148   41    0  827]
 [ 381    0    0    2    2 1824    0  800]
 [ 339    0    0    1    6   71  594 1252]
 [ 179    0    0    1    1    2    0 4549]]


### for test set

In [61]:
#have document ID ,word Id and count of tfidx with non zero value
docIdx,wordIdx=xt_vec.nonzero()
count=xt_vec.data

In [62]:
classIdx=[]
for i in docIdx:
    classIdx.append(Y_test[i])
print(len(classIdx))

2443273


In [63]:
df=pd.DataFrame()
df['docIdx']=docIdx
df['wordIdx']=wordIdx
df['count']=count
df['classIdx']=np.array(classIdx)
print(df.shape)

(2443273, 4)


In [64]:
yt_pred=MNB(df)
print(np.sum(Y_test==yt_pred)/25000)

2443273
0.32336


In [66]:
cnf_matrix=confusion_matrix(Y_test,yt_pred)
print(cnf_matrix)

[[4540    3    8   36    6   21    0  408]
 [1965    4   10   32    8   17    0  266]
 [2001    4   24   42   10   50    1  409]
 [1838    3    8   80   16   75    0  615]
 [ 991    1    6   37   42  111    2 1117]
 [ 988    2   11   40   23  164    4 1618]
 [ 780    2    6   17   12  114    5 1408]
 [1565    5   10   27   17  142    8 3225]]


In [None]:
yt_pred1=MNB(df,True,False)
yt_pred2=MNB(df,False,True)
yt_pred3=MNB(df,True,True)
print(np.sum(Y_test==yt_pred1)/25000)
print(np.sum(Y_test==yt_pred2)/25000)
print(np.sum(Y_test==yt_pred3)/25000)

2443273
