# Using the Multinomial Naive Bayes algorithm for sentiment analysis

Classify a movie review as 'positive' or 'negative' using Multinomial Naive Bayes.

Dataset Source: http://nifty.stanford.edu/2016/manley-urness-movie-review-sentiment/


In [None]:
# Mount Google Drive at /content/drive; the dataset paths used in this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import math

# Read the train and test files (one example per line), stripping the trailing
# newline from each line.

def _read_lines(path):
    """Return the lines of the text file at `path` with trailing newlines removed.

    The file is opened in a `with` block so its handle is closed as soon as
    the lines have been read, even if reading raises.
    """
    with open(path, "r") as f:
        return [line.rstrip('\n') for line in f]


trainrevs = _read_lines("/content/drive/MyDrive/MLAssign2/trainfilex.txt")
trainlabels = _read_lines("/content/drive/MyDrive/MLAssign2/trainfiley.txt")
testrevs = _read_lines("/content/drive/MyDrive/MLAssign2/testfilex.txt")
testlabels = _read_lines("/content/drive/MyDrive/MLAssign2/testfiley.txt")

# Quick sanity check: show the first few reviews next to their labels.
print(trainrevs[0:5])
print(trainlabels[0:5])

print("The number of training examples: ", len(trainrevs))
print("The number of test examples: ", len(testrevs))



[' serious and thoughtful  ', ' with a completely predictable plot  you  ll swear that you  ve seen it all before  even if you  ve never come within a mile of the longest yard  ', ' if there was any doubt that peter o fallon did n t have an original bone in his body  a rumor of angels should dispel it  ', ' i like my christmas movies with more elves and snow and less pimps and ho  s  ', ' a terrifically entertaining specimen of spielbergian sci-fi  ']
['1', '0', '0', '0', '1']
The number of training examples:  1349
The number of test examples:  151


###  Computing the vocabulary from the reviews in the training set

In [None]:
def build_vocab(x):
    """Return the distinct whitespace-separated tokens found in the reviews.

    Args:
        x: iterable of review strings.

    Returns:
        A list containing each token exactly once, ordered by first
        occurrence across the reviews.
    """
    # A dict keeps insertion order, so fromkeys de-duplicates in a single
    # linear pass instead of rescanning the vocabulary list for every token.
    return list(dict.fromkeys(word for review in x for word in review.split()))


### Computing the smoothed estimate of P(w|C)

In [None]:
def smooth_estimate(n,N,m,vsize):
    """Return the smoothed estimate of P(w|C).

    n     -- number of occurrences of word w in class C
    N     -- total length of the documents in C
    m     -- smoothing parameter
    vsize -- size of the vocabulary

    The estimate is (n + m) / (N + m * vsize).
    """
    numerator = n + m
    denominator = N + vsize * m
    return numerator / denominator

In [None]:
  # Per-class word statistics over the training set:
  #   map1 / map0                   -- word -> occurrence count in reviews
  #                                    labelled '1' / labelled anything else
  #   total_words_1 / total_words_0 -- total number of word tokens per class
  map1 = {}
  map0 = {}
  total_words_1 = 0
  total_words_0 = 0
  for review, label in zip(trainrevs, trainlabels):
      words = review.split()
      # The label is fixed for a whole review, so choose the target counter
      # once per review rather than once per word.
      if label == '1':
          counts = map1
          total_words_1 += len(words)
      else:
          counts = map0
          total_words_0 += len(words)
      for word in words:
          counts[word] = counts.get(word, 0) + 1

  # Total number of word tokens across both classes.
  N = total_words_1 + total_words_0

In [None]:
# Classify every test review with the hand-rolled multinomial naive Bayes
# model and report the accuracy against testlabels.
res = []
vocab = build_vocab(trainrevs)
V = len(vocab)
print(V)

# Spot-check the smoothed per-class probabilities of one common word.
print(smooth_estimate(map1["movie"],total_words_1,0.2,V))

print(smooth_estimate(map0["movie"],total_words_0,0.2,V))

# Smoothing parameter passed to smooth_estimate for every word.
m = 0.2

# The class priors depend only on the training labels, so compute their logs
# once instead of recounting the labels for every test review.
num_train = len(trainlabels)
log_prior1 = math.log(trainlabels.count('1') / num_train)
log_prior0 = math.log(trainlabels.count('0') / num_train)

for test in testrevs:
    # Work with sums of logs rather than products of many small probabilities.
    prob1 = log_prior1
    prob0 = log_prior0

    for word in test.split():
        # A word never seen in a class has count 0 there and relies on smoothing.
        p1 = smooth_estimate(map1.get(word, 0), total_words_1, m, V)
        p0 = smooth_estimate(map0.get(word, 0), total_words_0, m, V)
        prob1 += math.log(p1)
        prob0 += math.log(p0)

    # Ties are classified as '0'.
    res.append('1' if prob1 > prob0 else '0')

# Fraction of test reviews whose predicted label matches the true label;
# reported as 0.0 when there are no predictions, to avoid dividing by zero.
num_correct = sum(pred == actual for pred, actual in zip(res, testlabels))
prediction_acc = num_correct / len(res) if res else 0.0

print(prediction_acc)


5456
0.006259309971261341
0.009408992332781202
0.8543046357615894


###  Using sklearn on this dataset and comparing the result

In [None]:
# importing the libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Bag-of-words vectorizer with default settings. The commented-out variant
# below is the alternative that passes stop_words='english'.
#vectorizer = CountVectorizer(stop_words='english')
vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary from the training reviews and, in the same call, turn
# those reviews into a dense matrix of feature vectors.
X_train = vectorizer.fit_transform(trainrevs).toarray()
# Encode the test reviews with the vocabulary learned from the training set.
X_test = vectorizer.transform(testrevs).toarray()

# Create the multinomial naive Bayes classifier and fit it to the training data.
mnb = MultinomialNB().fit(X_train, trainlabels)

# Accuracy of the classifier on the test set (kept as the last expression so
# the notebook displays the value).
mnb.score(X_test, testlabels)

0.847682119205298