## Review Sentiment Prediction using Naive Bayes Classifier 

Implemented Naive Bayes classfier to predict the sentiment of movie review. Naive Bayes Classifier assumes conditional independence among features. Compared the results obt with the results of Multinomial Naive Bayes Classier provided by sklearn library and achieved similar results.

In [15]:
import csv
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
import matplotlib.pyplot as plt 
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


df = pd.read_csv("C:/Users/irmri/OneDrive/Desktop/IMDB Dataset.csv") # read csv

# Preprocessing the text 
df['sentiment'] = df.sentiment.map({'negative': 0, 'positive': 1})  
df['review'] = df.review.map(lambda x: x.lower()) 
df['review'] = df['review'].str.replace('[^\w\s]','')
df['review'] = df['review'].apply(nltk.word_tokenize)
train = df.sample(frac=0.7, random_state=200)
test = df.drop(train.index)
print(test)

stop_words = set(stopwords.words('english'))
def remove_stops(row):
	my_list = row
	meaningful_words =' '.join([w for w in my_list if not w in stop_words])
	return (meaningful_words)

train['review'] = train['review'].apply(remove_stops)


                                              review  sentiment
0  [one, of, the, other, reviewers, has, mentione...          1
4  [petter, matteis, love, in, the, time, of, mon...          1
9  [if, you, like, original, gut, wrenching, laug...          1


In [2]:
# Count the frequency of the word
vect = CountVectorizer()
dtm_train = vect.fit_transform(train['review']) # create a document term matrix
freqcols = np.array(dtm_train.sum(axis = 0)).flatten()
freqrows = np.array(dtm_train.sum(axis = 1)).flatten()
total_words = np.sum(freqrows)
prob_word = freqcols/total_words


In [3]:
# Sepearte the postive reviews from the negative reviews
vect1 = CountVectorizer()
vect2 = CountVectorizer()
pos_reviews = train.loc[train['sentiment'] == 1,'review']
neg_reviews = train.loc[train['sentiment'] == 0,'review']
dtm_train_pos = vect1.fit_transform(pos_reviews)
dtm_train_neg = vect2.fit_transform(neg_reviews)


In [4]:
freqcols_pos = np.array(dtm_train_pos.sum(axis=0)).flatten()
freqrows_pos = np.array(dtm_train_pos.sum(axis=1)).flatten()
freqcols_neg = np.array(dtm_train_neg.sum(axis=0)).flatten()
freqrows_neg = np.array(dtm_train_neg.sum(axis=1)).flatten()

total_words_pos = np.sum(freqrows_pos)
total_words_neg = np.sum(freqrows_neg)

In [5]:
#Calculate the probability of positive and negative sentiments
prob_sentiment_pos = pos_reviews.size / len(train)
prob_sentiment_neg = neg_reviews.size / len(train)

prob_word_pos = freqcols_pos/ total_words_pos
feature_names_pos = np.asarray(vect1.get_feature_names())
prob_word_neg = freqcols_neg/total_words_neg
feature_names_neg = np.asarray(vect2.get_feature_names())

In [6]:
pos_df= pd.DataFrame()
neg_df = pd.DataFrame()
pos_df['word']= feature_names_pos
pos_df['probability'] = prob_word_pos
neg_df['word'] = feature_names_neg
neg_df['probability'] = prob_word_neg


In [7]:
def probSentiment_sentence(sentence):
    alpha = 0.0000001
    stop_words = set(stopwords.words('english'))
    meaningful_words = [w for w in sentence if not w in stop_words]
    pos_prob = np.log(prob_sentiment_pos)
    neg_prob = np.log(prob_sentiment_neg)

    for i in range(len(meaningful_words)):
        pos_word_prob = pos_df.loc[pos_df['word']== meaningful_words[i],"probability"].tolist()
        neg_word_prob = neg_df.loc[neg_df['word']== meaningful_words[i],"probability"].tolist()
        if(len(pos_word_prob) == 0):
            pos_prob += alpha
        else:
            pos_prob += np.log(pos_word_prob[0])
        if(len(neg_word_prob) == 0):
            pos_prob += alpha
        else:
            neg_prob += np.log(neg_word_prob[0])
    
    if(pos_prob > neg_prob):
        return 0
    else:
        return 1
    

result = []
for index, row in test.iterrows():
    result.append(probSentiment_sentence(row['review']))

def classification_rate(predicted, actual):
    count = 0
    for index in range(len(predicted)):
        if predicted[index] == actual[index]:
            count +=1
    return count/len(predicted)

print(classification_rate(result, test['sentiment'].tolist()))

0.49866666666666665


In [13]:
# Results of Naive Bayes Classifier from sklearn library 
df = pd.read_csv("C:/Users/irmri/OneDrive/Desktop/IMDB Dataset.csv") # read csv
df = df.head(10)
df['sentiment'] = df.sentiment.map({'negative': 0, 'positive': 1})  
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'],test_size=0.2)
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

mnb = MultinomialNB()
mnb.fit(X_train_dtm, y_train)
y_pred_class = mnb.predict(X_test_dtm)
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)


0.5