## Review Sentiment Prediction using Markov Chain Model

In [86]:
import csv
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS 
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
import matplotlib.pyplot as plt 
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv("C:/Users/irmri/OneDrive/Desktop/IMDB Dataset.csv") # read csv
df = df.head(5000)

# Preprocessing the text 
df['sentiment'] = df.sentiment.map({'negative': 0, 'positive': 1})  
df['review'] = df.review.map(lambda x: x.lower()) 
df['review'] = df['review'].str.replace('[^\w\s]','')

stop_words = set(stopwords.words('english'))   # removing stop words
def remove_stop_words(row):
    meaningful_words = [word for word in row if not word in stop_words]
    return (meaningful_words)

stemming = PorterStemmer()            # Stemming 
def stem_words(row):
    stemmed_words = " ".join([stemming.stem(word) for word in row])
    return (stemmed_words)

df['review'] = df['review'].apply(nltk.word_tokenize)
df['review'] = df['review'].apply(remove_stop_words)
df['review'] = df['review'].apply(stem_words)
train = df.sample(frac=0.7, random_state=200)
test = df.drop(train.index)

In [65]:
#create document term matrix for positive and negative reviews
vect1 = CountVectorizer(ngram_range=(2,2))
vect2 = CountVectorizer(ngram_range=(2,2))
pos_reviews = train.loc[train['sentiment'] == 1,'review']
neg_reviews = train.loc[train['sentiment'] == 0,'review']

dtm_train_pos = vect1.fit_transform(pos_reviews)
dtm_train_neg = vect2.fit_transform(neg_reviews)

In [66]:
# Counting the frequency of the positive and negative words 
freqcols_pos = np.array(dtm_train_pos.sum(axis=0)).flatten()
freqrows_pos = np.array(dtm_train_pos.sum(axis=1)).flatten()
freqcols_neg = np.array(dtm_train_neg.sum(axis=0)).flatten()
freqrows_neg = np.array(dtm_train_neg.sum(axis=1)).flatten()

total_bigram_pos = np.sum(freqrows_pos)
total_bigram_neg = np.sum(freqrows_neg)

In [67]:
# Calculate the probability of a bigram given a sentiment 
prob_sentiment_pos = pos_reviews.size / len(train)
prob_sentiment_neg = neg_reviews.size / len(train)

prob_bigram_pos = freqcols_pos/ total_bigram_pos
feature_names_pos = np.asarray(vect1.get_feature_names())
prob_bigram_neg = freqcols_neg/total_bigram_neg
feature_names_neg = np.asarray(vect2.get_feature_names())


array(['00 agent', '00 far', '0001 percent', ..., 'étcother commerci',
       'ís brilliant', 'ünfaith dian'], dtype='<U72')

In [68]:
pos_df= pd.DataFrame()
neg_df = pd.DataFrame()
pos_df['bigram']= feature_names_pos
pos_df['probability'] = prob_bigram_pos
neg_df['bigram'] = feature_names_neg
neg_df['probability'] = prob_bigram_neg

In [69]:
word_vect1 = CountVectorizer()
word_vect2 = CountVectorizer()

dtm_word_pos = word_vect1.fit_transform(pos_reviews)
dtm_word_neg = word_vect2.fit_transform(neg_reviews)

In [70]:
freqcols_pos_word = np.array(dtm_word_pos.sum(axis=0)).flatten()
freqrows_pos_word = np.array(dtm_word_pos.sum(axis=1)).flatten()
freqcols_neg_word = np.array(dtm_word_neg.sum(axis=0)).flatten()
freqrows_neg_word = np.array(dtm_word_neg.sum(axis=1)).flatten()

total_words_pos = np.sum(freqrows_pos_word)
total_words_neg = np.sum(freqrows_neg_word)

In [71]:
prob_word_pos = freqcols_pos_word/ total_words_pos
feature_names_word_pos = np.asarray(word_vect1.get_feature_names())
prob_word_neg = freqcols_neg_word/total_words_neg
feature_names_word_neg = np.asarray(word_vect2.get_feature_names())

In [72]:
pos_word_df= pd.DataFrame()
neg_word_df = pd.DataFrame()
pos_word_df['word']= feature_names_word_pos
pos_word_df['probability'] = prob_word_pos
neg_word_df['word'] = feature_names_word_neg
neg_word_df['probability'] = prob_word_neg

In [None]:
def probSentiment_sentence(sentence):
    alpha = 0.0000001
    stop_words = set(stopwords.words('english'))
    first_word = sentence.split(" ")[0]
    vect = CountVectorizer(ngram_range=(2,2))
    bigram_vect = vect.fit_transform([sentence])
    bigrams = vect.get_feature_names()
    
    first_word_pos_prob = pos_word_df.loc[pos_word_df['word']== first_word, "probability"].tolist()
    first_word_neg_prob = neg_word_df.loc[neg_word_df['word']== first_word, "probability"].tolist()
    
    if len(first_word_pos_prob) == 0:
        pos_prob = np.log(prob_sentiment_pos) + alpha
    else:
        pos_prob = np.log(prob_sentiment_pos) + np.log(first_word_pos_prob[0])
    
    if len(first_word_neg_prob) == 0:
        neg_prob = np.log(prob_sentiment_neg) + alpha
    else:
        neg_prob = np.log(prob_sentiment_neg) + np.log(first_word_neg_prob[0])

    for i in range(len(bigrams)):
        pos_bigram_prob = pos_df.loc[pos_df['bigram']== bigrams[i],"probability"].tolist()
        neg_bigram_prob = neg_df.loc[neg_df['bigram']== bigrams[i],"probability"].tolist()
        if(len(pos_bigram_prob) == 0):
            pos_prob += alpha
        else:
            pos_prob += np.log(pos_bigram_prob[0])
        if(len(neg_bigram_prob) == 0):
            pos_prob += alpha
        else:
            neg_prob += np.log(neg_bigram_prob[0])
    
    if(pos_prob > neg_prob):
        return 0
    else:
        return 1
    

result = []
for index, row in test.iterrows():
    result.append(probSentiment_sentence(row['review']))
    #print(row)

def classification_rate(predicted, actual):
    count = 0
    for index in range(len(predicted)):
        if predicted[index] == actual[index]:
            count +=1
    return count/len(predicted)

print(classification_rate(result, test['sentiment'].tolist())) 

In [89]:
cv=CountVectorizer(ngram_range=(1,2))
train_cv=cv.fit_transform(train['review'])
test_cv =cv.transform(test['review'])
print('Shape of train_cv::',train_cv.shape)
print('Shape of test_cv::',test_cv.shape)

Shape of train_cv:: (3500, 340817)
Shape of test_cv:: (1500, 340817)


### Random Forest Classifier

In [90]:
#Training the classifier
rfc=RandomForestClassifier(n_estimators=20,random_state=42)
rfc=rfc.fit(train_cv,train['sentiment'])
score=rfc.score(train_cv,train['sentiment'])
print('Accuracy of trained model is ::',score)

Accuracy of trained model is :: 0.9982857142857143


In [91]:
rfc=RandomForestClassifier(n_estimators=20,random_state=42)
rfc=rfc.fit(test_cv,test['sentiment'])
score=rfc.score(test_cv,test['sentiment'])
print('Accuracy of test model is ::',score)

Accuracy of test model is :: 0.9993333333333333
