## Movie Review Analysis

### Importing Libraries

In [19]:
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amita\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Uploading the IMDB dataset

In [23]:
#df = pd.read_csv(r'C:\Users\amita\Downloads\IMDB-Dataset.csv')
df = pd.read_csv('C:\\Users\\amita\\Downloads\\IMDB-Dataset.csv')
df = df.sample(10000)
df = df.reset_index(drop=True)
#/content/IMDB-Dataset.csv
#df = pd.read_csv('C:\Users\amita\Downloads\IMDB-Dataset.csv' ) #, delimiter=  '\t' , quoting = 3)

In [24]:
df.head()

Unnamed: 0,review,sentiment
0,"Also known in a different form as ""House of Ex...",negative
1,Dolemite may not have been the first black exp...,positive
2,"***SPOILERS*** ***SPOILERS*** Well, seeing as ...",positive
3,10/10 for this film.<br /><br />i'm a british ...,positive
4,In Cold Blood was one of several 60s films tha...,negative


In [25]:
df.columns

Index(['review', 'sentiment'], dtype='object')

## Label Encoding:
### To convert 'positive' = 1 and 'negative' = 0.

In [26]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()
df["sentiment"] = label.fit_transform(df['sentiment'])

In [31]:
df.head()

Unnamed: 0,review,sentiment
0,"Also known in a different form as ""House of Ex...",0
1,Dolemite may not have been the first black exp...,1
2,"***SPOILERS*** ***SPOILERS*** Well, seeing as ...",1
3,10/10 for this film.<br /><br />i'm a british ...,1
4,In Cold Blood was one of several 60s films tha...,0


In [33]:
df["review"][0]

'Also known in a different form as "House of Exorcism," this messy<br /><br />little film takes itself so seriously as to kill any entertainment value<br /><br />whatsoever.<br /><br />The spare plot involves European tourist Elke Sommer who has a<br /><br />chance run in with Telly Savalas, who looks just like the devil she<br /><br />saw on a fresco in the square. Sommer is given a ride to a<br /><br />mysterious house in the country, where Savalas happens to be<br /><br />butler. There, she is mistaken for a long dead woman, and the real<br /><br />soap opera theatrics begin. The house\'s blind matriarch\'s<br /><br />husband had an affair with the dead woman, who was the<br /><br />matriarch\'s son\'s fiancee. The couple who gave Sommer the ride?<br /><br />Well, the woman is giving the chauffeur, uh, "back seat driving<br /><br />lessons," and the husband knows and does not care. Eventually,<br /><br />most of the cast is killed, Sommer is drugged and raped,<br /><br />escapes, an

### Removing HTML tags

In [35]:
import re
clean = re.compile('<.*?>')
re.sub(clean, '' , df.iloc[2].review)

'***SPOILERS*** ***SPOILERS*** Well, seeing as I am a major H:LOTS fan, maybe I liked the movie more than normal people would. However, this movie is still excellent. It had tons of surprises, and it gave some more closure to the series. While I was sad that Bayliss turned into a murderer, the overall feeling I felt was satisfied.'

In [37]:
df['sentiment'][0]

0

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  int32 
dtypes: int32(1), object(1)
memory usage: 117.3+ KB


In [41]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [43]:
df.shape

(10000, 2)

## Cleaning the dataset

In [46]:
import string
import re
corpus = []
for i in range (10000):
    #Removing punctuation marks and numbers
    Review = re.sub(pattern ='[^a-zA-Z]' , repl = ' ' , string = df['review'][i])
    #Converting into lowercase
    Review = Review.lower()
    #Tokenization using split()
    review_word = Review.split()
    #Removal of stopwords
    review_word =  [word for word in review_word if not word in set(stopwords.words('english'))]
    #Stemming
    ps = PorterStemmer()
    review1 =  [ps.stem(word) for word in review_word]
    Review= ' '.join(review1)
    corpus.append(Review)

In [63]:
corpus[0]

'also known differ form hous exorc messi br br littl film take serious kill entertain valu br br whatsoev br br spare plot involv european tourist elk sommer br br chanc run telli savala look like devil br br saw fresco squar sommer given ride br br mysteri hous countri savala happen br br butler mistaken long dead woman real br br soap opera theatric begin hous blind matriarch br br husband affair dead woman br br matriarch son fiance coupl gave sommer ride br br well woman give chauffeur uh back seat drive br br lesson husband know care eventu br br cast kill sommer drug rape br br escap viewer taken climax board empti br br airplan must resembl empti theater br br thing play br br altern version hous exorc scene br br ad involv priest br br vh copi elit entertain crystal clear br br letterbox extra end credit delet sex br br gore scene br br mario bava direct fast furiou screenplay br br aw half bake idea abandon plotlin br br stun conveni noth propel thing br br sort forward direct

## Vectorization: TfidfVectorizer

In [66]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features=5000)
X = cv.fit_transform(corpus).toarray()

In [67]:
X.shape

(10000, 5000)

In [68]:
y = df.iloc[:,-1].values

## Train - Test Split

In [72]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

In [73]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2 , random_state = 104)

## Naive Bayes

In [77]:
from sklearn.naive_bayes import GaussianNB , MultinomialNB , BernoulliNB

#Training GaussainNB
clf1 = GaussianNB()
clf1.fit(X_train , y_train)
y_pred_GNB = clf1.predict(X_test)

#Training MultinomialNB
clf2 = MultinomialNB()
clf2.fit(X_train,y_train)
y_pred_MNB = clf2.predict(X_test)

#Training  BernoulliNB
clf3 = BernoulliNB()
clf3.fit(X_train , y_train)
y_pred_BNB = clf3.predict(X_test)

# Accuracy Score and confusion matrix: Gaussian Naive Bayes 
print("Gaussian NB " , accuracy_score(y_test,y_pred_GNB))
cm_GNB = confusion_matrix(y_test,y_pred_GNB)
print(cm_GNB)

# Accuracy Score and confusion matrix: Multinommial Naive Bayes 
print("Multinomail NB ", accuracy_score(y_test,y_pred_MNB))
cm_MNB = confusion_matrix(y_test , y_pred_MNB)
print(cm_MNB)

# Accuracy Score and confusion matrix: Bernoulli Naive Bayes
print("Bernoulli NB ",accuracy_score(y_test,y_pred_BNB))
cm_BNB = confusion_matrix (y_test , y_pred_BNB)
print(cm_BNB)

Gaussian NB  0.744
[[779 218]
 [294 709]]
Multinomail NB  0.85
[[837 160]
 [140 863]]
Bernoulli NB  0.845
[[847 150]
 [160 843]]


## Random Forest

In [78]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf = RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred_RF = rf.predict(X_test)

# Accuracy Score and confusion matrix: Random Forest
print("RadomForestClassifier ",accuracy_score(y_test , y_pred_RF ))
cm_RandomForest = confusion_matrix(y_test , y_pred_RF)
print(cm_RandomForest)

RadomForestClassifier  0.8415
[[841 156]
 [161 842]]


## Logistic Regression

In [80]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter = 1000)
model.fit(X_train , y_train )
y_pred_LogisticReg  = model.predict(X_test)

# Accuracy Score and confusion matrix: Logistic Regression
print("Logistic Regression " , accuracy_score (y_test, y_pred_LogisticReg))
cm_logisticReg = confusion_matrix(y_test , y_pred_LogisticReg)
print(cm_logisticReg)

Logistic Regression  0.8665
[[843 154]
 [113 890]]


### Comparing performance

In [83]:
print("Comparision between different algorithms: ")
print("Gaussian NB " , accuracy_score(y_test,y_pred_GNB))
print("Multinomail NB ", accuracy_score(y_test,y_pred_MNB))
print("Bernoulli NB ",accuracy_score(y_test,y_pred_BNB))
print("RadomForestClassifier ",accuracy_score(y_test , y_pred_RF ))
print("Logistic Regression ",accuracy_score(y_test , y_pred_LogisticReg))

Comparision between different algorithms: 
Gaussian NB  0.744
Multinomail NB  0.85
Bernoulli NB  0.845
RadomForestClassifier  0.8415
Logistic Regression  0.8665
