# Name : Nidhi Bangera
# Project : Sentiment Analysis using nlp libraries
# Supervised Learning

# Loading dataset

In [1]:
import nltk

In [3]:
import pandas as pd
dataset=pd.read_csv(r'C:\Users\HP\Downloads\movie_reviews.csv')
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Importing nlp library nltk

In [4]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize

# Cleaning Data

In [5]:
import re

In [6]:
#remove html tags
def clean_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean,'',text)

In [7]:
dataset['review']=dataset['review'].apply(clean_html)

In [8]:
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [9]:
#converting text into lower case
def toLower(text):
    return text.lower()

In [10]:
dataset['review']=dataset['review'].apply(toLower)

In [11]:
dataset.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [12]:
#remove special characters
def remove_special_char(text):
    return re.sub('[^a-zA-z0-9\s]', '', text)

In [13]:
dataset['review']=dataset['review'].apply(remove_special_char)

In [14]:
dataset.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


In [15]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
#converting strings into tokens
def identify_tokens(row):
    review = row['review']
    tokens = nltk.word_tokenize(review)
    return tokens
        

In [17]:
dataset['tokens']=dataset.apply(identify_tokens,axis=1)

In [18]:
dataset.head()

Unnamed: 0,review,sentiment,tokens
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,a wonderful little production the filming tech...,positive,"[a, wonderful, little, production, the, filmin..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,..."
3,basically theres a family where a little boy j...,negative,"[basically, theres, a, family, where, a, littl..."
4,petter matteis love in the time of money is a ...,positive,"[petter, matteis, love, in, the, time, of, mon..."


In [19]:
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [20]:
#lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [21]:
def lematize_list(row):
    my_list = row['tokens']
    lem_list = [lemmatizer.lemmatize(word) for word in my_list]
    return (lem_list)

In [22]:
dataset['lemmatize_words'] = dataset.apply(lematize_list, axis=1)

In [23]:
dataset.head()

Unnamed: 0,review,sentiment,tokens,lemmatize_words
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione...","[one, of, the, other, reviewer, ha, mentioned,..."
1,a wonderful little production the filming tech...,positive,"[a, wonderful, little, production, the, filmin...","[a, wonderful, little, production, the, filmin..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,...","[i, thought, this, wa, a, wonderful, way, to, ..."
3,basically theres a family where a little boy j...,negative,"[basically, theres, a, family, where, a, littl...","[basically, there, a, family, where, a, little..."
4,petter matteis love in the time of money is a ...,positive,"[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, in, the, time, of, mon..."


In [26]:
#remove stop words
stopw = set(stopwords.words("english"))                  

def remove_stops(row):
    my_list = row['lemmatize_words']
    main_words = [w for w in my_list if not w in stopw]
    return (main_words)

In [27]:
dataset['main'] = dataset.apply(remove_stops, axis=1)

In [28]:
dataset.head()

Unnamed: 0,review,sentiment,tokens,lemmatize_words,main
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione...","[one, of, the, other, reviewer, ha, mentioned,...","[one, reviewer, ha, mentioned, watching, 1, oz..."
1,a wonderful little production the filming tech...,positive,"[a, wonderful, little, production, the, filmin...","[a, wonderful, little, production, the, filmin...","[wonderful, little, production, filming, techn..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,...","[i, thought, this, wa, a, wonderful, way, to, ...","[thought, wa, wonderful, way, spend, time, hot..."
3,basically theres a family where a little boy j...,negative,"[basically, theres, a, family, where, a, littl...","[basically, there, a, family, where, a, little...","[basically, family, little, boy, jake, think, ..."
4,petter matteis love in the time of money is a ...,positive,"[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, time, money, visually,..."


In [29]:
dataset.drop(['review','tokens','lemmatize_words'],axis=1,inplace=True)

In [30]:
dataset.head()

Unnamed: 0,sentiment,main
0,positive,"[one, reviewer, ha, mentioned, watching, 1, oz..."
1,positive,"[wonderful, little, production, filming, techn..."
2,positive,"[thought, wa, wonderful, way, spend, time, hot..."
3,negative,"[basically, family, little, boy, jake, think, ..."
4,positive,"[petter, matteis, love, time, money, visually,..."


In [31]:
def join(lst):
    return " ".join(lst)

In [32]:
dataset['main']=dataset['main'].apply(join)

In [33]:
dataset.head()

Unnamed: 0,sentiment,main
0,positive,one reviewer ha mentioned watching 1 oz episod...
1,positive,wonderful little production filming technique ...
2,positive,thought wa wonderful way spend time hot summer...
3,negative,basically family little boy jake think zombie ...
4,positive,petter matteis love time money visually stunni...


In [34]:
dataset.columns=['sentiment','review']

In [43]:
dataset.head()

Unnamed: 0,sentiment,review
0,positive,one review ha mention watch 1 oz episod youll ...
1,positive,wonder littl product film techniqu veri unassu...
2,positive,thought thi wa wonder way spend time hot summe...
3,negative,basic famili littl boy jake think zombi hi clo...
4,positive,petter mattei love time money visual stun film...


In [35]:
#coverting sentiment values to binary numbers
dataset['sentiment'].replace({'positive': 1, 'negative': 0}, inplace=True)

In [36]:
dataset.head()

Unnamed: 0,sentiment,review
0,1,one reviewer ha mentioned watching 1 oz episod...
1,1,wonderful little production filming technique ...
2,1,thought wa wonderful way spend time hot summer...
3,0,basically family little boy jake think zombie ...
4,1,petter matteis love time money visually stunni...


In [37]:
dataset.shape

(50000, 2)

In [40]:
#saving final dataset in a csv file
dataset.to_csv(r'C:\Users\HP\Downloads\movie_review.csv')

# Model Building

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

In [43]:
import numpy as np

In [44]:
reviews = np.array(dataset['review'])
sentiments = np.array(dataset['sentiment'])

# build train and test datasets
train_reviews = reviews[:35000]
train_sentiments = sentiments[:35000]
test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]

In [50]:
y_train=train_sentiments
y_test=test_sentiments

In [45]:
vectorizer=CountVectorizer()

In [46]:
X_train = vectorizer.fit_transform(train_reviews)
X_test = vectorizer.transform(test_reviews)

In [47]:
feature_names = vectorizer.get_feature_names()
print("Number of features: {}".format(len(feature_names )))
print("First 10 features :\n{}".format(feature_names [:10]))

Number of features: 166780
First 10 features :
['00', '000', '0000000000001', '00000001', '000001', '0001', '001', '0010', '002', '00383042']


# Applying logistic regression

In [49]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

In [54]:
logreg = LogisticRegression(penalty='l2', max_iter=500, C=1)
logreg.fit(X_train,y_train)

LogisticRegression(C=1, max_iter=500)

In [55]:
print("Training set score:{:.2f}".format(logreg.score(X_train,y_train)))
print("Test set score:{:.2f}".format(logreg.score(X_test,y_test)))

Training set score:1.00
Test set score:0.89


In [58]:
pred_log = logreg.predict(X_test)
confusion = confusion_matrix(y_test,pred_log)
print("Confusion Matrix:\n{}".format(confusion))

Confusion Matrix:
[[6635  855]
 [ 820 6690]]


In [60]:
scores = cross_val_score(logreg,X_train,y_train,cv=5)
print("Mean cross validation accuracy:{}".format(np.mean(scores)))

Mean cross validation accuracy:0.8800000000000001


In [70]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [71]:
precision_metric = precision_score(y_test, pred_log, average = "macro")
recall_metric = recall_score(y_test, pred_log, average = "macro")
accuracy_metric = accuracy_score(y_test, pred_log)
f1_metric = f1_score(y_test, pred_log, average = "macro")                     

In [73]:
print("Model Performance metrics:")
print("------------------------------")
print("Accuracy:",accuracy_metric)
print("Precision:",precision_metric)
print("Recall: ",recall_metric )
print("F1 Score:",f1_metric )

Model Performance metrics:
------------------------------
Accuracy: 0.8883333333333333
Precision: 0.8883433136926262
Recall:  0.8883300236978199
F1 Score: 0.8883318320168526


# Applying Multinomial Naive Bayes

In [74]:
from sklearn.naive_bayes import MultinomialNB

In [75]:
nb = MultinomialNB()
nb.fit(X_train,y_train)

MultinomialNB()

In [76]:
print("Training set score:{:.2f}".format(nb.score(X_train,y_train)))
print("Test set score:{:.2f}".format(nb.score(X_test,y_test)))

Training set score:0.92
Test set score:0.86


In [77]:
pred_nb = nb.predict(X_test)
confusion = confusion_matrix(y_test,pred_nb)
print("Confusion Matrix:\n{}".format(confusion))

Confusion Matrix:
[[6574  916]
 [1241 6269]]


In [78]:
precision_metric = precision_score(y_test, pred_nb, average = "macro")
recall_metric = recall_score(y_test, pred_nb, average = "macro")
accuracy_metric = accuracy_score(y_test, pred_nb)
f1_metric = f1_score(y_test, pred_nb, average = "macro") 

In [79]:
print("Model Performance metrics:")
print("------------------------------")
print("Accuracy:",accuracy_metric)
print("Precision:",precision_metric)
print("Recall: ",recall_metric )
print("F1 Score:",f1_metric )

Model Performance metrics:
------------------------------
Accuracy: 0.8562
Precision: 0.8568574966240448
Recall:  0.8562286332953482
F1 Score: 0.8561405220980807


# Testing the models

In [82]:
review = "the movie was great"
print(logreg.predict(vectorizer.transform([review]))[0])
print(nb.predict(vectorizer.transform([review]))[0])

1
1


In [83]:
review = "the movie was not good enough"
print(logreg.predict(vectorizer.transform([review]))[0])
print(nb.predict(vectorizer.transform([review]))[0])

0
0


In [84]:
review = "the movie was satisfactory"
print(logreg.predict(vectorizer.transform([review]))[0])
print(nb.predict(vectorizer.transform([review]))[0])

0
1
