# Sentiment Analysis on a Newly Released Movie
By I Nengah Dharma Pradnyandita 

## Import Libraries

In [49]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import uuid
# Natural Language ToolKit Library
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Model classification 
from sklearn.naive_bayes import MultinomialNB
# Measure the prediction speed 
import time
# Saving the model
import pickle


In [2]:
# Fetch the NLTK stop-word corpus; the review-cleaning function reads its
# English word list.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nenga\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

## Import Dataset 

In [3]:
# Load the labelled review dataset; the path is relative to the working directory.
df = pd.read_csv('labeled_Movie_review_sentiment_dune.csv')

In [4]:
# Count the missing values in each column.
df.isnull().sum()

review-data href               164
audience-reviews__name          20
audience-reviews__name href    170
audience-reviews__duration       0
audience-reviews__review         0
Sentiment                        0
dtype: int64

In [5]:
# Original legend (kept for reference):
#   1 means Positive review
#   2 means Negative review
#   3 means Neutral review
# NOTE(review): the counts printed below contain the labels 0, 1 and 2 and no 3,
# so this legend does not match the data as loaded. Confirm the real
# label-to-sentiment mapping against the labelling source.
df['Sentiment'].value_counts()

1    407
0     79
2     24
Name: Sentiment, dtype: int64

## Stemming the data

In [6]:
# Porter stemmer instance used by the review-cleaning function.
# Only the review text column is stemmed.
stem_df = PorterStemmer()

In [7]:
_ENGLISH_STOP_WORDS = None  # filled in on first use by _english_stop_words()


def _english_stop_words():
  """Return NLTK's English stop words as a frozenset, loading them only once."""
  global _ENGLISH_STOP_WORDS
  if _ENGLISH_STOP_WORDS is None:
    # stopwords.words() builds a fresh list on every call; cache it in a set so
    # each token costs one hash lookup instead of a corpus read plus list scan.
    _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
  return _ENGLISH_STOP_WORDS


def stemming_df(review):
  """Clean a single review and return its stemmed tokens joined by spaces.

  Processing order:
    1. Replace @mentions, URLs (``scheme://...``) and every character that is
       not an ASCII letter, digit, space or tab with a space.
    2. Lower-case the text and split it on whitespace.
    3. Drop tokens found in NLTK's English stop-word list.
    4. Stem the remaining tokens with the shared ``stem_df`` stemmer.

  Args:
    review: the raw review text (a ``str``).

  Returns:
    A ``str`` containing the stemmed tokens separated by single spaces; empty
    if nothing survives the filtering.
  """
  stop_words = _english_stop_words()
  # Raw string: \w, \/ and \S are regex escapes, not Python string escapes, so
  # the pattern no longer triggers invalid-escape warnings. The regex itself is
  # unchanged.
  cleaned = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", review)
  tokens = cleaned.lower().split()
  return ' '.join(stem_df.stem(token) for token in tokens if token not in stop_words)

In [8]:
# Work on a copy: a plain `dr = df` binds both names to the same DataFrame, so
# adding the cleaned-text column would silently modify `df` as well.
dr = df.copy()
# Cleaned, stemmed version of every review, stored in a new `review` column.
dr['review'] = dr['audience-reviews__review'].apply(stemming_df)

In [9]:
# Spot-check the cleaned text of the review at position 1.
dr['review'].iloc[1]

'seen star war desert zendaya beauti never buy attempt act rest em job well though new ground broken one ok watch'

## Splitting Dataset 

In [10]:
# Features: the cleaned review strings. Target: the Sentiment labels.
X = dr['review'].values
y = dr['Sentiment'].values

In [12]:
# Hold out 20% of the reviews for testing; random_state is fixed to 2.
# NOTE(review): the split is not stratified although the label counts shown
# earlier are imbalanced. Consider whether stratify=y is wanted.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

## Convert the review text to numerical values

In [13]:
# Converting into numerical data.
# The vectorizer is fitted on the training reviews only and then applied,
# unchanged, to the test reviews.
# NOTE: Xtrain and Xtest are rebound from raw text to the transformed output, so
# re-running this cell on its own would feed the already-transformed matrices
# back into the vectorizer. Re-run the split cell first.
vectorizer = TfidfVectorizer()
Xtrain = vectorizer.fit_transform(Xtrain)
Xtest = vectorizer.transform(Xtest)

In [14]:
# Inspect the transformed training matrix.
print(Xtrain)

  (0, 519)	0.1486252548638906
  (0, 1513)	0.17049244489902232
  (0, 2291)	0.2601638907180995
  (0, 1965)	0.2475024982888377
  (0, 274)	0.18660781159600848
  (0, 941)	0.17753210772757216
  (0, 1410)	0.08302368475849554
  (0, 454)	0.17049244489902232
  (0, 2176)	0.37321562319201695
  (0, 765)	0.17753210772757216
  (0, 2336)	0.12100624149061341
  (0, 633)	0.07966039424603798
  (0, 1651)	0.18660781159600848
  (0, 2050)	0.13801033418600792
  (0, 133)	0.16474062156087677
  (0, 739)	0.14287343152574508
  (0, 1545)	0.19939929776270385
  (0, 1751)	0.17753210772757216
  (0, 1512)	0.14034826926769514
  (0, 420)	0.18660781159600848
  (0, 1086)	0.13008194535904974
  (0, 1315)	0.16474062156087677
  (0, 2140)	0.12837624960590022
  (0, 1101)	0.19939929776270385
  (0, 50)	0.10650905957076855
  :	:
  (406, 263)	0.1795432546612269
  (406, 36)	0.10953988717573049
  (406, 229)	0.09378296348660288
  (406, 48)	0.08871747012343034
  (406, 621)	0.1522388784875139
  (406, 1105)	0.1623466383312275
  (406, 687)	0

In [15]:
# Inspect the transformed test matrix.
print(Xtest)

  (0, 2286)	0.20224104566705345
  (0, 1965)	0.24564767293699186
  (0, 1803)	0.19811059112043272
  (0, 1423)	0.2428639912839189
  (0, 1408)	0.3270120552673881
  (0, 1364)	0.27859295146273283
  (0, 1335)	0.2516162371407084
  (0, 1324)	0.23446519957186332
  (0, 1209)	0.2051542465512066
  (0, 1185)	0.3016208060316937
  (0, 902)	0.3016208060316937
  (0, 739)	0.28360542799744964
  (0, 459)	0.2655900499632055
  (0, 36)	0.22834798870301212
  (0, 1)	0.2655900499632055
  (1, 2369)	0.1509289004490776
  (1, 2358)	0.21652085129629656
  (1, 2192)	0.2832998804065977
  (1, 1987)	0.09847433273195447
  (1, 1924)	0.1565696211614256
  (1, 1851)	0.18985129465964504
  (1, 1489)	0.24319040793294808
  (1, 1461)	0.09758978717577456
  (1, 1334)	0.1565696211614256
  (1, 1328)	0.13651218138634197
  :	:
  (98, 45)	0.655821652008293
  (99, 1989)	1.0
  (100, 2254)	0.4285463139045196
  (100, 2186)	0.4285463139045196
  (100, 1062)	0.40105502227208095
  (100, 813)	0.3568667941084293
  (100, 631)	0.4285463139045196
  (1

## Train the Naive Bayes model

In [21]:
# Fit a multinomial Naive Bayes classifier on the vectorized training data,
# then predict labels for the test split.
mnb = MultinomialNB()
modelnb = mnb.fit(Xtrain,ytrain)
ypred1 = modelnb.predict(Xtest)

## Evaluate the model 

In [52]:
# Time only the two predict() calls, so the reported figure is prediction time
# and excludes metric computation and printing. perf_counter() is the
# high-resolution clock intended for measuring short intervals; time.time() is
# wall-clock time and can be coarse or jump.
start_ts = time.perf_counter()
train_pred = modelnb.predict(Xtrain)
test_pred = modelnb.predict(Xtest)
end_ts = time.perf_counter()
# Accuracy score on the training data
print('Accuracy Score on the training data :', accuracy_score(ytrain, train_pred))
# Accuracy score on the test data
print('Accuracy Score on the test data :', accuracy_score(ytest, test_pred))
# Print the speed of the prediction
print(f"Prediction speed [s]: {(end_ts-start_ts):.3f}")

Accuracy Score on the training data : 0.8088235294117647
Accuracy Score on the test data : 0.7549019607843137
Prediction speed [s]: 0.069


## Save the model

In [26]:
filename = 'trained_model_NLP_dune.sav'
# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes (a bare open() handle is never closed explicitly).
# NOTE(review): only the classifier is saved. Turning new review text into
# features also needs the fitted `vectorizer`, so consider persisting it too.
with open(filename, 'wb') as model_file:
    pickle.dump(modelnb, model_file)

In [42]:
# Reload the model from the file written by the save cell (`filename`) instead
# of an absolute path that only exists on one machine, and close the handle
# when done.
# pickle.load can execute arbitrary code, so only load files you created yourself.
with open(filename, 'rb') as model_file:
    file = pickle.load(model_file)

In [43]:
# Show the cleaned text of the review at position 100.
X[100]

'aw took three day watch entireti pain view piec garbag take origin day week'

In [48]:
# Compare the true label of one held-out review with the prediction of the
# model reloaded from disk.
sample_idx = 12
print(ytest[sample_idx])
pred = file.predict(Xtest[sample_idx])
print(pred)

1
[1]
