## movie review _classification in Nlp 

### importing important libraries 

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score , classification_report , confusion_matrix
import pickle
import seaborn as sns

#### load movies review data and show top 5 rows

In [2]:
df=pd.read_csv("IMDB Dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


#### perform  EDA

In [4]:
df.shape

(50000, 2)

In [5]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [6]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [8]:
df['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [9]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

#### Apply labelencoding to make target feature into numerical(positive :1,negative : 0

In [10]:
label=LabelEncoder()

In [11]:
df['sentiment']=label.fit_transform(df['sentiment'])

In [12]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


#### Divide data into independent and dependent

In [13]:
x=df['review'][0:200]
y=df['sentiment'][0:200]

#### Remove all special and numeric character from data and also remove stopwords and apply stemming

In [14]:
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [15]:
ps=PorterStemmer()
corpus=[]

for i in range(len(x)):
    review=re.sub("[^a-zA-Z]"," ",x[i])
    review=review.lower()
    review.split()
    review=[ps.stem(word) for word in review if word not in   set(stopwords.words("english"))]
    review= "".join(review)
    corpus.append(review)

In [16]:
corpus

['ne f he her revewer h enne h fer wchng ju   z epe u ll be hke  he re rgh   h  excl wh hppene wh e  br    br   he fr hng h ruck e bu z w  brul n unflnchng cene f vlence  whch e n rgh fr he wr g  ru e  h  n  hw fr he fn here r   h hw pull n punche wh regr  rug  ex r vlence    hrcre  n he clc ue f he wr  br    br     clle z  h  he nckne gven  he wl xu ecur e penenr   fcue nl n eerl c  n experenl ecn f he prn where ll he cell hve gl frn n fce nwr   prvc  n hgh n he gen  e c  he  n  rn  ul  gng  ln  chrn  ln  rh n re     cuffle  eh re  g elng n h greeen re never fr w  br    br    wul  he n ppel f he hw  ue  he fc h  ge where her hw wuln  re  frge pre pcure pne fr nre uence  frge chr  frge rnce   z en  e run  he fr epe  ever w ruck e   n  w urrel   culn    w re fr   bu   wche re   evelpe  e fr z  n g ccue  he hgh level f grphc vlence  n ju vlence  bu njuce  crke gur wh ll be l u fr  nckel  ne wh ll kll n rer n ge w wh   well nnere  le cl ne beng urne n prn bche ue  her lck f ree kll r prn 

#### apply tfid vectorizer to make text data into vectors

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features=5000)
X=cv.fit_transform(corpus).toarray()


In [18]:
X.shape

(200, 4016)

#### split data into train and test

In [19]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=101)

In [20]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((160, 4016), (40, 4016), (160,), (40,))

#### Define naive-bayes model

In [21]:
from sklearn.naive_bayes import MultinomialNB

In [22]:
nvb=MultinomialNB()
nvb.fit(x_train,y_train)

#### TEST MODEL USING TEST DATA

In [23]:
pred=nvb.predict(x_test)
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

#### Check accuracy_score,confusion_matrix and classification_report

In [24]:
print(accuracy_score(y_test,pred))
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

0.65
[[26  0]
 [14  0]]
              precision    recall  f1-score   support

           0       0.65      1.00      0.79        26
           1       0.00      0.00      0.00        14

    accuracy                           0.65        40
   macro avg       0.33      0.50      0.39        40
weighted avg       0.42      0.65      0.51        40



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### difference between actual and predicted data

In [25]:
pd.DataFrame(np.c_[y_test,pred],columns=['actual','predicted'])

Unnamed: 0,actual,predicted
0,0,0
1,1,0
2,1,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,1,0
9,0,0


#### save my trained naive-bayes model and TfidfVectorizer

In [26]:
pickle.dump(cv,open('count-Vectorizer.pkl','wb'))
pickle.dump(nvb,open('movies_review_classification.pkl','wb'))


#### load my naive-bayes model and tfidfVectorizer

In [32]:
save_cv=pickle.load(open('count-Vectorizer.pkl','rb'))
model=pickle.load(open('movies_review_classification.pkl','rb'))

#### define my function to test model

In [33]:
"""def test_model(sentence):
    sen=save_cv.transform([sentence]).toarray()
    res=model.predict(sen)[0]
    if res==1:
        return "positive review"
    else:
        return "negative review"""

'def test_model(sentence):\n    sen=save_cv.transform([sentence]).toarray()\n    res=model.predict(sen)[0]\n    if res==1:\n        return "positive review"\n    else:\n        return "negative review'

In [34]:
def test_model(sentence):
    sen = save_cv.transform([sentence]).toarray()
    res = model.predict(sen)[0]
    if res == 1:
        return 'positive review'
    else:
        return 'negative review'

#### test with positive review and see results

In [37]:
en ='A wonderful little productio'
res = test_model(sen)
print(res)

negative review


#### test with negative review

In [38]:
sen ='this is the worst movie, i have ever seen in my life'
res = test_model(sen)
print(res)

negative review
