### Check directory and change the path

In [1]:
import os
import sys

In [2]:
%pwd

'/home/cdot/PycharmProjects/IMDB_Movie_talk/research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'/home/cdot/PycharmProjects/IMDB_Movie_talk'

### Import libraries and load the dataset

In [5]:
import os
import sys
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
from nltk.corpus import stopwords

In [6]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/cdot/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/cdot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
imdb_df = pd.read_csv("IMDBDataset.csv")
df = imdb_df

### Analysing the data

In [8]:
df.shape

(50000, 2)

In [9]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [10]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [11]:
df.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [13]:
feature = df["review"]
label = df["sentiment"]

print(type(feature))
print(feature[0:3])

<class 'pandas.core.series.Series'>
0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
Name: review, dtype: object


In [14]:
label.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

### Preprocessing the data

#### Stemming

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
le = LabelEncoder()
label = le.fit_transform(label)

In [17]:
label[1:10]

array([1, 1, 0, 1, 1, 1, 0, 0, 1])

In [18]:
import re
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

In [21]:
ps = PorterStemmer()


In [22]:
len(stopwords.words('english'))

179

In [23]:
spwd = []
need = ["not","no"]
for w in stopwords.words('english'):
    if w not in need:
        spwd.append(w)

print(len(spwd))

177


In [24]:
corpus = []
for row in feature:
    review = str(sent_tokenize(row))
    # review = re.sub('[^a-zA-Z0-9]', " ", sent)
    review = word_tokenize(review)
    # print(review)
    # print()
    review = [w.lower() for w in review if w.isalpha()]
    # print(review)
    # print()
    review = [ps.stem(w) for w in review if w not in stopwords.words('english')]
    # print(review)
    # print()
    review = " ".join(review)
    corpus.append(review)
    
        

#### Lemmatizatoin

In [25]:
from nltk.stem import WordNetLemmatizer

In [26]:
wnl = WordNetLemmatizer()

### Bag of Words

#### Sample Corpus with 3 records

In [27]:
corpus1 = []
for row in feature[0:3]:
    review = str(sent_tokenize(row))
    # review = re.sub('[^a-zA-Z0-9]', " ", sent)
    review = word_tokenize(review)
    # print(review)
    # print()
    review = [w.lower() for w in review if w.isalpha()]
    # print(review)
    # print()
    review = [ps.stem(w) for w in review if w not in stopwords.words('english')]
    # print(review)
    # print()
    review = " ".join(review)
    corpus1.append(review)
    

In [28]:
print(feature[0])
print(corpus1[0])

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

In [48]:
feature1 = corpus
print(type(feature))
print(type(feature1))

<class 'list'>
<class 'list'>


In [30]:
cv1 = CountVectorizer(binary = True)
x1 = cv1.fit(corpus1)
feature_name1 = cv1.get_feature_names_out()
print(x1.vocabulary_)



{'one': 142, 'review': 174, 'mention': 132, 'watch': 237, 'oz': 146, 'episod': 58, 'hook': 99, 'right': 175, 'exactli': 62, 'happen': 94, 'br': 15, 'first': 73, 'thing': 221, 'struck': 208, 'brutal': 17, 'unflinch': 231, 'scene': 181, 'violenc': 235, 'set': 189, 'word': 247, 'go': 85, 'show': 194, 'faint': 68, 'heart': 96, 'timid': 224, 'pull': 164, 'punch': 165, 'regard': 172, 'drug': 51, 'sex': 190, 'hardcor': 95, 'classic': 26, 'use': 233, 'call': 18, 'nicknam': 141, 'given': 83, 'oswald': 145, 'maximum': 130, 'secur': 185, 'state': 205, 'penitentari': 150, 'focus': 75, 'mainli': 123, 'emerald': 55, 'citi': 24, 'experiment': 64, 'section': 184, 'prison': 160, 'cell': 20, 'glass': 84, 'front': 78, 'face': 66, 'inward': 106, 'privaci': 161, 'high': 97, 'agenda': 3, 'em': 54, 'home': 98, 'mani': 126, 'aryan': 9, 'muslim': 137, 'gangsta': 80, 'latino': 116, 'christian': 23, 'italian': 108, 'irish': 107, 'scuffl': 182, 'death': 39, 'stare': 204, 'dodgi': 49, 'deal': 38, 'shadi': 192, 'ag

In [31]:
voc = cv1.transform(corpus1)
vect = voc.toarray()
print(vect)

[[1 0 0 1 1 0 0 1 1 1 1 0 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 0 0 1 0 0 0 1 1 0
  1 1 1 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 1 1 0 0 1 0 1 0 1 1 1 0 1 1 1 0 1 0
  0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 1 0 0 1 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 1
  1 0 0 0 1 0 0 1 1 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1
  0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 0 1 1 0 1 0 0 0 0 1 0 1 1 0 1 1 1
  0 1 1 0 1 1 0 0 0 1 1 0 1 0 1 1 0 0 1 1 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 1
  0 0 0 1 0 1 0 0 1 0 1 0 0 1 1 1 0 1 1 1 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 0
  0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0
  0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 1 0 0 1 1 0 0 0 1 0 0 0 1 0 0 0 1 0 1
  1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0
  1 0 0 0 1 1 0 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0
  0 1 0 1 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  1 1 1 0 0 1 0 0 0 0 0 1 1 0 0 0

####  Main corpus for total records

In [32]:
cv = CountVectorizer(binary=True)
cv.fit(corpus)
vect = cv.transform(corpus)

In [33]:
print("The Length of Vocabulary is: ", len(cv.get_feature_names_out()))

The Length of Vocabulary is:  66135


In [34]:
print("The Shape of Vector is: ",vect.shape)

The Shape of Vector is:  (50000, 66135)


In [35]:
cv.vocabulary_

{'one': 41528,
 'review': 48275,
 'mention': 36918,
 'watch': 63485,
 'oz': 42353,
 'episod': 18203,
 'hook': 26681,
 'right': 48527,
 'exactli': 18744,
 'happen': 25015,
 'br': 6996,
 'first': 20214,
 'thing': 58175,
 'struck': 55723,
 'brutal': 7673,
 'unflinch': 60998,
 'scene': 50559,
 'violenc': 62768,
 'set': 51591,
 'word': 64756,
 'go': 23114,
 'show': 52378,
 'faint': 19175,
 'heart': 25496,
 'timid': 58570,
 'pull': 46108,
 'punch': 46142,
 'regard': 47637,
 'drug': 16544,
 'sex': 51653,
 'hardcor': 25056,
 'classic': 10811,
 'use': 61841,
 'call': 8453,
 'nicknam': 40179,
 'given': 22911,
 'oswald': 41921,
 'maximum': 36123,
 'secur': 51195,
 'state': 55030,
 'penitentari': 43409,
 'focus': 20670,
 'mainli': 35027,
 'emerald': 17784,
 'citi': 10704,
 'experiment': 18906,
 'section': 51190,
 'prison': 45577,
 'cell': 9450,
 'glass': 22955,
 'front': 21427,
 'face': 19105,
 'inward': 28733,
 'privaci': 45587,
 'high': 26083,
 'agenda': 801,
 'em': 17728,
 'home': 26549,
 'mani

### Word2Vec

In [36]:
from gensim.models import Word2Vec

In [37]:

list_text = [s.split() for s in feature]
print(list_text[0])

['one', 'review', 'mention', 'watch', 'oz', 'episod', 'hook', 'right', 'exactli', 'happen', 'br', 'br', 'first', 'thing', 'struck', 'oz', 'brutal', 'unflinch', 'scene', 'violenc', 'set', 'right', 'word', 'go', 'show', 'faint', 'heart', 'timid', 'show', 'pull', 'punch', 'regard', 'drug', 'sex', 'violenc', 'hardcor', 'classic', 'use', 'br', 'br', 'call', 'oz', 'nicknam', 'given', 'oswald', 'maximum', 'secur', 'state', 'penitentari', 'focus', 'mainli', 'emerald', 'citi', 'experiment', 'section', 'prison', 'cell', 'glass', 'front', 'face', 'inward', 'privaci', 'high', 'agenda', 'em', 'citi', 'home', 'mani', 'aryan', 'muslim', 'gangsta', 'latino', 'christian', 'italian', 'irish', 'scuffl', 'death', 'stare', 'dodgi', 'deal', 'shadi', 'agreement', 'never', 'far', 'br', 'br', 'would', 'say', 'main', 'appeal', 'show', 'due', 'fact', 'goe', 'show', 'would', 'dare', 'forget', 'pretti', 'pictur', 'paint', 'mainstream', 'audienc', 'forget', 'charm', 'forget', 'romanc', 'oz', 'mess', 'around', 'firs

In [38]:
cbow = Word2Vec(list_text,vector_size=300, min_count=10)
print(cbow)

Word2Vec<vocab=18561, vector_size=300, alpha=0.025>


In [39]:
cbow.wv.index_to_key[:10]

['br', 'movi', 'film', 'one', 'like', 'time', 'good', 'make', 'get', 'charact']

In [41]:
len(cbow.wv.index_to_key)

18561

In [42]:
def document_vector(doc):
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    
    # doc1 contains those words of the document which are included in the vocab
    doc1 = [word for word in doc.split() if word in cbow.wv.index_to_key]
    
    wv1 = []  # this will contain the WE of all the vocab words from the doc
    for word in doc1:
        wv1.append(cbow.wv.get_vector(word))
    wv1_ = np.array(wv1)
    wv1_mean = wv1_.mean(axis=0)
    return wv1_mean

# np.mean(model[doc], axis=0)

In [44]:
temp_review = df["review"].apply(document_vector)

  wv1_mean = wv1_.mean(axis=0)
  ret = ret.dtype.type(ret / rcount)


In [50]:
len(temp_review)

50000

In [52]:
temp_review[0].shape

(300,)

In [53]:

# Combining all the document vectors into a singl numpy array (tweets_vec)
embedding_size = 300
tweets_vec = np.ones((len(temp_review), embedding_size))*np.nan
for i in range(tweets_vec.shape[0]):
    tweets_vec[i,:] = temp_review.iloc[i]

tweets_vec.shape # this itself is your final FEATURE MATRIX

(50000, 300)

In [56]:

 
# Create a new DF to store these new documnent features
df = pd.DataFrame(tweets_vec)
df['y'] = label
df.dropna(how='any', axis=0, inplace=True)

In [57]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,291,292,293,294,295,296,297,298,299,y
0,0.027198,-0.174467,0.051343,0.127409,-0.165963,-0.239659,0.053885,0.340643,-0.062303,-0.113435,...,0.002009,-0.039062,-0.057389,-0.06295,0.064735,-0.053643,-0.214199,0.126458,-0.22792,1
1,-0.080131,-0.108699,-0.151516,0.102549,-0.014205,-0.225853,-0.147408,0.36004,0.00465,-0.33168,...,0.122931,0.100293,0.0153,-0.090814,0.055899,-0.225455,-0.102607,0.135652,-0.173392,1
2,-0.072536,-0.155268,-0.068612,0.067654,-0.069609,-0.151425,-0.066818,0.323395,-0.042048,-0.209786,...,0.01223,-0.036098,-0.019296,0.018386,-0.010224,-0.111716,-0.202387,0.129005,-0.235551,1
3,0.045804,-0.152117,0.019586,0.245824,-0.21467,-0.145995,-0.077055,0.530866,-0.14845,-0.126891,...,-0.03671,-0.144301,-0.193688,-0.107201,0.002744,-0.02316,-0.000735,0.076472,-0.093292,0
4,-0.046833,-0.275197,0.027965,0.187194,-0.089729,-0.259845,-0.019544,0.258328,-0.200672,-0.197057,...,-0.024649,-0.054783,-0.060997,-0.035893,0.086026,-0.137762,-0.185741,0.120904,-0.254451,1


In [58]:
df.shape

(49987, 301)

In [59]:
X_word_emb = df.drop('y', axis=1)
y = df['y']
X_word_emb.shape

(49987, 300)

In [60]:

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

In [61]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
LR1 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=0.4, random_state=42)
WE_pipe = Pipeline([('SC', StandardScaler()), ('LR', LR1)] )

results = cross_validate(WE_pipe, X_word_emb, y, cv=kfold, scoring='accuracy', return_train_score=True)

# print(results['train_score'])
print(np.round((results['train_score'].mean())*100, 2), np.round((results['train_score'].std())*100, 2)) 

# print(results['test_score'])
print(np.round((results['test_score'].mean())*100, 2), np.round((results['test_score'].std())*100, 2)) 

80.09 0.07
79.77 0.2


### Spliting teh data into train and test sets

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [32]:
X_train, X_test, y_train, y_test = train_test_split(vect, label, test_size=0.3, random_state=123)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(35000, 66135) (15000, 66135) (35000,) (15000,)


### Logistic Regression

In [33]:
lr = LogisticRegression(max_iter=10)

In [34]:
lr.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:

y_pred = lr.predict(X_test)
print(confusion_matrix(y_pred, y_test))

[[6431  870]
 [1038 6661]]


In [36]:
print(accuracy_score(y_pred, y_test))

0.8728


In [37]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87      7301
           1       0.88      0.87      0.87      7699

    accuracy                           0.87     15000
   macro avg       0.87      0.87      0.87     15000
weighted avg       0.87      0.87      0.87     15000



### Naive Bayes

In [43]:
nb = BernoulliNB()

In [44]:
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print(confusion_matrix(y_pred,y_test))

[[6524 1440]
 [ 945 6091]]


In [45]:
print(accuracy_score(y_pred,y_test))

0.841


In [46]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.87      0.82      0.85      7964
           1       0.81      0.87      0.84      7036

    accuracy                           0.84     15000
   macro avg       0.84      0.84      0.84     15000
weighted avg       0.84      0.84      0.84     15000



### Support Vector Machine


In [63]:
svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print(confusion_matrix(y_pred, y_test))

[[6428  751]
 [1041 6780]]


In [66]:
print(accuracy_score(y_pred, y_test))

0.8805333333333333


In [67]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.86      0.90      0.88      7179
           1       0.90      0.87      0.88      7821

    accuracy                           0.88     15000
   macro avg       0.88      0.88      0.88     15000
weighted avg       0.88      0.88      0.88     15000



### Decision Tree

In [50]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train,y_train)
y_pred = dtc.predict(X_test)
print(confusion_matrix(y_pred, y_test))


[[5384 2139]
 [2085 5392]]


In [51]:
print(accuracy_score(y_pred, y_test))

0.7184


In [52]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.72      0.72      0.72      7523
           1       0.72      0.72      0.72      7477

    accuracy                           0.72     15000
   macro avg       0.72      0.72      0.72     15000
weighted avg       0.72      0.72      0.72     15000



### Random Forest

In [53]:
from sklearn.ensemble import RandomForestClassifier

In [54]:
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)
print(confusion_matrix(y_pred, y_test))

[[6316 1186]
 [1153 6345]]


In [55]:
print(accuracy_score(y_pred, y_test))

0.8440666666666666


In [56]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.85      0.84      0.84      7502
           1       0.84      0.85      0.84      7498

    accuracy                           0.84     15000
   macro avg       0.84      0.84      0.84     15000
weighted avg       0.84      0.84      0.84     15000



### Unseen data Prediction

#### Unseen data processing for model

In [105]:
rw = "not go again to watch movie"
    

In [106]:
corpus1 = []
review = word_tokenize(rw)
    # print(review)
    # print()
review = [w.lower() for w in review if w.isalpha()]
    # print(review)
    # print()
review = [ps.stem(w) for w in review if w not in stopwords.words('english')]
    # print(review)
    # print()
review = " ".join(review)
corpus1.append(review)



In [107]:
corpus1

['go watch movi']

In [108]:
v = cv.transform(corpus1)
cv.get_feature_names_out()
cv.vocabulary_

{'one': 41528,
 'review': 48275,
 'mention': 36918,
 'watch': 63485,
 'oz': 42353,
 'episod': 18203,
 'hook': 26681,
 'right': 48527,
 'exactli': 18744,
 'happen': 25015,
 'br': 6996,
 'first': 20214,
 'thing': 58175,
 'struck': 55723,
 'brutal': 7673,
 'unflinch': 60998,
 'scene': 50559,
 'violenc': 62768,
 'set': 51591,
 'word': 64756,
 'go': 23114,
 'show': 52378,
 'faint': 19175,
 'heart': 25496,
 'timid': 58570,
 'pull': 46108,
 'punch': 46142,
 'regard': 47637,
 'drug': 16544,
 'sex': 51653,
 'hardcor': 25056,
 'classic': 10811,
 'use': 61841,
 'call': 8453,
 'nicknam': 40179,
 'given': 22911,
 'oswald': 41921,
 'maximum': 36123,
 'secur': 51195,
 'state': 55030,
 'penitentari': 43409,
 'focus': 20670,
 'mainli': 35027,
 'emerald': 17784,
 'citi': 10704,
 'experiment': 18906,
 'section': 51190,
 'prison': 45577,
 'cell': 9450,
 'glass': 22955,
 'front': 21427,
 'face': 19105,
 'inward': 28733,
 'privaci': 45587,
 'high': 26083,
 'agenda': 801,
 'em': 17728,
 'home': 26549,
 'mani

In [109]:
arr = v.toarray()
arr.shape

(1, 66135)

#### unseen preprocessed data for applying on the model

##### Logistic regression

In [110]:
lr.predict(arr)

array([1])

##### Naive bayes

In [111]:
nb.predict(arr)

array([0])

##### SVM

In [112]:
svm.predict(arr)

array([1])

##### Decision Tree

In [113]:
dtc.predict(arr)

array([1])

##### Random Forest

In [114]:
rfc.predict(arr)

array([1])