### Check directory and change the path

In [1]:
import os
import sys

In [2]:
%pwd

'/home/cdot/PycharmProjects/IMDB_Movie_talk/research'

In [3]:
os.chdir("../")

In [4]:
%pwd

'/home/cdot/PycharmProjects/IMDB_Movie_talk'

### Import libraries and load the dataset

In [5]:
import os
import sys
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
from nltk.corpus import stopwords

In [6]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/cdot/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/cdot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [229]:
imdb_df = pd.read_csv("IMDBDataset.csv")
df = imdb_df.copy()

### Analysing the data

In [230]:
df.shape

(50000, 2)

In [231]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [232]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [233]:
df.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [234]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [235]:
feature = df["review"]
label = df["sentiment"]

print(type(feature))
print(feature[0:3])

<class 'pandas.core.series.Series'>
0    One of the other reviewers has mentioned that ...
1    A wonderful little production. <br /><br />The...
2    I thought this was a wonderful way to spend ti...
Name: review, dtype: object


In [236]:
label.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

### Preprocessing the data

#### Stemming

In [237]:
from sklearn.preprocessing import LabelEncoder

In [238]:
le = LabelEncoder()
label = le.fit_transform(label)

In [239]:
label[:10]

array([1, 1, 1, 0, 1, 1, 1, 0, 0, 1])

In [240]:
df["encoded_label"] = label
df.head(3)

Unnamed: 0,review,sentiment,encoded_label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1


In [241]:
import re
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

In [242]:
ps = PorterStemmer()


In [243]:
len(stopwords.words('english'))

179

In [244]:
spwd = []
need = ["not","no"]
for w in stopwords.words('english'):
    if w not in need:
        spwd.append(w)

print(len(spwd))

177


In [245]:
corpus = []
for row in df["review"]:
    review = re.sub('[^a-zA-Z]', " ", row)
    review = review.lower()
    review = word_tokenize(review)
    review = [ps.stem(w) for w in review if w not in stopwords.words('english')]
    review = " ".join(review)
    corpus.append(review)
    
print(corpus[:3])

['one review mention watch oz episod hook right exactli happen br br first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word br br call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away br br would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch d

In [246]:
df["stemmed_corpus"] = corpus
stem_df = df.copy()
stem_df.head(4)

Unnamed: 0,review,sentiment,encoded_label,stemmed_corpus
0,One of the other reviewers has mentioned that ...,positive,1,one review mention watch oz episod hook right ...
1,A wonderful little production. <br /><br />The...,positive,1,wonder littl product br br film techniqu unass...
2,I thought this was a wonderful way to spend ti...,positive,1,thought wonder way spend time hot summer weeke...
3,Basically there's a family where a little boy ...,negative,0,basic famili littl boy jake think zombi closet...


#### Lemmatizatoin

In [25]:
from nltk.stem import WordNetLemmatizer

In [26]:
wnl = WordNetLemmatizer()

### Bag of Words

#### Sample Corpus with 3 records

In [27]:
corpus1 = []
for row in feature[0:3]:
    review = str(sent_tokenize(row))
    # review = re.sub('[^a-zA-Z0-9]', " ", sent)
    review = word_tokenize(review)
    # print(review)
    # print()
    review = [w.lower() for w in review if w.isalpha()]
    # print(review)
    # print()
    review = [ps.stem(w) for w in review if w not in stopwords.words('english')]
    # print(review)
    # print()
    review = " ".join(review)
    corpus1.append(review)
    

In [28]:
print(feature[0])
print(corpus1[0])

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

In [48]:
feature1 = corpus
print(type(feature))
print(type(feature1))

<class 'list'>
<class 'list'>


In [30]:
cv1 = CountVectorizer(binary = True)
x1 = cv1.fit(corpus1)
feature_name1 = cv1.get_feature_names_out()
print(x1.vocabulary_)



{'one': 142, 'review': 174, 'mention': 132, 'watch': 237, 'oz': 146, 'episod': 58, 'hook': 99, 'right': 175, 'exactli': 62, 'happen': 94, 'br': 15, 'first': 73, 'thing': 221, 'struck': 208, 'brutal': 17, 'unflinch': 231, 'scene': 181, 'violenc': 235, 'set': 189, 'word': 247, 'go': 85, 'show': 194, 'faint': 68, 'heart': 96, 'timid': 224, 'pull': 164, 'punch': 165, 'regard': 172, 'drug': 51, 'sex': 190, 'hardcor': 95, 'classic': 26, 'use': 233, 'call': 18, 'nicknam': 141, 'given': 83, 'oswald': 145, 'maximum': 130, 'secur': 185, 'state': 205, 'penitentari': 150, 'focus': 75, 'mainli': 123, 'emerald': 55, 'citi': 24, 'experiment': 64, 'section': 184, 'prison': 160, 'cell': 20, 'glass': 84, 'front': 78, 'face': 66, 'inward': 106, 'privaci': 161, 'high': 97, 'agenda': 3, 'em': 54, 'home': 98, 'mani': 126, 'aryan': 9, 'muslim': 137, 'gangsta': 80, 'latino': 116, 'christian': 23, 'italian': 108, 'irish': 107, 'scuffl': 182, 'death': 39, 'stare': 204, 'dodgi': 49, 'deal': 38, 'shadi': 192, 'ag

In [31]:
voc = cv1.transform(corpus1)
vect = voc.toarray()
print(vect)

[[1 0 0 1 1 0 0 1 1 1 1 0 1 1 1 1 0 1 1 0 1 0 1 1 1 1 1 0 0 1 0 0 0 1 1 0
  1 1 1 1 0 0 1 0 0 0 0 0 0 1 0 1 1 0 1 1 0 0 1 0 1 0 1 1 1 0 1 1 1 0 1 0
  0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 1 0 0 1 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 1
  1 0 0 0 1 0 0 1 1 0 1 0 0 0 1 1 1 0 1 1 0 0 1 1 1 1 0 1 0 1 1 1 1 1 1 1
  0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 0 1 1 0 1 0 0 0 0 1 0 1 1 0 1 1 1
  0 1 1 0 1 1 0 0 0 1 1 0 1 0 1 1 0 0 1 1 0 0 0 0 1 1 0 1 1 0 0 0 0 1 0 1
  0 0 0 1 0 1 0 0 1 0 1 0 0 1 1 1 0 1 1 1 0 1 0 0 0 1 0 0 0 0 0 1 0 1 0 0
  0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0
  0 0 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 1 0 0 1 1 0 0 0 1 0 0 0 1 0 0 0 1 0 1
  1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 1 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0
  1 0 0 0 1 1 0 1 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0
  0 1 0 1 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  1 1 1 0 0 1 0 0 0 0 0 1 1 0 0 0

####  Main corpus for total records

In [88]:
cv = CountVectorizer(binary=True, max_features=100, min_df=10)
cv.fit(corpus)
vect = cv.transform(corpus)

In [89]:
print("The Length of Vocabulary is: ", len(cv.get_feature_names_out()))

The Length of Vocabulary is:  100


In [90]:
print("The Shape of Vector is: ",vect.shape)

The Shape of Vector is:  (50000, 100)


In [91]:
cv.vocabulary_

{'one': 58,
 'watch': 93,
 'right': 68,
 'br': 11,
 'first': 28,
 'thing': 83,
 'scene': 71,
 'set': 75,
 'go': 32,
 'show': 76,
 'use': 91,
 'mani': 50,
 'never': 54,
 'would': 98,
 'say': 70,
 'fact': 24,
 'around': 5,
 'ever': 22,
 'got': 34,
 'get': 30,
 'well': 95,
 'turn': 89,
 'wonder': 96,
 'littl': 41,
 'film': 26,
 'old': 57,
 'time': 87,
 'give': 31,
 'actor': 1,
 'see': 72,
 'perform': 61,
 'great': 35,
 'life': 39,
 'realli': 67,
 'come': 14,
 'play': 62,
 'everi': 23,
 'thought': 86,
 'way': 94,
 'plot': 63,
 'charact': 13,
 'even': 21,
 'point': 64,
 'still': 80,
 'love': 46,
 'year': 99,
 'interest': 37,
 'think': 84,
 'movi': 52,
 'make': 48,
 'like': 40,
 'real': 66,
 'seem': 73,
 'peopl': 60,
 'director': 18,
 'new': 55,
 'anoth': 4,
 'know': 38,
 'look': 44,
 'live': 42,
 'best': 9,
 'find': 27,
 'act': 0,
 'good': 33,
 'direct': 17,
 'cast': 12,
 'work': 97,
 'stori': 81,
 'seen': 74,
 'role': 69,
 'believ': 8,
 'back': 6,
 'quit': 65,
 'funni': 29,
 'bad': 7,
 'al

In [92]:
print(vect.toarray()[1])

[0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 1 0
 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0
 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 1 1 0 0 0]


In [95]:
vector_values = pd.DataFrame(vect.toarray())
print(vector_values.head())

   0   1   2   3   4   5   6   7   8   9   ...  90  91  92  93  94  95  96  \
0   0   0   0   0   0   1   0   0   0   0  ...   0   1   0   1   0   1   0   
1   0   1   0   0   0   0   0   0   0   0  ...   0   1   0   1   0   1   1   
2   0   0   0   0   0   0   0   0   0   0  ...   0   0   0   1   1   1   1   
3   0   0   0   0   0   0   0   0   0   0  ...   0   0   0   1   0   1   0   
4   1   0   0   0   1   0   0   0   0   1  ...   0   0   0   1   1   0   0   

   97  98  99  
0   0   1   0  
1   0   0   0  
2   0   0   1  
3   0   0   0  
4   1   0   0  

[5 rows x 100 columns]


### Word2Vec

In [247]:
from gensim.models import Word2Vec

In [248]:

list_text = [s.split() for s in stem_df["stemmed_corpus"]]
print(list_text[0])

['one', 'review', 'mention', 'watch', 'oz', 'episod', 'hook', 'right', 'exactli', 'happen', 'br', 'br', 'first', 'thing', 'struck', 'oz', 'brutal', 'unflinch', 'scene', 'violenc', 'set', 'right', 'word', 'go', 'trust', 'show', 'faint', 'heart', 'timid', 'show', 'pull', 'punch', 'regard', 'drug', 'sex', 'violenc', 'hardcor', 'classic', 'use', 'word', 'br', 'br', 'call', 'oz', 'nicknam', 'given', 'oswald', 'maximum', 'secur', 'state', 'penitentari', 'focus', 'mainli', 'emerald', 'citi', 'experiment', 'section', 'prison', 'cell', 'glass', 'front', 'face', 'inward', 'privaci', 'high', 'agenda', 'em', 'citi', 'home', 'mani', 'aryan', 'muslim', 'gangsta', 'latino', 'christian', 'italian', 'irish', 'scuffl', 'death', 'stare', 'dodgi', 'deal', 'shadi', 'agreement', 'never', 'far', 'away', 'br', 'br', 'would', 'say', 'main', 'appeal', 'show', 'due', 'fact', 'goe', 'show', 'dare', 'forget', 'pretti', 'pictur', 'paint', 'mainstream', 'audienc', 'forget', 'charm', 'forget', 'romanc', 'oz', 'mess',

In [399]:
cbow = Word2Vec(list_text,vector_size=100, min_count=10, sg = 0)
print(cbow)

Word2Vec<vocab=19329, vector_size=100, alpha=0.025>


In [401]:
cbow.wv.key_to_index

{'br': 0,
 'movi': 1,
 'film': 2,
 'one': 3,
 'like': 4,
 'time': 5,
 'good': 6,
 'make': 7,
 'charact': 8,
 'see': 9,
 'get': 10,
 'watch': 11,
 'even': 12,
 'stori': 13,
 'would': 14,
 'realli': 15,
 'well': 16,
 'scene': 17,
 'look': 18,
 'show': 19,
 'much': 20,
 'end': 21,
 'bad': 22,
 'great': 23,
 'peopl': 24,
 'go': 25,
 'love': 26,
 'also': 27,
 'first': 28,
 'think': 29,
 'act': 30,
 'play': 31,
 'way': 32,
 'thing': 33,
 'made': 34,
 'could': 35,
 'know': 36,
 'say': 37,
 'seem': 38,
 'work': 39,
 'plot': 40,
 'actor': 41,
 'two': 42,
 'mani': 43,
 'seen': 44,
 'come': 45,
 'year': 46,
 'want': 47,
 'take': 48,
 'never': 49,
 'life': 50,
 'best': 51,
 'tri': 52,
 'littl': 53,
 'ever': 54,
 'man': 55,
 'better': 56,
 'give': 57,
 'still': 58,
 'find': 59,
 'perform': 60,
 'feel': 61,
 'part': 62,
 'use': 63,
 'someth': 64,
 'director': 65,
 'actual': 66,
 'back': 67,
 'lot': 68,
 'interest': 69,
 'real': 70,
 'guy': 71,
 'old': 72,
 'funni': 73,
 'cast': 74,
 'though': 75,
 '

In [402]:
cbow.wv.index_to_key[:10]

['br', 'movi', 'film', 'one', 'like', 'time', 'good', 'make', 'charact', 'see']

In [403]:
print(cbow.wv.similarity("make","build"))

0.12050498


In [404]:
print(cbow.wv.similar_by_key("good"))

[('decent', 0.7544897198677063), ('great', 0.7220576405525208), ('bad', 0.702559769153595), ('alright', 0.6370170712471008), ('okay', 0.6304665207862854), ('ok', 0.6071308851242065), ('nice', 0.5829575657844543), ('darn', 0.5818901658058167), ('cool', 0.5749438405036926), ('excel', 0.5693867802619934)]


In [276]:
len(cbow.wv.index_to_key)

19329

In [277]:
me = np.array(cbow.wv.get_vector("one"))
me.mean(axis = 0)

0.080357544

In [279]:
def document_vector(doc):
    """Create document vectors by averaging word vectors. Remove out-of-vocabulary words."""
    
    # doc1 contains those words of the document which are included in the vocab
    doc1 = [word for word in doc.split() if word in cbow.wv.index_to_key]
    
    wv1 = []  # this will contain the WE of all the vocab words from the doc
    for word in doc1:
        wv1.append(cbow.wv.get_vector(word))
    wv1_ = np.array(wv1)
    wv1_mean = wv1_.mean(axis=0)
    return wv1_mean

# np.mean(model[doc], axis=0)

In [280]:
temp_review = stem_df["stemmed_corpus"].apply(document_vector)

In [281]:
temp_review.shape

(50000,)

In [283]:
(np.ones((len(temp_review), 300))*np.nan).shape[0]

50000

In [286]:

# Combining all the document vectors into a singl numpy array (tweets_vec)
embedding_size = 100
review_vec = np.ones((len(temp_review), embedding_size))*np.nan
for i in range(review_vec.shape[0]):
    review_vec[i,:] = temp_review.iloc[i]

review_vec.shape # this itself is your final FEATURE MATRIX

(50000, 100)

In [287]:
review_vec[0,:].mean()

0.005928780431859195

In [288]:

 
# Create a new DF to store these new documnent features
vec_df = pd.DataFrame(review_vec)
vec_df['y'] = label
vec_df.dropna(how='any', axis=0, inplace=True)

In [289]:
vec_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,y
0,0.511467,0.176962,-0.1133,0.320372,-0.216529,-0.156066,0.435461,-0.119201,-0.264528,0.192353,...,-0.167461,-0.082759,0.082263,0.126074,-0.100811,0.050716,0.077237,0.210537,0.006237,1
1,0.399278,0.047006,0.052484,0.279662,0.026666,0.155836,0.251642,-0.128293,-0.34446,-0.034714,...,0.079432,-0.112114,-0.160047,0.017236,0.310706,0.120169,0.061814,-0.232976,0.217732,1
2,0.361898,0.03834,-0.284491,0.358288,0.006961,-0.175093,0.347266,-0.297755,-0.558613,-0.015828,...,0.129093,-0.14091,-0.151156,0.145863,-0.13923,0.067921,0.185767,0.114328,0.349683,1
3,0.481012,0.009386,-0.358875,0.300214,-0.440127,-0.212287,0.61687,-0.241234,-0.464183,0.2806,...,0.199201,-0.083379,-0.252036,-0.132226,0.075301,0.072504,-0.029489,0.219512,0.174011,0
4,0.661942,0.012906,-0.420661,0.227609,-0.079016,-0.120627,0.232194,-0.321117,-0.575652,0.069736,...,0.087338,-0.171173,0.052766,0.088402,0.186092,0.055097,0.132362,-0.241924,0.066894,1


In [292]:
vec_df.shape

(50000, 101)

In [359]:
X_word_emb = vec_df.drop('y', axis=1)
y = vec_df['y']
X_word_emb.shape
X_word_emb.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.511467,0.176962,-0.1133,0.320372,-0.216529,-0.156066,0.435461,-0.119201,-0.264528,0.192353,...,-0.292195,-0.167461,-0.082759,0.082263,0.126074,-0.100811,0.050716,0.077237,0.210537,0.006237
1,0.399278,0.047006,0.052484,0.279662,0.026666,0.155836,0.251642,-0.128293,-0.34446,-0.034714,...,-0.012104,0.079432,-0.112114,-0.160047,0.017236,0.310706,0.120169,0.061814,-0.232976,0.217732
2,0.361898,0.03834,-0.284491,0.358288,0.006961,-0.175093,0.347266,-0.297755,-0.558613,-0.015828,...,-0.209395,0.129093,-0.14091,-0.151156,0.145863,-0.13923,0.067921,0.185767,0.114328,0.349683
3,0.481012,0.009386,-0.358875,0.300214,-0.440127,-0.212287,0.61687,-0.241234,-0.464183,0.2806,...,-0.609337,0.199201,-0.083379,-0.252036,-0.132226,0.075301,0.072504,-0.029489,0.219512,0.174011
4,0.661942,0.012906,-0.420661,0.227609,-0.079016,-0.120627,0.232194,-0.321117,-0.575652,0.069736,...,-0.217747,0.087338,-0.171173,0.052766,0.088402,0.186092,0.055097,0.132362,-0.241924,0.066894


#### Pipeline

In [60]:

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

In [61]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
LR1 = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=0.4, random_state=42)
WE_pipe = Pipeline([('SC', StandardScaler()), ('LR', LR1)] )

results = cross_validate(WE_pipe, X_word_emb, y, cv=kfold, scoring='accuracy', return_train_score=True)

# print(results['train_score'])
print(np.round((results['train_score'].mean())*100, 2), np.round((results['train_score'].std())*100, 2)) 

# print(results['test_score'])
print(np.round((results['test_score'].mean())*100, 2), np.round((results['test_score'].std())*100, 2)) 

80.09 0.07
79.77 0.2


### Spliting teh data into train and test sets

In [294]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [300]:
X_train, X_test, y_train, y_test = train_test_split(X_word_emb, y, test_size=0.25, random_state=123)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(37500, 100) (12500, 100) (37500,) (12500,)


### Logistic Regression

In [301]:
lr = LogisticRegression(max_iter=10)

In [302]:
lr.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [303]:

y_pred = lr.predict(X_test)
print(confusion_matrix(y_pred, y_test))

[[5341  818]
 [ 887 5454]]


In [304]:
print(accuracy_score(y_pred, y_test))

0.8636


In [305]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.86      0.87      0.86      6159
           1       0.87      0.86      0.86      6341

    accuracy                           0.86     12500
   macro avg       0.86      0.86      0.86     12500
weighted avg       0.86      0.86      0.86     12500



### Naive Bayes

In [367]:
nb = BernoulliNB()

In [368]:
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print(confusion_matrix(y_pred,y_test))

[[4887 1741]
 [1341 4531]]


In [369]:
print(accuracy_score(y_pred,y_test))

0.75344


In [370]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.78      0.74      0.76      6628
           1       0.72      0.77      0.75      5872

    accuracy                           0.75     12500
   macro avg       0.75      0.75      0.75     12500
weighted avg       0.76      0.75      0.75     12500



### Support Vector Machine


In [107]:
svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print(confusion_matrix(y_pred, y_test))

[[5408 1836]
 [2061 5695]]


In [108]:
print(accuracy_score(y_pred, y_test))

0.7402


In [109]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.72      0.75      0.74      7244
           1       0.76      0.73      0.75      7756

    accuracy                           0.74     15000
   macro avg       0.74      0.74      0.74     15000
weighted avg       0.74      0.74      0.74     15000



### Decision Tree

In [311]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train,y_train)
y_pred = dtc.predict(X_test)
print(confusion_matrix(y_pred, y_test))


[[4554 1742]
 [1674 4530]]


In [313]:
print(accuracy_score(y_pred, y_test))

0.72672


In [314]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.73      0.72      0.73      6296
           1       0.72      0.73      0.73      6204

    accuracy                           0.73     12500
   macro avg       0.73      0.73      0.73     12500
weighted avg       0.73      0.73      0.73     12500



### Random Forest

In [315]:
from sklearn.ensemble import RandomForestClassifier

In [316]:
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)
print(confusion_matrix(y_pred, y_test))

[[5087  922]
 [1141 5350]]


In [317]:
print(accuracy_score(y_pred, y_test))

0.83496


In [318]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.82      0.85      0.83      6009
           1       0.85      0.82      0.84      6491

    accuracy                           0.83     12500
   macro avg       0.83      0.84      0.83     12500
weighted avg       0.84      0.83      0.84     12500



### CNN

In [None]:
# Create a weight matrix for the embedding layer
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

In [None]:


# Define the CNN model
model = Sequential()
model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

### Unseen data Prediction

#### Unseen data processing for model

In [435]:
rw = "good movie"
    

In [436]:
corpus1 = []
review = word_tokenize(rw)
    # print(review)
    # print()
review = [w.lower() for w in review if w.isalpha()]
    # print(review)
    # print()
review = [ps.stem(w) for w in review if w not in stopwords.words('english')]
    # print(review)
    # print()
# review = " ".join(review)
# corpus1.append(review)
print(review)


['good', 'movi']


In [437]:
corpus1

[]

In [446]:
def vec_voc(sen):
    embd = [cbow.wv[c] for c in sen if c in cbow.wv.index_to_key]
    return sum(embd)/len(embd) if embd else None


voc_vec = vec_voc(review)
print(voc_vec.shape)
reshape_voc_vec = voc_vec.reshape(1,-1)
lr.predict(reshape_voc_vec)
# reshape_voc_vec.shape



(100,)


array([1])

In [393]:
v = cbow.transform(corpus1)
cbow.get_feature_names_out()
cbow.vocabulary_

AttributeError: 'Word2Vec' object has no attribute 'transform'

In [392]:
arr = v.toarray()
arr

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

#### unseen preprocessed data for applying on the model

##### Logistic regression

In [447]:
lr.predict(reshape_voc_vec)

array([1])

##### Naive bayes

In [448]:
nb.predict(reshape_voc_vec)

array([0])

##### SVM

In [346]:
svm.predict(arr)

array([0])

##### Decision Tree

In [449]:
dtc.predict(reshape_voc_vec)

array([0])

##### Random Forest

In [450]:
rfc.predict(reshape_voc_vec)

array([1])