In [28]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding,SimpleRNN,Bidirectional, LSTM,GRU

In [2]:
df = pd.read_excel('moviereviews.xlsx')

In [3]:
df

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...
...,...,...
1995,pos,"i like movies with albert brooks , and i reall..."
1996,pos,it might surprise some to know that joel and e...
1997,pos,the verdict : spine-chilling drama from horror...
1998,pos,i want to correct what i wrote in a former ret...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   2000 non-null   object
 1   review  1938 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [5]:
df.isnull().sum()

label      0
review    62
dtype: int64

In [6]:
df.dropna(inplace= True)

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['label'] = le.fit_transform(df['label'])

In [8]:
x = df['review']
y = df['label']

In [9]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size= 0.3, random_state= 1)

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
tok = Tokenizer()
tok.fit_on_texts(xtrain)

In [11]:
vocab_len = len(tok.index_word)
vocab_len

36657

In [12]:
train_sequence = tok.texts_to_sequences(xtrain)
test_sequence = tok.texts_to_sequences(xtest)

In [13]:
doc_len = []
for doc in train_sequence:
    doc_len.append(len(doc))

np.quantile(doc_len, 0.99)

np.float64(1566.9)

In [14]:
max_len = 1566

In [15]:
train_matrix = pad_sequences(train_sequence, maxlen = max_len)
test_matrix = pad_sequences(test_sequence, maxlen = max_len)

# **RNN MODEL**

In [16]:
model = Sequential()
model.add(Embedding(vocab_len+1, 70, input_length = max_len, mask_zero= True))
model.add(SimpleRNN(64))
model.add(Dense(64,activation= 'tanh'))
model.add(Dense(1,activation= 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy')
model.fit(train_matrix, ytrain, epochs = 5, batch_size = 32)



Epoch 1/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 333ms/step - loss: 0.6978
Epoch 2/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 315ms/step - loss: 0.6616
Epoch 3/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 319ms/step - loss: 0.4950
Epoch 4/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 322ms/step - loss: 0.1011
Epoch 5/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 315ms/step - loss: 0.0107


<keras.src.callbacks.history.History at 0x20dc14dccd0>

In [17]:
y_pred = model.predict(test_matrix)
y_pred = np.where(y_pred >=0.5,1,0)
print(classification_report(ytest, y_pred))

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step
              precision    recall  f1-score   support

           0       0.51      0.49      0.50       293
           1       0.50      0.52      0.51       289

    accuracy                           0.50       582
   macro avg       0.50      0.50      0.50       582
weighted avg       0.50      0.50      0.50       582



# **Multi Layer RNN**

In [18]:
model = Sequential()
model.add(Embedding(vocab_len+1, 70, input_length = max_len, mask_zero= True))
model.add(SimpleRNN(64, return_sequences= True))
model.add(SimpleRNN(64, return_sequences= True))
model.add(SimpleRNN(64))
model.add(Dense(64,activation= 'tanh'))
model.add(Dense(1,activation= 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy')
model.fit(train_matrix, ytrain, epochs = 5, batch_size = 32)

Epoch 1/5




[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 1s/step - loss: 0.7307
Epoch 2/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1s/step - loss: 0.6995
Epoch 3/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1s/step - loss: 0.6880
Epoch 4/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1s/step - loss: 0.5934
Epoch 5/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1s/step - loss: 0.1831


<keras.src.callbacks.history.History at 0x20dc38956d0>

In [19]:
y_pred = model.predict(test_matrix)
y_pred = np.where(y_pred >=0.5,1,0)
print(classification_report(ytest, y_pred))

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 199ms/step
              precision    recall  f1-score   support

           0       0.50      0.51      0.50       293
           1       0.49      0.48      0.49       289

    accuracy                           0.49       582
   macro avg       0.49      0.49      0.49       582
weighted avg       0.49      0.49      0.49       582



# **Bi- Directional RNN**

In [26]:
model = Sequential()
model.add(Embedding(vocab_len+1, 70, input_length = max_len, mask_zero= True))
model.add(Bidirectional(SimpleRNN(64)))
model.add(Dense(64,activation= 'tanh'))
model.add(Dense(1,activation= 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy')
model.fit(train_matrix, ytrain, epochs = 5, batch_size = 32)

Epoch 1/5




[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 551ms/step - loss: 0.7091
Epoch 2/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 512ms/step - loss: 0.5859
Epoch 3/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 534ms/step - loss: 0.1827
Epoch 4/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 550ms/step - loss: 0.0242
Epoch 5/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 501ms/step - loss: 0.0019


<keras.src.callbacks.history.History at 0x20df4d3f370>

In [27]:
y_pred = model.predict(test_matrix)
y_pred = np.where(y_pred >=0.5,1,0)
print(classification_report(ytest, y_pred))

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 90ms/step
              precision    recall  f1-score   support

           0       0.53      0.47      0.50       293
           1       0.52      0.58      0.55       289

    accuracy                           0.52       582
   macro avg       0.52      0.52      0.52       582
weighted avg       0.52      0.52      0.52       582



# **LSTM**

In [23]:
model = Sequential()
model.add(Embedding(vocab_len+1, 70, input_length = max_len, mask_zero= True))
model.add(LSTM(64))
model.add(Dense(64,activation= 'tanh'))
model.add(Dense(1,activation= 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy')
model.fit(train_matrix, ytrain, epochs = 5, batch_size = 32)

Epoch 1/5




[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 533ms/step - loss: 0.6942
Epoch 2/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 473ms/step - loss: 0.6022
Epoch 3/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 473ms/step - loss: 0.1587
Epoch 4/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 468ms/step - loss: 0.0415
Epoch 5/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 476ms/step - loss: 0.0065


<keras.src.callbacks.history.History at 0x20df4ba85e0>

In [24]:
y_pred = model.predict(test_matrix)
y_pred = np.where(y_pred >=0.5,1,0)
print(classification_report(ytest, y_pred))

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 138ms/step
              precision    recall  f1-score   support

           0       0.69      0.77      0.73       293
           1       0.74      0.66      0.69       289

    accuracy                           0.71       582
   macro avg       0.72      0.71      0.71       582
weighted avg       0.72      0.71      0.71       582



# **GRU**

In [29]:
model = Sequential()
model.add(Embedding(vocab_len+1, 70, input_length = max_len, mask_zero= True))
model.add(GRU(64, return_sequences= True))
model.add(GRU(64, return_sequences= True))
model.add(GRU(64))
model.add(Dense(64,activation= 'tanh'))
model.add(Dense(1,activation= 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy')
model.fit(train_matrix, ytrain, epochs = 5, batch_size = 32)

Epoch 1/5




[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 2s/step - loss: 0.6934
Epoch 2/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 2s/step - loss: 0.5406
Epoch 3/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 2s/step - loss: 0.1081
Epoch 4/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 2s/step - loss: 0.0507
Epoch 5/5
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 2s/step - loss: 0.0223


<keras.src.callbacks.history.History at 0x20df4e03880>

In [30]:
y_pred = model.predict(test_matrix)
y_pred = np.where(y_pred >=0.5,1,0)
print(classification_report(ytest, y_pred))

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 375ms/step
              precision    recall  f1-score   support

           0       0.72      0.47      0.57       293
           1       0.60      0.82      0.70       289

    accuracy                           0.64       582
   macro avg       0.66      0.64      0.63       582
weighted avg       0.66      0.64      0.63       582

