Impoting Required libraries

In [2]:
import os
import json
import zipfile
import pandas as pd

import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


Forming a python Dictonary to store username and key

In [8]:
kaggle_dict = json.load(open('/content/kaggle.json'))

Fetching username and key From Dictionary

In [9]:
os.environ['KAGGLE_USERNAME'] = kaggle_dict['username']
os.environ['KAGGLE_KEY'] = kaggle_dict['key']

Downloading imdb-dataset-of-50k-movie-reviews dataset from kaggle

In [10]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 97% 25.0M/25.7M [00:02<00:00, 22.0MB/s]
100% 25.7M/25.7M [00:02<00:00, 12.2MB/s]


Unzip the zipped file

In [11]:
zip_ref = zipfile.ZipFile('/content/imdb-dataset-of-50k-movie-reviews.zip')
zip_ref.extractall()

Data Loading

In [12]:
data = pd.read_csv('/content/IMDB Dataset.csv')

In [13]:
data.sample(5)

Unnamed: 0,review,sentiment
41466,TV movies generally do not receive as much rec...,positive
36507,Klatret©ªsen(Catch That Girl) is really great ...,positive
27234,Jon Good's Wife (simply one of the worst title...,negative
42761,The Falcon and the Snowman is based on a true ...,positive
46464,"Good horror movies from France are quite rare,...",positive


Data cleaning and Preprocessing

In [14]:
data.replace({'sentiment':{'negative':0,'positive':1}},inplace=True)

In [15]:
data['sentiment'].value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB


In [17]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [18]:
data.duplicated().sum()

418

In [19]:
data_cleaned = data.drop_duplicates()


In [20]:
data_cleaned.to_csv('cleaned_dataset.csv', index=False)


In [21]:
data = pd.read_csv('/content/cleaned_dataset.csv')

In [22]:
data.sample(5)

Unnamed: 0,review,sentiment
13451,Charles McDougall's resume includes directing ...,1
46166,RUN...do not walk away from this movie!!!!! Ai...,0
20205,I have to agree with most of the other posts. ...,1
2448,"Like most other people, I saw this movie on ""M...",0
31518,"Great film, a very worthy 7/10.<br /><br />Tom...",1


In [23]:
data.shape

(49582, 2)

In [24]:
data['sentiment'].value_counts()

sentiment
1    24884
0    24698
Name: count, dtype: int64

In [25]:
data.duplicated().sum()

0

Splitting Data into Train and Test Data

In [26]:
train_data,test_data = train_test_split(data,test_size=0.2,random_state=42)

Data Tokenization and applying Zero Padding

In [27]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['review'])
x_train =pad_sequences(tokenizer.texts_to_sequences(train_data['review']),maxlen=200)
x_test =pad_sequences(tokenizer.texts_to_sequences(test_data['review']),maxlen=200)

In [28]:
y_train = train_data['sentiment']
y_test = test_data['sentiment']

Model Building

In [29]:
model = Sequential()
model.add(Embedding(input_dim=5000,output_dim=128,input_length=200))
model.add(LSTM(128,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(1,activation='sigmoid'))
model.build(input_shape=(None, 200))
model.summary()




Model Compilation

In [30]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

Training the Model

In [31]:
from keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss',mode='min',verbose=1,patience=2)
model.fit(x_train,y_train,validation_split=0.2,epochs=10,batch_size=64,callbacks=[early_stop])

Epoch 1/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 376ms/step - accuracy: 0.7218 - loss: 0.5300 - val_accuracy: 0.8364 - val_loss: 0.3924
Epoch 2/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 378ms/step - accuracy: 0.8377 - loss: 0.3753 - val_accuracy: 0.8300 - val_loss: 0.3981
Epoch 3/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 377ms/step - accuracy: 0.8517 - loss: 0.3529 - val_accuracy: 0.8690 - val_loss: 0.3213
Epoch 4/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 377ms/step - accuracy: 0.8962 - loss: 0.2559 - val_accuracy: 0.8755 - val_loss: 0.2958
Epoch 5/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 368ms/step - accuracy: 0.9086 - loss: 0.2274 - val_accuracy: 0.8835 - val_loss: 0.3022
Epoch 6/10
[1m496/496[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 373ms/step - accuracy: 0.9271 - loss: 0.1842 - val_accuracy: 0.8830 - val_loss: 0.3008
Epoc

<keras.src.callbacks.history.History at 0x7b3d58b8b640>

Evaluatiing The Model

In [32]:
loss,accuracy = model.evaluate(x_test,y_test)

[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 92ms/step - accuracy: 0.8847 - loss: 0.2947


Saving the model in .h5 format

In [34]:
model.save('sentiment_model.h5')



Saving the tokenizer

In [35]:
import pickle
with open('token_1.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)