# 1. Data Loading and Preprocessing
- The CSV data is read into a pandas DataFrame

- The code inspects the first few rows and the distribution of sentiment labels ("positive" and "negative").
- It then converts the "positive" and "negative" sentiment labels into numerical representations (1 and 0) using binary label encoding (a direct mapping of each label to an integer, not one-hot encoding).
- The distribution of the numerical sentiment labels is checked.

In [2]:
import pandas as pd
import numpy as np

In [3]:
from google.colab import files

# Prompt for a file upload through Colab's helper; the cell output reports
# that "IMDB Dataset.csv" was saved into the working directory.
uploaded = files.upload()

Saving IMDB Dataset.csv to IMDB Dataset.csv


In [4]:
# Load the uploaded IMDB reviews CSV (columns: "review", "sentiment") into a DataFrame.
data = pd.read_csv("IMDB Dataset.csv")

In [5]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
data["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [7]:
## binary label encoding (a plain label -> integer mapping, not one-hot)
# positive -> 1
# negative -> 0
# Series.map is used instead of DataFrame.replace(..., inplace=True): replace
# silently downcasts the object column to integers, which pandas reports with a
# FutureWarning. The identity entries (1 -> 1, 0 -> 0) keep the cell safe to
# re-run after the labels have already been converted.
data["sentiment"] = data["sentiment"].map(
    {"positive": 1, "negative": 0, 1: 1, 0: 0}
)

  data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)


In [8]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [9]:
data["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,25000
0,25000


# 2. Data Splitting and Tokenization

- The data is split into training and testing sets, and the text data is converted into a numerical format that the neural network can understand.
- It imports modules from sklearn.model_selection for splitting data and tensorflow.keras for building the neural network.
- The dataset is split into training and testing sets using train_test_split. 80% of the data is used for training, and 20% for testing.
- Padding ensures that all the input sequences (the numerical representations of the movie reviews) have the same fixed length (200 tokens here) before being fed into the LSTM neural network.

In [10]:
# LSTM -> LONG SHORT TERM MEMORY

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
# Hold out 20% of the rows as a test set (40,000 train / 10,000 test);
# the fixed random_state makes the split reproducible across runs.
train_data, test_data = train_test_split(data, test_size = 0.2, random_state=42)

In [12]:
train_data.shape

(40000, 2)

In [13]:
test_data.shape

(10000, 2)

In [14]:
# Limit the vocabulary to 5,000 word indices (this must match the Embedding
# layer's input_dim). The word index is fitted on the training reviews only,
# so no vocabulary information from the test set is used.
tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(train_data["review"])

In [15]:
# Turn each review into a sequence of word indices, then force every sequence
# to exactly 200 entries: shorter reviews are zero-padded, longer ones truncated.
# The same fitted tokenizer and the same maxlen are applied to both splits.
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=200)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=200)

In [16]:
X_train

array([[1935,    1, 1200, ...,  205,  351, 3856],
       [   3, 1651,  595, ...,   89,  103,    9],
       [   0,    0,    0, ...,    2,  710,   62],
       ...,
       [   0,    0,    0, ..., 1641,    2,  603],
       [   0,    0,    0, ...,  245,  103,  125],
       [   0,    0,    0, ...,   70,   73, 2062]], dtype=int32)

In [17]:
X_test

array([[   0,    0,    0, ...,  995,  719,  155],
       [  12,  162,   59, ...,  380,    7,    7],
       [   0,    0,    0, ...,   50, 1088,   96],
       ...,
       [   0,    0,    0, ...,  125,  200, 3241],
       [   0,    0,    0, ..., 1066,    1, 2305],
       [   0,    0,    0, ...,    1,  332,   27]], dtype=int32)

In [18]:
# Targets: the 0/1 sentiment column of each split, in the same row order as
# the reviews that produced X_train / X_test.
Y_train = train_data["sentiment"]
Y_test = test_data["sentiment"]

In [19]:
Y_train

Unnamed: 0,sentiment
39087,0
30893,0
45278,1
16398,0
13653,0
...,...
11284,1
44732,1
38158,0
860,1


# 3. Model Building and Training

- A sequential LSTM model is built, compiled, and trained on the prepared data.
- The model is compiled with the "adam" optimizer, "binary_crossentropy" as the loss function (suitable for binary classification), and "accuracy" as the evaluation metric.
- The model is trained using model.fit() on the training data (X_train, Y_train). It trains for 5 epochs, using a batch size of 64, and 20% of the training data is set aside for validation during training.



In [20]:
# Binary sentiment classifier: Embedding -> LSTM -> single sigmoid unit.
model = Sequential()
# input_dim matches Tokenizer(num_words=5000). The `input_length` argument is
# deprecated in Keras 3 (it is ignored and only raises a warning), so it is
# omitted; the sequence length of 200 is supplied by the explicit
# model.build(input_shape=(None, 200)) call instead.
model.add(Embedding(input_dim=5000, output_dim=128))
# dropout applies to the layer inputs, recurrent_dropout to the recurrent state.
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# One sigmoid output: the predicted probability that the review is positive (label 1).
model.add(Dense(1, activation="sigmoid"))



In [21]:
# Build the model for inputs of shape (batch, 200) -- 200 is the padded review
# length -- so that summary() can report layer output shapes and parameter counts.
model.build(input_shape=(None, 200))
model.summary()

In [22]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked as the metric.
model.compile(optimizer = "adam", loss="binary_crossentropy", metrics=["accuracy"])

In [23]:
# Train for 5 epochs in mini-batches of 64; 20% of the training rows are held
# out for validation, leaving 32,000 rows (500 steps per epoch) for fitting.
model.fit(X_train, Y_train, epochs = 5, batch_size = 64, validation_split = 0.2)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 376ms/step - accuracy: 0.7388 - loss: 0.5133 - val_accuracy: 0.8491 - val_loss: 0.3613
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m216s[0m 411ms/step - accuracy: 0.8548 - loss: 0.3497 - val_accuracy: 0.8395 - val_loss: 0.3638
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 379ms/step - accuracy: 0.8793 - loss: 0.2996 - val_accuracy: 0.8694 - val_loss: 0.3278
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 378ms/step - accuracy: 0.8828 - loss: 0.2856 - val_accuracy: 0.8363 - val_loss: 0.3901
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 377ms/step - accuracy: 0.9061 - loss: 0.2359 - val_accuracy: 0.8771 - val_loss: 0.3183


<keras.src.callbacks.history.History at 0x7e1dfc838cd0>

# 4. Model Evaluation and Saving:

- The trained model is evaluated on the test data, and the model and tokenizer are saved.
- The trained model is evaluated on the test data (X_test, Y_test) using model.evaluate() to calculate the loss and accuracy.



In [24]:
# Persist the trained model to disk.
# NOTE(review): the ".h5" suffix selects the legacy HDF5 format; recent Keras
# releases recommend the native ".keras" format. Confirm what downstream
# loaders expect before renaming the file.
model.save("model.h5")



In [25]:
import joblib
# Save the fitted tokenizer next to the model: inference must reuse the exact
# same word index that the model was trained with.
joblib.dump(tokenizer, "tokenizer.pkl")

['tokenizer.pkl']

In [26]:
# Measure binary cross-entropy loss and accuracy on the held-out test split.
loss, accuracy = model.evaluate(X_test, Y_test)

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 105ms/step - accuracy: 0.8792 - loss: 0.3089


In [27]:
print(loss, accuracy)

0.30450233817100525 0.8834999799728394


# 5. Inference
- A function is defined to perform sentiment prediction on new, unseen reviews using the trained model




In [29]:
def inference(review: str, *, threshold: float = 0.5) -> str:
    """Classify a single review as ``"positive"`` or ``"negative"``.

    The review is pre-processed exactly like the training data: it is
    tokenized with the notebook-level fitted ``tokenizer`` and padded or
    truncated to 200 tokens before being passed to the trained ``model``.

    Args:
        review: Raw text of one review.
        threshold: Decision cut-off applied to the model's sigmoid output.
            Scores strictly greater than this value are labelled
            ``"positive"``. Defaults to 0.5.

    Returns:
        ``"positive"`` if the predicted score exceeds ``threshold``,
        otherwise ``"negative"``.
    """
    # texts_to_sequences expects a list of texts, so wrap the single review.
    sequences = tokenizer.texts_to_sequences([review])
    # maxlen must stay equal to the length used when building X_train (200).
    padded_sequence = pad_sequences(sequences, maxlen=200)
    prediction = model.predict(padded_sequence)
    # predict returns one row per input with one sigmoid unit: take the scalar.
    score = prediction[0][0]
    return "positive" if score > threshold else "negative"

In [30]:
inference("This movie did not meet my expectations")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 441ms/step


'negative'

In [31]:
inference("I loved this movie!")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step


'positive'