<a href="https://colab.research.google.com/github/looohaar/Sentiment-Analysis-using-SimpleRNN/blob/main/Movie_Review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Movie Review Sentiment Analysis Using RNN, Part 1: Model building**#





In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

In [3]:
# Load the imdb dataset

max_features=10000 ## Selects the top 10000 most occuring words from the entire dataset. This is also called as the vocabulary size.
(X_train, Y_train), (X_test, Y_test) = imdb.load_data(num_words = max_features)

# Print the shape of the data
print('Training data shape :', {X_train.shape}, 'Training labels shape :', {Y_train.shape})
print('Testing data shape :', {X_test.shape}, 'Testing labels shape :', {Y_test.shape})


Training data shape : {(25000,)} Training labels shape : {(25000,)}
Testing data shape : {(25000,)} Testing labels shape : {(25000,)}


In [5]:
# Integer representation of first sentance from training data.
# These integers are the encoded integer ranks(based of frequency) of the words.
X_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 5535,
 18,

In [7]:
# Output label of the first sentance from the training data
Y_train[0]

1

In [9]:
# Inspecting a sample input and output
sample_review = X_train[0]
sample_label = Y_train[0]

print('Sample Review (as integers) : ', sample_review)
print('Sample Label : ', sample_label)

Sample Review (as integers) :  [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
Sample Label :  1


## **Not a part of the main project but for a better understanding**

In [11]:
# Mapping of word index back to words (This step is not the part of prject, but for a better understanding)
word_index = imdb.get_word_index()
# word_index

reverse_word_index = {value: key for key, value in word_index.items()}
reverse_word_index

{34701: 'fawn',
 52006: 'tsukino',
 52007: 'nunnery',
 16816: 'sonja',
 63951: 'vani',
 1408: 'woods',
 16115: 'spiders',
 2345: 'hanging',
 2289: 'woody',
 52008: 'trawling',
 52009: "hold's",
 11307: 'comically',
 40830: 'localized',
 30568: 'disobeying',
 52010: "'royale",
 40831: "harpo's",
 52011: 'canet',
 19313: 'aileen',
 52012: 'acurately',
 52013: "diplomat's",
 25242: 'rickman',
 6746: 'arranged',
 52014: 'rumbustious',
 52015: 'familiarness',
 52016: "spider'",
 68804: 'hahahah',
 52017: "wood'",
 40833: 'transvestism',
 34702: "hangin'",
 2338: 'bringing',
 40834: 'seamier',
 34703: 'wooded',
 52018: 'bravora',
 16817: 'grueling',
 1636: 'wooden',
 16818: 'wednesday',
 52019: "'prix",
 34704: 'altagracia',
 52020: 'circuitry',
 11585: 'crotch',
 57766: 'busybody',
 52021: "tart'n'tangy",
 14129: 'burgade',
 52023: 'thrace',
 11038: "tom's",
 52025: 'snuggles',
 29114: 'francesco',
 52027: 'complainers',
 52125: 'templarios',
 40835: '272',
 52028: '273',
 52130: 'zaniacs',

In [12]:
decoded_review =  ' '.join([reverse_word_index.get(i-3, '?') for i in sample_review])
decoded_review

"? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you th

## **Back to the Project**

In [13]:
#  Each review will be truncated to 500 words or padded to ensure a uniform length of 500 words.
# It ensures that all reviews (which are represented as sequences of integers) have the same length, making them suitable for batch processing in a neural network
max_len = 500

X_train = sequence.pad_sequences(X_train, maxlen = max_len)
X_test = sequence.pad_sequences(X_test, maxlen = max_len)

In [15]:
X_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [17]:
# Training a Simple RNN

# Initializes a Sequential model, which is a linear stack of layers
model = Sequential()

# Add an Embedding Layer
# This is the Embedding layer and is responsible for taking all the words in the vocabulary(ie, max_features) into vectors of 128 dimensions. Here we use dimension as 128, but you can use any.
# Each word will be represented by a 128-dimensional vector (array of 128 elements).
# These 128 elements holds float value of different words from the vocabulary  which may containt the same of relatable meaning.
model.add(Embedding(max_features, 128, input_length = max_len))

# Add a Simple RNN Layer
# RNN layers are used for processing sequences of data, such as time series or text, where the order of data points matters.
# It captures the temporal dynamics of the sequence data.
# 128 is the number of RNN units in the layer. Each unit output a 128-dimensional vector.
# There is no need to keeping the no. of dimensions and no. of units same. We randomly assigned this value.
model.add(SimpleRNN(128, activation = 'relu'))

# Add a Dense Layer
# The Dense layer with a sigmoid activation function s often used as the final layer or output layer in a binary classification task.
# This layer is assigned with 1 neuron as we are dealing with a binary classification.
model.add(Dense(1, activation = 'sigmoid'))


In [19]:
model.summary()
# Param refers to the total weights and bias in each layer.
# Note that Embedding layer will cotains only weights.

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 128)          1280000   
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 128)               32896     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1313025 (5.01 MB)
Trainable params: 1313025 (5.01 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
# Configure the learning process of the model

# The optimizer controls how the model is updated based on the computed gradients.
# 'adam' is a popular and efficient optimizer that adapts the learning rate during training.

# Loss Function (loss='binary_crossentropy')
# 'binary_crossentropy' is typically used for binary classification tasks.

# Metrics (metrics=['accuracy'])
# Metrics are used to monitor the performance of the model. 'accuracy' is a common metric used in classification problems to measure the proportion of correctly classified samples

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

# This model.compile() function prepares the model for training by specifying these key aspects of the learning process. After compiling, the model is ready to be trained using model.fit() with appropriate training data and labels.


In [22]:
# Create an instance of EarlyStopping Callback

from tensorflow.keras.callbacks import EarlyStopping

# Early stopping helps prevent overfitting by stopping training when the model's performance on a validation set starts to degrade.

# monitor='val_loss': Specifies the quantity to monitor for improvement. In this case, it's monitoring the validation loss (val_loss). Training will stop when the validation loss stops improving

# patience=5: Number of epochs with no improvement after which training will be stopped. In this example, training will stop after 5 epochs if val_loss does not improve.

# restore_best_weights=True: Whether to restore the model weights from the epoch with the best value of the monitored quantity (in this case, val_loss). This ensures that the model returned to its best state when training stops, rather than using the final state.

earlystopping = EarlyStopping(monitor = 'val_loss', patience = 5, restore_best_weights = True)
earlystopping

<keras.src.callbacks.EarlyStopping at 0x7c3f06d71870>

In [23]:
# epochs=10: Specifies the number of epochs (iterations over the entire training dataset) to train the model
# batch_size=32: Determines the number of samples processed before the model's internal parameters (weights) are updated. Here, 32 samples will be processed together before updating the model.
# validation_split=0.2: Specifies that 20% of the training data (X_train and y_train) will be used as validation data.


history = model.fit(
    X_train, Y_train,
    epochs = 10,
    batch_size = 32,
    validation_split = 0.2,
    callbacks = [earlystopping]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


**Training Metrics** (loss and accuracy): These metrics (loss and accuracy) reflect how well the model is learning from the training data during each epoch.


**Validation Metrics** (val_loss and val_accuracy): These metrics (val_loss and val_accuracy) evaluate the model's performance on unseen validation data. They help assess how well the model generalizes to new data.

In [24]:
# Save model file
model.save('movie_review_rnn.h5')

  saving_api.save_model(


# **Part 2: Predictions**

In [25]:
from tensorflow.keras.models import load_model


In [26]:
# Load the IMDB dataset word index
word_index = imdb.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}
reverse_word_index

{34701: 'fawn',
 52006: 'tsukino',
 52007: 'nunnery',
 16816: 'sonja',
 63951: 'vani',
 1408: 'woods',
 16115: 'spiders',
 2345: 'hanging',
 2289: 'woody',
 52008: 'trawling',
 52009: "hold's",
 11307: 'comically',
 40830: 'localized',
 30568: 'disobeying',
 52010: "'royale",
 40831: "harpo's",
 52011: 'canet',
 19313: 'aileen',
 52012: 'acurately',
 52013: "diplomat's",
 25242: 'rickman',
 6746: 'arranged',
 52014: 'rumbustious',
 52015: 'familiarness',
 52016: "spider'",
 68804: 'hahahah',
 52017: "wood'",
 40833: 'transvestism',
 34702: "hangin'",
 2338: 'bringing',
 40834: 'seamier',
 34703: 'wooded',
 52018: 'bravora',
 16817: 'grueling',
 1636: 'wooden',
 16818: 'wednesday',
 52019: "'prix",
 34704: 'altagracia',
 52020: 'circuitry',
 11585: 'crotch',
 57766: 'busybody',
 52021: "tart'n'tangy",
 14129: 'burgade',
 52023: 'thrace',
 11038: "tom's",
 52025: 'snuggles',
 29114: 'francesco',
 52027: 'complainers',
 52125: 'templarios',
 40835: '272',
 52028: '273',
 52130: 'zaniacs',

In [28]:
# Load the pre-trained model with ReLU activation
model = load_model('/content/movie_review_rnn.h5')
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 128)          1280000   
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 128)               32896     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1313025 (5.01 MB)
Trainable params: 1313025 (5.01 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [29]:
model.get_weights()

[array([[-0.05136453, -0.06409454, -0.06883811, ..., -0.00999966,
          0.02564842,  0.00115249],
        [-0.02005166, -0.03682107, -0.0514275 , ...,  0.01696279,
         -0.00165747,  0.02143122],
        [ 0.01098711,  0.02994527,  0.00759724, ..., -0.03759082,
          0.06852321,  0.00342441],
        ...,
        [ 0.08878981,  0.02865462,  0.01015122, ...,  0.0213025 ,
         -0.01912814,  0.0631443 ],
        [-0.0252104 ,  0.06665897,  0.05543051, ..., -0.04282226,
         -0.00161535, -0.02916675],
        [ 0.08089982,  0.15826869,  0.19161074, ..., -0.04466639,
          0.03560567,  0.07815729]], dtype=float32),
 array([[-0.02173153,  0.11470386, -0.09498531, ..., -0.051191  ,
         -0.11966562,  0.12254824],
        [-0.05373498,  0.08900211, -0.1007752 , ...,  0.16416097,
          0.07295176,  0.09355035],
        [-0.04093937, -0.1418858 ,  0.09705372, ...,  0.00248136,
          0.03620081,  0.06647537],
        ...,
        [-0.0374964 , -0.00901242, -0.1

In [55]:
# Define helper Functions

# Function to decode reviews
def decode_reviews(encoded_review):
  return' '.join([reverse_word_index.get(i-3, '?') for i in  encoded_review])

# Function to Preprocess user input
def preprocess_user_input(user_input):
  words = user_input.lower().split()
  encoded_review = [word_index.get(word, 2) + 3 for word in words]
  encoded_review = [index if index < 10003 else 2 for index in encoded_review]  # Replace unknown words with the index for 'unknown'
  padded_review = sequence.pad_sequences([encoded_review], maxlen = 500)
  return padded_review

In [56]:
# Function to Predict the result
def predict_sentiment(review):
  preprocessed_input = preprocess_user_input(review)
  prediction = model.predict(preprocessed_input)
  sentiment = 'Positive' if prediction[0][0] > 0.5 else 'Negative'

  return sentiment, prediction[0][0]



In [71]:
# User Input and Prediction

review= 'terrific'
sentiment, score = predict_sentiment(review)

print('Review : ', review)
print('Sentiment : ', sentiment)
print('Prediction Score : ', score)


Review :  terrific
Sentiment :  Positive
Prediction Score :  0.6680916
