In [39]:
# Loading the necessary packages
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.strings import regex_replace
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, RNN, SimpleRNNCell, GRU, LSTM, Bidirectional, Embedding, Dropout

# Mounting the drive
from google.colab import drive
drive.mount('/content/drive/')

# Restricting the float value of a dataframe to 3 decimal points
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Removing the limit for the number of displayed columns in a dataframe
pd.set_option("display.max_columns", None)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Data Preparation

In [None]:
# Checking the number of observations in training and test sets
print('No. of positive reviews in the training data:', 
      len(os.listdir('/content/drive/MyDrive/Python_files/movie_reviews_dataset/train/pos')))
print('No. of negative reviews in the training data:', 
      len(os.listdir('/content/drive/MyDrive/Python_files/movie_reviews_dataset/train/neg')))
print('No. of positive reviews in the test data:', 
      len(os.listdir('/content/drive/MyDrive/Python_files/movie_reviews_dataset/test/pos')))
print('No. of negative reviews in the test data:', 
      len(os.listdir('/content/drive/MyDrive/Python_files/movie_reviews_dataset/test/neg')))

No. of positive reviews in the training data: 12500
No. of negative reviews in the training data: 12500
No. of positive reviews in the test data: 12500
No. of negative reviews in the test data: 12500


In [None]:
# Defining a function to fetch the training and testing sets and their labels
def prepareData(dir):
  data = text_dataset_from_directory(dir, batch_size = 32)
  return data.map(
    lambda text, label: (regex_replace(text, '<br />', ' '), label),
  )

## text_dataset_from_directory fetches data from the input directory, put them into batches with the assigned size
## (default = 32), i.e., here, 64 text files together in each batch (group )and set the folders inside the main directoy
## as classes (labels). For instance, neg and pos, or, actually, their integer equivalents (0 and 1) become labels.

In [None]:
# Defining review classes (labels)
classes = {0: 'Negative', 1: 'Positive'}

# Preparaing training and test sets
train_data = prepareData('/content/drive/MyDrive/Python_files/movie_reviews_dataset/train')
test_data  = prepareData('/content/drive/MyDrive/Python_files/movie_reviews_dataset/test')

text_batch, label_batch = next(iter(train_data.take(1)))
print('First review of the training set:\n', text_batch.numpy()[0], sep = '')
print('Label of the review:', classes[label_batch.numpy()[0]])

## .take(count) fetches as many as count batches from dataset and using next(iter()) splits them into
## texts and labels. Here, count = 1, so only one batch is fetched. Note each batch includes 64 texts,
## so [] decides, which one should be printed. numpy() converts tf variables into numpy arrays. 

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.
First review of the training set:
b'I first encountered this show when I was staying in Japan for six months last year. I found it in the internet when I was looking for sub-titled dramas to help me with my Japanese. My host mother warned me to stay away from it because she thought it was weird, but I found it delightful! Koyuki showed such conflicting character traits and Matsujun\'s spirit made my day every time I tuned in! I first saw him on "Hana Yori Dango", but I liked him much better in this!  Although the characters are interesting and well-developed, I was disappointed to find that they didn\'t change very much throughout the show. Their relationship grew, but they didn\'t really. Still, a fun time had by all (Even for Fukushima!).'
Label of the review: Positive


In [None]:
# Shuffling batches of training data
train_set = test_data.shuffle(len(train_data)+1)
# Splitting the testing data into test and validation sets
test_data = test_data.shuffle(len(test_data)+1)
val_set   = test_data.shard(2, 0) 
test_set  = test_data.shard(2, 1) 

## RNN Models

Here, we build RNN models using five different architectures: Simple RNN, RNN/LSTM, RNN/GRU/ Bidirectional LSTM and Bidirectional GRU. At the end, we compare, the performance and predictions of all models to each other.

In [None]:
# Creating a text vectorization layer, converting the input text into numerical vectors
max_tokens = 1000  ## This tokenize only 1000 words and the rest will identified as OOV (out of vocabulary)
max_len = 100      ## maximum length the vector of the quantified texts
vectorize_layer = TextVectorization(
  max_tokens = max_tokens,
  output_mode = "int",
  output_sequence_length = max_len,
)

# Vectorization of texts 
train_texts = train_set.map(lambda text, label: text)
vectorize_layer.adapt(train_texts)   ## Training the text vectorizer by passing the training data to it

### Simple RNN

In [None]:
# Building the model
model_rnn = Sequential()
model_rnn.add(Input(shape = (1,), dtype = 'string'))

## adding built and trained text vectorizer to the model
model_rnn.add(vectorize_layer)  

## This layer receives the vectors of texts and fixes their sizes (at 128)
## max_tokens is the no. of all words in the vocabulary and 1 represents OOV
model_rnn.add(Embedding(max_tokens+1, 128))

# Adding the RNN layers and the last couple of FC layers
model_rnn.add(RNN(SimpleRNNCell(64), return_sequences = False, return_state = False))
model_rnn.add(Dense(64, activation = "relu"))
model_rnn.add(Dense(1, activation = "sigmoid"))

In [None]:
# Setting the model's callbacks 
best_model = tf.keras.callbacks.ModelCheckpoint("best_weights_rnn.tf", verbose = 1, 
                                                save_best_only = True, save_weights_only = True)
early_stop = tf.keras.callbacks.EarlyStopping(patience = 5, verbose = 1)

In [None]:
# Setting the performance metric and loss function of the model
model_rnn.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])

# Printing the summary of the model
print(model_rnn.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 100, 128)          128128    
                                                                 
 rnn (RNN)                   (None, 64)                12352     
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 144,705
Trainable params: 144,705
Non-trainable params: 0
__________________________________________________

In [None]:
# Fitting the model 
model_rnn.fit(train_set, validation_data = val_set, epochs = 10,
              callbacks = [best_model, early_stop])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.69004, saving model to best_weights_rnn.tf
Epoch 2/10
Epoch 2: val_loss improved from 0.69004 to 0.66825, saving model to best_weights_rnn.tf
Epoch 3/10
Epoch 3: val_loss improved from 0.66825 to 0.66746, saving model to best_weights_rnn.tf
Epoch 4/10
Epoch 4: val_loss improved from 0.66746 to 0.61466, saving model to best_weights_rnn.tf
Epoch 5/10
Epoch 5: val_loss did not improve from 0.61466
Epoch 6/10
Epoch 6: val_loss improved from 0.61466 to 0.57557, saving model to best_weights_rnn.tf
Epoch 7/10
Epoch 7: val_loss did not improve from 0.57557
Epoch 8/10
Epoch 8: val_loss did not improve from 0.57557
Epoch 9/10
Epoch 9: val_loss did not improve from 0.57557
Epoch 10/10
Epoch 10: val_loss did not improve from 0.57557


<keras.callbacks.History at 0x7f8220564880>

In [None]:
# Loading the best weights into the model
model_rnn.load_weights("best_weights_rnn.tf")

# Evaluating the model performance on the testing set
rnn_loss, rnn_acc = model_rnn.evaluate(test_set)
print('The accuracy of the RNN model is {}.'.format(round(rnn_acc, 3)))

The accuracy of the RNN model is 0.717.


### LSTM

In [None]:
# Building the model
model_lstm = Sequential()
model_lstm.add(Input(shape = (1,), dtype = 'string'))

## adding built and trained text vectorizer to the model
model_lstm.add(vectorize_layer)  

## This layer receives the vectors of texts and fixes their sizes (at 128)
## max_tokens is the no. of all words in the vocabulary and 1 represents OOV
model_lstm.add(Embedding(max_tokens+1, 128))

# Adding the RNN layers and the last couple of FC layers
model_lstm.add(LSTM(64))
model_lstm.add(Dense(64, activation = "relu"))
model_lstm.add(Dense(1, activation = "sigmoid"))

In [None]:
# Setting the model's callbacks 
best_model = tf.keras.callbacks.ModelCheckpoint("best_weights_lstm.tf", verbose = 1, 
                                                save_best_only = True, save_weights_only = True)
early_stop = tf.keras.callbacks.EarlyStopping(patience = 5, verbose = 1)

In [None]:
# Setting the performance metric and loss function of the model
model_lstm.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])

# Printing the summary of the model
print(model_lstm.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding_1 (Embedding)     (None, 100, 128)          128128    
                                                                 
 lstm (LSTM)                 (None, 64)                49408     
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 181,761
Trainable params: 181,761
Non-trainable params: 0
________________________________________________

In [None]:
# Fitting the model 
model_lstm.fit(train_set, validation_data = val_set, epochs = 10,
               callbacks = [best_model, early_stop])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.41121, saving model to best_weights_lstm.tf
Epoch 2/10
Epoch 2: val_loss improved from 0.41121 to 0.38212, saving model to best_weights_lstm.tf
Epoch 3/10
Epoch 3: val_loss improved from 0.38212 to 0.35275, saving model to best_weights_lstm.tf
Epoch 4/10
Epoch 4: val_loss improved from 0.35275 to 0.34992, saving model to best_weights_lstm.tf
Epoch 5/10
Epoch 5: val_loss improved from 0.34992 to 0.32306, saving model to best_weights_lstm.tf
Epoch 6/10
Epoch 6: val_loss improved from 0.32306 to 0.29149, saving model to best_weights_lstm.tf
Epoch 7/10
Epoch 7: val_loss improved from 0.29149 to 0.27330, saving model to best_weights_lstm.tf
Epoch 8/10
Epoch 8: val_loss improved from 0.27330 to 0.25442, saving model to best_weights_lstm.tf
Epoch 9/10
Epoch 9: val_loss improved from 0.25442 to 0.23559, saving model to best_weights_lstm.tf
Epoch 10/10
Epoch 10: val_loss improved from 0.23559 to 0.23175, saving model to best_weights_lstm.tf


<keras.callbacks.History at 0x7f821db37460>

In [None]:
# Loading the best weights into the model
model_lstm.load_weights("best_weights_lstm.tf")

# Evaluating the model performance on the testing set
lstm_loss, lstm_acc = model_lstm.evaluate(test_set)
print('The accuracy of the LSTM model is {}.'.format(round(lstm_acc, 3)))

The accuracy of the LSTM model is 0.907.


### GRU

In [None]:
# Building the model
model_gru = Sequential()
model_gru.add(Input(shape = (1,), dtype = 'string'))

## adding built and trained text vectorizer to the model
model_gru.add(vectorize_layer)  

## This layer receives the vectors of texts and fixes their sizes (at 128)
## max_tokens is the no. of all words in the vocabulary and 1 represents OOV
model_gru.add(Embedding(max_tokens+1, 128))

# Adding the LSTM layers and the last couple of FC layers
model_gru.add(GRU(64))
model_gru.add(Dense(64, activation = "relu"))
model_gru.add(Dense(1, activation = "sigmoid"))

In [None]:
# Setting the model's callbacks 
best_model = tf.keras.callbacks.ModelCheckpoint("best_weights_gru.tf", verbose = 1, 
                                                save_best_only = True, save_weights_only = True)
early_stop = tf.keras.callbacks.EarlyStopping(patience = 5, verbose = 1)

In [None]:
# Setting the performance metric and loss function of the model
model_gru.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])

# Printing the summary of the model
print(model_gru.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding_2 (Embedding)     (None, 100, 128)          128128    
                                                                 
 gru (GRU)                   (None, 64)                37248     
                                                                 
 dense_4 (Dense)             (None, 64)                4160      
                                                                 
 dense_5 (Dense)             (None, 1)                 65        
                                                                 
Total params: 169,601
Trainable params: 169,601
Non-trainable params: 0
________________________________________________

In [None]:
# Fitting the model 
model_gru.fit(train_set, validation_data = val_set, epochs = 10,
              callbacks = [best_model, early_stop])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.45596, saving model to best_weights_gru.tf
Epoch 2/10
Epoch 2: val_loss improved from 0.45596 to 0.38547, saving model to best_weights_gru.tf
Epoch 3/10
Epoch 3: val_loss improved from 0.38547 to 0.35573, saving model to best_weights_gru.tf
Epoch 4/10
Epoch 4: val_loss improved from 0.35573 to 0.33206, saving model to best_weights_gru.tf
Epoch 5/10
Epoch 5: val_loss improved from 0.33206 to 0.31334, saving model to best_weights_gru.tf
Epoch 6/10
Epoch 6: val_loss improved from 0.31334 to 0.29280, saving model to best_weights_gru.tf
Epoch 7/10
Epoch 7: val_loss improved from 0.29280 to 0.26810, saving model to best_weights_gru.tf
Epoch 8/10
Epoch 8: val_loss improved from 0.26810 to 0.26610, saving model to best_weights_gru.tf
Epoch 9/10
Epoch 9: val_loss improved from 0.26610 to 0.25389, saving model to best_weights_gru.tf
Epoch 10/10
Epoch 10: val_loss improved from 0.25389 to 0.21936, saving model to best_weights_gru.tf


<keras.callbacks.History at 0x7f82198a3b20>

In [None]:
# Loading the best weights into the model
model_gru.load_weights("best_weights_gru.tf")

# Evaluating the model performance on the testing set
gru_loss, gru_acc = model_gru.evaluate(test_set)
print('The accuracy of the GRU model is {}.'.format(round(gru_acc, 3)))

The accuracy of the GRU model is 0.908.


### Bidirectional LSTM

In [None]:
# Building the model
model_bilstm = Sequential()
model_bilstm.add(Input(shape = (1,), dtype = 'string'))

## adding built and trained text vectorizer to the model
model_bilstm.add(vectorize_layer)  

## This layer receives the vectors of texts and fixes their sizes (at 128)
## max_tokens is the no. of all words in the vocabulary and 1 represents OOV
model_bilstm.add(Embedding(max_tokens+1, 128))

# Adding the LSTM layers and the last couple of FC layers
model_bilstm.add(Bidirectional(LSTM(64)))
model_bilstm.add(Dense(64, activation = "relu"))
model_bilstm.add(Dense(1, activation = "sigmoid"))

In [None]:
# Setting the model's callbacks 
best_model = tf.keras.callbacks.ModelCheckpoint("best_weights_bilstm.tf", verbose = 1, 
                                                save_best_only = True, save_weights_only = True)
early_stop = tf.keras.callbacks.EarlyStopping(patience = 4, verbose = 1)

In [None]:
# Setting the performance metric and loss function of the model
model_bilstm.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])

# Printing the summary of the model
print(model_bilstm.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding_4 (Embedding)     (None, 100, 128)          128128    
                                                                 
 bidirectional (Bidirectiona  (None, 128)              98816     
 l)                                                              
                                                                 
 dense_8 (Dense)             (None, 64)                8256      
                                                                 
 dense_9 (Dense)             (None, 1)                 65        
                                                                 
Total params: 235,265
Trainable params: 235,265
Non-tr

In [None]:
# Fitting the model 
model_bilstm.fit(train_set, validation_data = val_set, epochs = 10,
                 callbacks = [best_model, early_stop])

Epoch 1/10
Epoch 1: val_loss improved from inf to 0.42020, saving model to best_weights_bilstm.tf
Epoch 2/10
Epoch 2: val_loss improved from 0.42020 to 0.35853, saving model to best_weights_bilstm.tf
Epoch 3/10
Epoch 3: val_loss improved from 0.35853 to 0.33248, saving model to best_weights_bilstm.tf
Epoch 4/10
Epoch 4: val_loss improved from 0.33248 to 0.30781, saving model to best_weights_bilstm.tf
Epoch 5/10
Epoch 5: val_loss improved from 0.30781 to 0.28115, saving model to best_weights_bilstm.tf
Epoch 6/10
Epoch 6: val_loss improved from 0.28115 to 0.24081, saving model to best_weights_bilstm.tf
Epoch 7/10
Epoch 7: val_loss improved from 0.24081 to 0.20397, saving model to best_weights_bilstm.tf
Epoch 8/10
Epoch 8: val_loss improved from 0.20397 to 0.19509, saving model to best_weights_bilstm.tf
Epoch 9/10
Epoch 9: val_loss improved from 0.19509 to 0.14319, saving model to best_weights_bilstm.tf
Epoch 10/10
Epoch 10: val_loss improved from 0.14319 to 0.12865, saving model to best_

<keras.callbacks.History at 0x7f82126c5220>

In [35]:
# Loading the best weights into the model
model_bilstm.load_weights("best_weights_bilstm.tf")

# Evaluating the model performance on the testing set
bilstm_loss, bilstm_acc = model_bilstm.evaluate(test_set)
print('The accuracy of the bidirectional LSTM model is {}.'.format(round(bilstm_acc, 3)))

The accuracy of the bidirectional LSTM model is 0.956.


### Bidirectional GRU

In [None]:
# Building the model
model_bigru = Sequential()
model_bigru.add(Input(shape = (1,), dtype = 'string'))

## adding built and trained text vectorizer to the model
model_bigru.add(vectorize_layer)  

## This layer receives the vectors of texts and fixes their sizes (at 128)
## max_tokens is the no. of all words in the vocabulary and 1 represents OOV
model_bigru.add(Embedding(max_tokens+1, 128))

# Adding the LSTM layers and the last couple of FC layers
model_bigru.add(Bidirectional(GRU(64)))
model_bigru.add(Dense(64, activation = "relu"))
model_bigru.add(Dense(1, activation = "sigmoid"))

In [None]:
# Setting the model's callbacks 
best_model = tf.keras.callbacks.ModelCheckpoint("best_weights_bigru.tf", verbose = 1, 
                                                save_best_only = True, save_weights_only = True)
early_stop = tf.keras.callbacks.EarlyStopping(patience = 4, verbose = 1)

In [None]:
# Setting the performance metric and loss function of the model
model_bigru.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])

# Printing the summary of the model
print(model_bigru.summary())

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 embedding_5 (Embedding)     (None, 100, 128)          128128    
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              74496     
 nal)                                                            
                                                                 
 dense_10 (Dense)            (None, 64)                8256      
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 210,945
Trainable params: 210,945
Non-tr

In [None]:
# Fitting the model 
model_bigru.fit(train_set, validation_data = val_set, epochs = 10,
                callbacks = [best_model, early_stop])

In [37]:
# Loading the best weights into the model
model_bigru.load_weights("best_weights_bigru.tf")

# Evaluating the model performance on the testing set
bigru_loss, bigru_acc = model_bigru.evaluate(test_set)
print('The accuracy of the bidirectional GRU model is {}.'.format(round(bigru_acc, 3)))

The accuracy of the bidirectional GRU model is 0.965.


### Comparison of All Models

In [40]:
# Comparing the loss and accuracy of various models on the test set
compare_df = pd.DataFrame(index = ['RNN', 'LSTM', 'GRU', 'Bidirectional LSTM', 'Bidirectional GRU'],
                          columns = ['Loss', 'Accuracy'])
compare_df['Loss']     = [rnn_loss, lstm_loss, gru_loss, bilstm_loss, bigru_loss]
compare_df['Accuracy'] = [rnn_acc, lstm_acc, gru_acc, bilstm_acc, bigru_acc]
compare_df.sort_values(by = 'Accuracy', ascending = False)

Unnamed: 0,Loss,Accuracy
Bidirectional GRU,0.105,0.965
Bidirectional LSTM,0.131,0.956
GRU,0.22,0.908
LSTM,0.227,0.907
RNN,0.575,0.717


In [41]:
# Checking the prediction of the model on a simple short review
Review1 = "A really amazing movie."
Review2 = "What an absolute garbage, what a waste of time!"
Review3 = "A pretty engaging movie. A bit long and convoluted, but I can still recommend it."
Review4 = "The movie had a great cast and, interesting premise and a very good start. " +  \
          "But the director couldn't deliver the final message, and the storyline collapsed towards the end of the movie."

# Making lists of sample reviews and models          
review_list = [Review1, Review2, Review3, Review4]
model_list  = [model_rnn, model_lstm, model_gru, model_bilstm, model_bigru] 

# Sentiment analysis dataframe
sa_df = pd.DataFrame(index = ['Review 1', 'Review 2', 'Review 3', 'Review 4'], 
                     columns = ['RNN', 'LSTM', 'GRU', 'Bidirectional LSTM', 'Bidirectional GRU', 'Truth'])
sa_df.loc[:, 'Truth'] = ['Positive', 'Negative', 'Positive', 'Negative'] 

# Looping over all models and sample review to store the prediction of models for each review in a dataframe
for j, model in enumerate(model_list):
    for i, review in enumerate(review_list):
        sa_df.iloc[i, j] = classes[int(np.round(model.predict([review], verbose = 0)).flatten())]
# Showing the prediction of all models for all sample reviews
sa_df

Unnamed: 0,RNN,LSTM,GRU,Bidirectional LSTM,Bidirectional GRU,Truth
Review 1,Negative,Positive,Positive,Positive,Positive,Positive
Review 2,Negative,Negative,Negative,Negative,Negative,Negative
Review 3,Negative,Positive,Positive,Positive,Positive,Positive
Review 4,Negative,Negative,Negative,Negative,Negative,Negative
