In [1]:
# To import required libraries

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

import nltk                                   # To use the NLTK (Natural Language Toolkit) library, particularly for the tokenization NLP task
nltk.download('stopwords')                    # To download the list of stopwords (i.e., common words) from the NLTK corpus
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))  # To create a list of stopwords

from collections import Counter # Counter is a subclass of dict that is used to count occurrences of elements in an iterable
import string                   # Contains functions and constants related to string processing
import re                       # The re module provides support for regular expressions (pattern matching, searching, and modifying within strings)
import seaborn as sns           # Seaborn is a Python data visualization library for statistical data visualization

from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split # To use sklearn libraries to split train and test dataset

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Device Initialization

if torch.cuda.is_available:
    device = torch.device("cuda")
    print("GPU assigned for processing!\n")
else:
    device = torch.device("cpu")
    print("CPU assigned for processing!\n")

GPU assigned for processing!



In [3]:
# Task 1: To load and explore the dataset (Columns are identified)

imdb_csv_file = '/content/drive/MyDrive/Notebooks_DL/IMDB_dataset/IMDB_Dataset.csv'
df = pd.read_csv(imdb_csv_file)
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [4]:
# Task 1: To split Dataset into train and test (i.e., 80-20)

x, y = df['review'].values, df['sentiment'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)
print(f'shape of train data is {x_train.shape}')
print(f'shape of test data is {x_test.shape}\n')

shape of train data is (40000,)
shape of test data is (10000,)



In [5]:
# Task 1: The function to refine the input word by removing non-word characters, white spaces, and digits exploiting "re"

def preprocess_string(s):
    s = re.sub(r"[^\w\s]", '', s) # To replace all non-word characters (everything except numbers and letters) with no-space
    s = re.sub(r"\s+", '', s)     # To replace all runs of whitespaces with no-space
    s = re.sub(r"\d", '', s)      # To replace digits with no-space
    return s

In [6]:
# Task 1: The function to pad text sequences (Generally it pads a list of sequences to a fixed length (seq_len) ensuring all sequences have the same length)

def padding (sentences, seq_len):
    features = np.zeros((len(sentences), seq_len), dtype=int)
    for idx, review in enumerate(sentences): # review is a list of numerical tokens representing a sentence
        if len(review) != 0:                 # Ensures the sentence is not empty
            features[idx, -len(review):] = np.array(review)[:seq_len]
    return features

# If review is shorter than seq_len → It is right-aligned, with zeros in the front.
# If review is longer than seq_len → It is truncated from the front, keeping the last seq_len tokens.

In [7]:
# Task 1: The function to tockenize and pad text sequences using preprocess_string and padding functions

def tockenize(x_train, y_train, x_val, y_val, stop_words):

    word_list = []
    for sent in x_train:
        for word in sent.lower().split():
            word = preprocess_string(word)            # First, it removes non-word characters, white spaces, and digits from each word of a sentence
            if word not in stop_words and word != '': # Second, it stores the word if it is not a stop_word or common_word
                word_list.append(word)

    corpus = Counter(word_list)                                         # Counts the occurrences of each word in word_list and stores the result as corpus dictionary
    corpus_sorted = sorted(corpus, key=corpus.get, reverse=True)[:1000] # Then it sorts on the basis of most common words and keeps only top 1000 most frequent words
    one_hot_dict = {w:i+1 for i, w in enumerate(corpus_sorted)}         # Then it creates a dictionary that maps each word in corpus_sorted to a unique integer index
    print(corpus)
    print(corpus_sorted)
    print(one_hot_dict)

    # Tockenize converts a list of sentences (x_train) into a list of numerical indices, mapping words to their corresponding integer values in one_hot_dictionary
    # It is a common preprocessing step in Natural Language Processing (NLP) before feeding data into machine learning models
    final_list_train = []
    final_list_test  = []
    for sent in x_train:
        final_list_train.append([one_hot_dict[preprocess_string(word)] for word in sent.lower().split() if preprocess_string(word) in one_hot_dict.keys()])

    for sent in x_val:
        final_list_test.append([one_hot_dict[preprocess_string(word)] for word in sent.lower().split() if preprocess_string(word) in one_hot_dict.keys()])

    encoded_train = [1 if label =='positive' else 0 for label in y_train]
    encoded_test  = [1 if label =='positive' else 0 for label in y_val]

    # Padding to make all sequences of equal length

    # max_length = max(len(seq) for seq in final_list_train + final_list_test)
    max_length = 500
    print(f"The maximum sequence length is {max_length}")

    x_train_padded = padding (final_list_train, max_length)
    x_test_padded  = padding (final_list_test,  max_length)

    return np.array(x_train_padded), np.array(encoded_train), np.array(x_test_padded), np.array(encoded_test), one_hot_dict


In [8]:
# Task 1: Apply the defined tockenization function
x_train_padded, y_train_encoded, x_test_padded, y_test_encoded, vocab = tockenize(x_train, y_train, x_test, y_test, stop_words)

print(f'\nLength of vocabulary is {len(vocab)} and it is sorted as most frequently used words first')

['br', 'movie', 'film', 'one', 'like', 'good', 'even', 'would', 'time', 'really', 'see', 'story', 'much', 'well', 'get', 'also', 'great', 'bad', 'people', 'first', 'dont', 'movies', 'make', 'made', 'films', 'could', 'way', 'characters', 'think', 'watch', 'many', 'seen', 'never', 'character', 'two', 'love', 'acting', 'plot', 'little', 'know', 'best', 'show', 'life', 'ever', 'better', 'still', 'say', 'end', 'scene', 'man', 'scenes', 'something', 'go', 'im', 'back', 'watching', 'real', 'didnt', 'doesnt', 'thing', 'actors', 'years', 'another', 'though', 'actually', 'funny', 'makes', 'find', 'nothing', 'look', 'going', 'lot', 'work', 'new', 'every', 'old', 'us', 'part', 'cant', 'director', 'thats', 'cast', 'want', 'pretty', 'quite', 'things', 'seems', 'around', 'young', 'got', 'however', 'take', 'enough', 'fact', 'world', 'ive', 'big', 'give', 'horror', 'thought', 'may', 'always', 'long', 'without', 'isnt', 'gets', 'saw', 'music', 'right', 'almost', 'must', 'come', 'times', 'original', 'the

In [9]:
# Task 1: Data Preprocessing (Convert NumPy arrays to Tensors to define PyTorch DataLoader)

train_data = TensorDataset(torch.from_numpy(x_train_padded), torch.from_numpy(y_train_encoded))
valid_data = TensorDataset(torch.from_numpy(x_test_padded), torch.from_numpy(y_test_encoded))

batch_size = 50
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)

# Data Size Explaoration
print(f"Length train_loader is: {len(train_loader)}")
print(f"Length test_loader is: {len(valid_loader)}")
print(f"Each Sample has the size of: {train_data[0][0].size()}")

Length train_loader is: 800
Length test_loader is: 200
Each Sample has the size of: torch.Size([500])


In [10]:
# Task 2: Build an RNN Model (Utilizing LSTM, SimpleRNN, GRU, Dropout, Linear, and Sigmoid Layers)

class SentimentRNN(nn.Module):
    def __init__(self, n_layers, vocab_size, output_dim, hidden_dim, embedding_dim, drop_prob=0.3):
        super(SentimentRNN,self).__init__()

        self.n_layers   = n_layers
        self.vocab_size = vocab_size
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.drop_prob  = drop_prob

        # Embedding layer
        # Converts categorical word indices (sparse and high-dimensional) into meaningful dense vectors of fixed lower size
        # Learns embeddings during training (instead of using one-hot encoding). Captures semantic relationships between words
        # Output: batch_size * sequence_length * embedding_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layer
        # Takes input embeddings (word representations from nn.Embedding)
        # Processes them sequentially (one step at a time)
        # Updates the hidden state (h_t) and cell state (C_t) at each step
        # Passes the final hidden state to the next layer or for prediction

        # self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_dim, num_layers=n_layers, batch_first=True)
        # self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=self.hidden_dim, num_layers=n_layers, batch_first=True)
        self.gru = nn.GRU(input_size=embedding_dim, hidden_size=self.hidden_dim, num_layers=n_layers, batch_first=True)

        # Dropout layer
        self.dropout = nn.Dropout(drop_prob)

        # Linear layer
        self.fc = nn.Linear(self.hidden_dim, output_dim)

        # Sigmoid layer
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):

        batch_size = x.size(0)

        # Embeddings
        embeds = self.embedding(x)

        # LSTM feeding
        # The input and output tensors in LSTM layer are provided as (batch, seq, feature), so we need to flatten the output
        # lstm_out, hidden = self.lstm(embeds, hidden)
        # lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        # rnn_out, hidden = self.rnn(embeds, hidden)
        # rnn_out = rnn_out.contiguous().view(-1, self.hidden_dim)
        gru_out, hidden = self.gru(embeds, hidden)
        gru_out = gru_out.contiguous().view(-1, self.hidden_dim)

        # dropout and fully connected layer
        # out = self.dropout(lstm_out)
        # out = self.dropout(rnn_out)
        out = self.dropout(gru_out)
        out = self.fc(out)
        sig_out = self.sig(out)

        # Reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)

        # To get the last batch of labels
        sig_out = sig_out[:, -1]

        # Returns the last sigmoid output (i.e., predicted lables) and hidden state
        return sig_out, hidden

    '''
    def init_hidden(self, batch_size):

        # Initializes the hidden state and cell state of the LSTM
        h0 = torch.zeros((self.n_layers, batch_size, self.hidden_dim)).to(device)
        c0 = torch.zeros((self.n_layers, batch_size, self.hidden_dim)).to(device)
        hidden = (h0, c0)

        return hidden
    '''

    def init_hidden(self, batch_size):
        h0 = torch.zeros((self.n_layers, batch_size, self.hidden_dim)).to(device)
        return h0  # SimpleRNN and GRU don't have a cell state

In [11]:
# Task 2: Initialize the defined RNN Model and Optimizer

n_layers = 2                # Number of stacked LSTM layers
vocab_size = len(vocab) + 1 # Plus 1 for padding
embedding_dim = 64          # The size of each embedded vector (embedding size)
output_dim = 1              # The number of output classes (e.g., 1 for binary classification)
hidden_dim = 256            # The number of features in the hidden state, or number of hidden units in each LSTM cell
drop_prob= 0.2

model = SentimentRNN(n_layers, vocab_size, output_dim, hidden_dim, embedding_dim, drop_prob)
model.to(device)
print(model)

LR = 0.001               # Learning Rate
criterion = nn.BCELoss() # Loss function: Binary Cross Entropy loss function
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# A function to calculate accuracy
def acc(pred, label):
    pred = torch.round(pred.squeeze())
    return torch.sum(pred == label.squeeze()).item()

SentimentRNN(
  (embedding): Embedding(1001, 64)
  (gru): GRU(64, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [None]:
# Task 3: Train the RNN Model
# Task 4: Evaluate and Test the Model

clip_value = 5
epochs = 10
valid_loss_min = np.Inf

# To track accuracy and loss for Train and Validation phases
epoch_tr_loss, epoch_vl_loss = [],[]
epoch_tr_acc,  epoch_vl_acc  = [],[]

for epoch in range(epochs):

    ### Training Phase starts here
    train_losses = []
    train_acc = 0.0
    model.train()

    # To initialize hidden state (Initialize once, update dynamically)
    h = None

    for inputs, labels in train_loader:

        inputs, labels = inputs.to(device), labels.to(device)

        batch_size = inputs.size(0)  # Get actual batch size (may be < full batch size in the last batch)

        # If h is None (first batch) or batch size has changed (last batch), reinitialize
        if h is None or batch_size != h.size(1):
              h = model.init_hidden(batch_size).to(device)
        '''else:
              h = h[:, :batch_size, :].contiguous()  # Ensures correct size'''

        # h = tuple([each.data for each in h]) # Correct for LSTM
        h = h.data # Correct for SimpleRNN and GRU

        model.zero_grad()

        output, h = model(inputs, h)
        loss = criterion(output.squeeze(), labels.float()) # Calculate the loss and perform backpropagation
        loss.backward()
        train_losses.append(loss.item())
        correct = acc(output, labels) # Calculating Correct Predictions
        train_acc += correct

        # clip_grad_norm helps to prevent the exploding of gradients RNNs/LSTMs
        nn.utils.clip_grad_norm_(model.parameters(), clip_value)
        optimizer.step()

    ### Validation Phase starts here
    val_losses = []
    val_acc = 0.0
    model.eval()
    h_val = None  # Initialize once, update dynamically

    for inputs, labels in valid_loader:

        inputs, labels = inputs.to(device), labels.to(device)

        batch_size = inputs.size(0)  # Get actual batch size (may be < full batch size in the last batch)

        # If h_val is None (first batch) or batch size has changed (last batch), reinitialize
        if h_val is None or batch_size != h_val.size(1):
              h_val = model.init_hidden(batch_size).to(device)
        '''else:
              h_val = h_val[:, :batch_size, :].contiguous() # Ensures correct size'''

        # h_val = tuple([each.data for each in h_val]) # Correct for LSTM
        h_val = h_val.detach() # Correct for SimpleRNN and GRU

        output, h_val = model(inputs, h_val)

        val_loss = criterion(output.squeeze(), labels.float())
        val_losses.append(val_loss.item())

        correct = acc(output, labels)
        val_acc += correct

    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss   = np.mean(val_losses)
    epoch_train_acc  = train_acc/len(train_loader.dataset)
    epoch_val_acc    = val_acc/len(valid_loader.dataset)

    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)

    print(f'Epoch {epoch+1}')
    print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')
    print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')

    if epoch_val_loss <= valid_loss_min:
      print('Validation loss decreased from {:.6f} to {:.6f}. Saving model ...'.format(valid_loss_min, epoch_val_loss))
      valid_loss_min = epoch_val_loss

    print(40*'==')


Epoch 1
train_loss : 0.4705791646242142 val_loss : 0.37321629270911216
train_accuracy : 76.79249999999999 val_accuracy : 83.37
Validation loss decreased from inf to 0.373216. Saving model ...
Epoch 2
train_loss : 0.3280681992881 val_loss : 0.3351898612827063
train_accuracy : 85.91250000000001 val_accuracy : 86.22999999999999
Validation loss decreased from 0.373216 to 0.335190. Saving model ...
Epoch 3
train_loss : 0.2887182161025703 val_loss : 0.3272768373042345
train_accuracy : 87.81 val_accuracy : 86.35000000000001
Validation loss decreased from 0.335190 to 0.327277. Saving model ...
Epoch 4
train_loss : 0.24519978249445556 val_loss : 0.34472985461354255
train_accuracy : 89.895 val_accuracy : 86.06


In [None]:
# Demonstration

fig = plt.figure(figsize = (20, 6))
plt.subplot(1, 2, 1)
plt.plot(epoch_tr_acc, label='Training Acc')
plt.plot(epoch_vl_acc, label='Validation Acc')
plt.title("Accuracy")
plt.legend()
plt.grid()

plt.subplot(1, 2, 2)
plt.plot(epoch_tr_loss, label='Training loss')
plt.plot(epoch_vl_loss, label='Validation loss')
plt.title("Loss")
plt.legend()
plt.grid()

plt.show()

In [None]:
# Demonstration using AUC and ROC Plot

import sklearn.metrics as metrics
from sklearn.metrics import roc_auc_score
def plot_auc_roc(model,valid_loader, version='title', threshold=0.5):
    y_pred = []
    y_true = []
    model.eval()
    with torch.no_grad():
        for inputs, labels in valid_loader :

            batch_size = inputs.size(0)

            h_val = model.init_hidden(batch_size)

            # h_val = tuple([each.data for each in h_val]) # Correct for SimpleRNN
            h_val = h_val.detach() # Correct for SimpleRNN and GRU

            inputs, labels = inputs.to(device), labels.to(device)

            output, h_val = model(inputs, h_val)

            output = (output > threshold).int()
            y_pred.extend(output.tolist())
            y_true.extend(labels.tolist())

    print('AUC ROC :')
    fpr, tpr, threshold = metrics.roc_curve(y_true, y_pred)
    roc_auc = metrics.auc(fpr, tpr)

    print(roc_auc)
    print('----------------------------------------------------------')

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
plot_auc_roc(model, valid_loader)

In [None]:
# Task 4: Test the Model by grabing random reviews from

def predict_text(text):
        word_seq = np.array([vocab[preprocess_string(word)] for word in text.split()
                         if preprocess_string(word) in vocab.keys()])
        word_seq = np.expand_dims(word_seq, axis=0)
        pad =  torch.from_numpy(padding (word_seq, 500))
        inputs = pad.to(device)
        batch_size = 1
        h = model.init_hidden(batch_size)
        # h = tuple([each.data for each in h]) # Correct for LSTM
        h = h.detach() # Correct for SimpleRNN and GRU
        output, h = model(inputs, h)
        return output.item()

index = 30
print(df['review'][index])
print('='*70)
print(f'Actual sentiment is  : {df["sentiment"][index]}')
print('='*70)
score = predict_text(df['review'][index])
status = "positive" if score > 0.5 else "negative"
score = (1 - score) if status == "negative" else score
print(f'Predicted sentiment is {status} with a probability of {score}\n')

index = 32
print(df['review'][index])
print('='*70)
print(f'Actual sentiment is  : {df["sentiment"][index]}')
print('='*70)
prob = predict_text(df['review'][index])
status = "positive" if prob > 0.5 else "negative"
prob = (1 - prob) if status == "negative" else prob
print(f'predicted sentiment is {status} with a probability of {prob}\n')