In [1]:
#import required modules
import os
import re
import time
import nltk
import torch

import numpy as np
import pandas as pd
from torch import nn
import seaborn as sns
from tqdm import tqdm

import torch.nn.utils.prune as prune
import torch.nn.functional as F
from torch.optim import Adam
from bs4 import BeautifulSoup
from collections import Counter
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize import word_tokenize, sent_tokenize
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
#import csv file using pandas
df = pd.read_csv('IMDB Dataset.csv', encoding='utf-8')

#defined a transformation fn to change negative values as zero and positive as one
def transform_lbl(label):
    return 1 if label == 'positive' else 0

# progress bar for pandas functions
# reference: https://www.kdnuggets.com/2022/09/progress-bars-python-tqdm-fun-profit.html
# https://towardsdatascience.com/progress-bars-in-python-and-pandas-f81954d33bae
tqdm.pandas()

# apply the transformation fn on the IMDb dataset
df['label'] = df['sentiment'].progress_apply(transform_lbl)

# check if the changes have been applied
df.head()

100%|█████████████████████████████████| 50000/50000 [00:00<00:00, 350879.73it/s]


Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


## Cleaning up the Dataset
Need to remove the following things: 
- HTML Marks including square brackets in the text
- Contracted Words
- Extra White Space
- Stemming Words
- Stopwords

References used: 

https://towardsdatascience.com/nlp-building-text-cleanup-and-preprocessing-pipeline-eba4095245a0

https://www.kdnuggets.com/2018/03/text-data-preprocessing-walkthrough-python.html

https://lzone.de/examples/Python%20re.sub

https://medium.com/@yashj302/text-cleaning-using-regex-python-f1dded1ac5bd

In [3]:
# download the NLTK resources needed
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/smelany/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/smelany/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/smelany/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/smelany/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
# Define your stopwords set
stopwords_set = set(stopwords.words('english'))

# removing hyperlinks and URLs
def rm_link(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

# removing HTML tags
def rm_html(text):
    return re.sub(r'<[^>]+>', '', text)

# removing regular punctuations
def rm_punct2(text):
    return re.sub(r'[\"\#\$\%\&\'\(\)\*\+\/\:\;\<\=\>\@\[\\\]\^\_\`\{\|\}\~]', ' ', text)

# add spacing between punctations marks
def space_bt_punct(text):
    pattern = r'([.,!?-])'
    s = re.sub(pattern, r' \1 ', text)
    s = re.sub(r'\s{2,}', ' ', s)
    return s

# remove number 
def rm_number(text):
    return re.sub(r'\d+', '', text)

# remove any additional white spaces
def rm_whitespaces(text):
    return re.sub(r' +', ' ', text)

# remove NONASCII characters
def rm_nonascii(text):
    return re.sub(r'[^\x00-\x7f]', r'', text)

# remove emojis from text
def rm_emoji(text):
    emojis = re.compile(
        '['
        u'\U0001F600-\U0001F64F'
        u'\U0001F300-\U0001F5FF'
        u'\U0001F680-\U0001F6FF'
        u'\U0001F1E0-\U0001F1FF'
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE
    )
    return emojis.sub(r'', text)

# removes repeated characters - e.g. 'heeellllooo' will be 'hello'
def spell_correction(text):
    return re.sub(r'(.)\1+', r'\1\1', text)

#tokenize
def tokenize(text):
    return word_tokenize(text)

#removing stopwords
def rm_stopwords(text):
    return [i for i in text if i.lower() not in stopwords_set]

# lemmatize text
def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(t) for t in text]
    return rm_stopwords(lemmas)

# function to cleanup the text
def cleanup(text):
    no_link = rm_link(text)
    no_html = rm_html(no_link)
    space_punct = space_bt_punct(no_html)
    no_punct = rm_punct2(space_punct)
    no_number = rm_number(no_punct)
    no_whitespaces = rm_whitespaces(no_number)
    no_nonasci = rm_nonascii(no_whitespaces)
    no_emoji = rm_emoji(no_nonasci)
    spell_corrected = spell_correction(no_emoji)
    return spell_corrected

# function to preprocess the text
def preprocess_pipeline(text):
    tokens = tokenize(text)
    no_stopwords = rm_stopwords(tokens)
    lemmas = lemmatize(no_stopwords)
    return ' '.join(lemmas)

In [5]:
# apply cleanup function on the text
df['clean'] = df['review'].progress_apply(cleanup)

#apply 
df['preprocessed'] = df['clean'].progress_apply(preprocess_pipeline)

df.head()

100%|███████████████████████████████████| 50000/50000 [00:29<00:00, 1668.89it/s]
100%|████████████████████████████████████| 50000/50000 [02:46<00:00, 300.34it/s]


Unnamed: 0,review,sentiment,label,clean,preprocessed
0,One of the other reviewers has mentioned that ...,positive,1,One of the other reviewers has mentioned that ...,One reviewer mentioned watching Oz episode hoo...
1,A wonderful little production. <br /><br />The...,positive,1,A wonderful little production . The filming te...,wonderful little production . filming techniqu...
2,I thought this was a wonderful way to spend ti...,positive,1,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,0,Basically there s a family where a little boy ...,Basically family little boy Jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,Petter Mattei s Love in the Time of Money is a...,Petter Mattei Love Time Money visually stunnin...


In [6]:
# save new dataframe with preprocessed text as a new csv file
df.to_csv('IMDb_processed.csv', index=False)

In [7]:
# focusing only on the processed text and the labels
# keep only processed and label columns
df[['preprocessed', 'label']].to_csv('./imdb_0.csv', index=False, header=True)

Padding and Mapping of words

In [8]:
# path to your processed CSV file
data = pd.read_csv('./imdb_0.csv')

# Iterating over the first two rows of the DataFrame
for row in data[:2].iterrows():
    # Printing the 'processed' column
    print(row[1]['preprocessed'])
    
    # Printing the 'label' column
    print(f'Label: {row[1]["label"]}')
    
    # Printing a newline for better readability between rows
    print('\n')

One reviewer mentioned watching Oz episode hooked . right , exactly happened . first thing struck Oz brutality unflinching scene violence , set right word GO . Trust , show faint hearted timid . show pull punch regard drug , sex violence . hardcore , classic use word . called OZ nickname given Oswald Maximum Security State Penitentary . focus mainly Emerald City , experimental section prison cell glass front face inwards , privacy high agenda . Em City home many . . Aryans , Muslims , gangsta , Latinos , Christians , Italians , Irish . . . . scuffle , death stare , dodgy dealing shady agreement never far away . would say main appeal show due fact go show dare . Forget pretty picture painted mainstream audience , forget charm , forget romance . . . OZ mess around . first episode ever saw struck nasty surreal , say ready , watched , developed taste Oz , got accustomed high level graphic violence . violence , injustice crooked guard sold nickel , inmate kill order get away , well mannered

In [9]:
# get all processed reviews
reviews = data.preprocessed.values

# merge into single variable, separated by whitespaces
words = ' '.join(reviews)

# obtain list of words
words = words.split()

In [10]:
# check list
words[:5]

['One', 'reviewer', 'mentioned', 'watching', 'Oz']

In [11]:
# building a vocabulary and creating mappings between words and integers using a Counter 
# count the occurrences of words.

# counting the occurrences of each word
counter = Counter(words)

# sorting the words by their frequency in descending order
vocab = sorted(counter, key=counter.get, reverse=True)

# Create a mapping from integer to word
int2word = dict(enumerate(vocab, 1))

# Add a special token for padding at index 0
int2word[0] = '<PAD>'

# creating a mapping from word to integer
word2int = {word: id for id, word in int2word.items()}

In [12]:
# encoding the words in the reviews text
# by using the word-to-integer mapping to replace words with their integer indices
reviews_encoded = [[word2int[word] for word in review.split()] for review in tqdm(reviews)]

100%|██████████████████████████████████| 50000/50000 [00:02<00:00, 18577.58it/s]


In [13]:
# showing the 10 encoded words for the first 5 reviews 
for i in range(5):
    print(f"Review {i + 1}: {reviews_encoded[i][:10]}")

Review 1: [172, 1023, 873, 70, 3614, 167, 2924, 1, 102, 2]
Review 2: [300, 50, 220, 1, 1204, 1535, 17655, 3, 74, 3]
Review 3: [84, 300, 26, 983, 10, 788, 1697, 2532, 2, 1067]
Review 4: [2314, 121, 50, 217, 3121, 33, 607, 4182, 542, 825]
Review 5: [70690, 10390, 1021, 1859, 7387, 2190, 1248, 5, 40, 1]


In [14]:
# Padding sequences function - reviews do not have the same length so we will need set a max sequence length
def pad_features(reviews, pad_id, seq_length=128):
    # Creating a matrix to store the padded features
    features = np.full((len(reviews), seq_length), pad_id, dtype=int)
    # Looping through each review and pad or trim to the specified sequence length
    for i, row in enumerate(reviews):
        # If the review is longer than seq_length, trim it
        features[i, :len(row)] = np.array(row)[:seq_length]
    return features

# Set the desired sequence length
seq_length = 256

# Padding the encoded reviews using the pad_features function
features = pad_features(reviews_encoded, pad_id=word2int['<PAD>'], seq_length=seq_length)

In [15]:
# Assertions to check the dimensions
assert len(features) == len(reviews_encoded)
assert len(features[0]) == seq_length

# Print the first 10 rows and columns for inspection
print(features[:10, :10])

# create numpy array for labels column
labels = data.label.to_numpy()

[[  172  1023   873    70  3614   167  2924     1   102     2]
 [  300    50   220     1  1204  1535 17655     3    74     3]
 [   84   300    26   983    10   788  1697  2532     2  1067]
 [ 2314   121    50   217  3121    33   607  4182   542   825]
 [70690 10390  1021  1859  7387  2190  1248     5    40     1]
 [ 2691     3    10   337     4     2    13 42241     2  2846]
 [  178    14     8    17 12322  1823 55638   106  4989   379]
 [   27   389     2  1276  3908   147    30  2854     1    30]
 [47622   922   330     5   162   722    70     5     1  1242]
 [    8   110  2179  5479  1896     8     4     1   107    74]]


Training and Testing Set

https://pytorch.org/docs/stable/data.html

In [16]:
# Train-Test split parameters
train_size = 0.8  # We will use 80% of the whole data as the training set
val_size = 0.5    # We will use 50% of the remaining data as the validation set

# Make the training set
split_id = int(len(features) * train_size)
train_x, remain_x = features[:split_id], features[split_id:]
train_y, remain_y = labels[:split_id], labels[split_id:]

# Making validation and test sets
split_val_id = int(len(remain_x) * val_size)
val_x, test_x = remain_x[:split_val_id], remain_x[split_val_id:]
val_y, test_y = remain_y[:split_val_id], remain_y[split_val_id:]

# Printing out the shape of the datasets
print('Feature Shapes:')
print('===============')
print('Train set: {}'.format(train_x.shape))
print('Validation set: {}'.format(val_x.shape))
print('Test set: {}'.format(test_x.shape))

# Print the class distribution in each set
print(len(train_y[train_y == 0]), len(train_y[train_y == 1]))
print(len(val_y[val_y == 0]), len(val_y[val_y == 1]))
print(len(test_y[test_y == 0]), len(test_y[test_y == 1]))

Feature Shapes:
Train set: (40000, 256)
Validation set: (5000, 256)
Test set: (5000, 256)
20007 19993
2463 2537
2530 2470


In [17]:
# Defining the batch size
batch_size = 128

# Creating tensor datasets
trainset = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
validset = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
testset = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# Creating data loaders
trainloader = DataLoader(trainset, shuffle=True, batch_size=batch_size)
valloader = DataLoader(validset, shuffle=True, batch_size=batch_size)
testloader = DataLoader(testset, shuffle=True, batch_size=batch_size)

In [18]:
# Check sample batches from the training loader
dataiter = iter(trainloader)
x, y = dataiter.next()

# Print information about the sample batch
print('Sample batch size: ', x.size())   # batch_size, seq_length
print('Sample batch input: \n', x)
print()
print('Sample label size: ', y.size())   # batch_size
print('Sample label input: \n', y)

Sample batch size:  torch.Size([128, 256])
Sample batch input: 
 tensor([[11192,  5667,     2,  ...,   555,   179,    70],
        [ 1479,     1,  1998,  ...,     0,     0,     0],
        [  500,     2,    30,  ...,     0,     0,     0],
        ...,
        [ 1609,    55,    10,  ..., 26205,     1,  4391],
        [ 2270,    17,     4,  ...,     0,     0,     0],
        [  569,   869,  2060,  ...,     1,    30,  2557]])

Sample label size:  torch.Size([128])
Sample label input: 
 tensor([0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1,
        1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
        0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
        0, 1, 1, 1, 1, 1, 0, 1])


# Apply Pruning

https://pytorch.org/tutorials/intermediate/pruning_tutorial.html#sphx-glr-intermediate-pruning-tutorial-py

In [19]:
# define training device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [20]:
print(device)

cpu


In [21]:
# Model architecture for sentiment analysis using an LSTM-based neural network
class SentimentModel(nn.Module):
    def __init__(self, vocab_size, output_size, hidden_size=128, embedding_size=400, n_layers=2, dropout=0.2):
        super(SentimentModel, self).__init__()

        # Embedding layer to map input tokens into vector representations
        self.embedding = nn.Embedding(vocab_size, embedding_size)

        # LSTM layer provided by PyTorch library
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers, dropout=dropout, batch_first=True)

        # Dropout layer for regularization
        self.dropout = nn.Dropout(0.3)

        # Linear layer for the final output
        self.fc = nn.Linear(hidden_size, output_size)

        # Sigmoid layer as we are performing binary classification
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # Convert input features to the 'long' data type
        x = x.long()

        # Map input tokens to vector representations using the embedding layer
        x = self.embedding(x)

        # Pass the input forward through the LSTM layer
        o, _ = self.lstm(x)

        # Retrieve the last output of the sequence
        o = o[:, -1, :]

        # Apply dropout and pass through the fully connected layer
        o = self.dropout(o)
        o = self.fc(o)

        # Apply sigmoid activation for binary classification
        o = self.sigmoid(o)

        return o
# reference used: https://galhever.medium.com/sentiment-analysis-with-pytorch-part-3-cnn-model-7bb30712abd7

In [22]:
# Model hyperparameters
# - `vocab_size`: The size of the vocabulary, representing the number of unique tokens in the input data.
# - `output_size`: The size of the output, typically 1 for binary classification (positive or negative sentiment).
# - `embedding_size`: The dimensionality of the embedding vectors used to represent each token in the input.
# - `hidden_size`: The number of hidden units in the LSTM layer, determining the capacity of the model to capture information.
# - `n_layers`: The number of layers in the LSTM, allowing the model to learn hierarchical features.
# - `dropout`: The dropout rate, a regularization technique applied to prevent overfitting by randomly dropping units during training.
vocab_size = len(word2int)  # Assuming `word2int` is a mapping of words to unique integer indices
output_size = 1
embedding_size = 256
hidden_size = 512
n_layers = 2
dropout = 0.25

# Model initialization
# - Create an instance of the SentimentModel class with the specified hyperparameters.
# - This initializes the neural network with the defined architecture and sets the hyperparameters.
model = SentimentModel(vocab_size, output_size, hidden_size, embedding_size, n_layers, dropout)
print(model)

SentimentModel(
  (embedding): Embedding(120982, 256)
  (lstm): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.25)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [23]:
# Apply weight pruning to the Linear layer (fc) of the model
prune.l1_unstructured(model.fc, name='weight', amount=0.2)

Linear(in_features=512, out_features=1, bias=True)

In [24]:
model = model.to(device)

In [25]:
# Training configuration
lr = 0.001
criterion = nn.BCELoss()
optim = Adam(model.parameters(), lr=lr)
grad_clip = 5
epochs = 25
print_every = 1
history = {'train_loss': [], 
           'train_acc': [], 
           'val_loss': [], 
           'val_acc': [], 
           'epochs': epochs}
es_limit = 5

In [None]:
# Train loop
epochloop = tqdm(range(epochs), position=0, desc='Training', leave=True)

# Early stop trigger
es_trigger = 0
val_loss_min = float('inf')

# Training and Validation Loop

for e in epochloop:

    # training mode

    # Set the model to training mode to enable gradient calculation
    model.train()

    train_loss = 0
    train_acc = 0

    # Iterate over training batches
    for id, (feature, target) in enumerate(trainloader):
        # Add epoch meta info
        epochloop.set_postfix_str(f'Training batch {id}/{len(trainloader)}')

        # Move data to the device
        feature, target = feature.to(device), target.to(device)

        # Reset optimizer
        optim.zero_grad()

        # Forward pass
        out = model(feature)

        # Calculate accuracy
        predicted = torch.tensor([1 if i == True else 0 for i in out > 0.5], device=device)
        equals = predicted == target
        acc = torch.mean(equals.type(torch.FloatTensor))
        train_acc += acc.item()

        # Calculate loss and perform backpropagation
        loss = criterion(out.squeeze(), target.float())
        train_loss += loss.item()
        loss.backward()

        # Clip gradients to prevent exploding gradients
        nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        # Update optimizer
        optim.step()

        # Free some memory
        del feature, target, predicted

    # Store training metrics
    history['train_loss'].append(train_loss / len(trainloader))
    history['train_acc'].append(train_acc / len(trainloader))

    
    # validation mode
    
    # Set the model to evaluation mode to disable gradient calculation
    model.eval()

    val_loss = 0
    val_acc = 0

    with torch.no_grad():
        # Iterate over validation batches
        for id, (feature, target) in enumerate(valloader):
            # Add epoch meta info
            epochloop.set_postfix_str(f'Validation batch {id}/{len(valloader)}')
            
            # Move data to the device
            feature, target = feature.to(device), target.to(device)

            # Forward pass
            out = model(feature)

            # Calculate accuracy
            predicted = torch.tensor([1 if i == True else 0 for i in out > 0.5], device=device)
            equals = predicted == target
            acc = torch.mean(equals.type(torch.FloatTensor))
            val_acc += acc.item()

            # Calculate loss
            loss = criterion(out.squeeze(), target.float())
            val_loss += loss.item()

            # Free some memory
            del feature, target, predicted

        # Store validation metrics
        history['val_loss'].append(val_loss / len(valloader))
        history['val_acc'].append(val_acc / len(valloader))

    # Reset model to training mode
    model.train()

    # Add epoch meta info
    epochloop.set_postfix_str(f'Val Loss: {val_loss / len(valloader):.3f} | Val Acc: {val_acc / len(valloader):.3f}')

    # Print epoch information
    if (e+1) % print_every == 0:
        epochloop.write(f'Epoch {e+1}/{epochs} | Train Loss: {train_loss / len(trainloader):.3f} Train Acc: {train_acc / len(trainloader):.3f} | Val Loss: {val_loss / len(valloader):.3f} Val Acc: {val_acc / len(valloader):.3f}')
        epochloop.update()

    # Save model if validation loss decreases
    if val_loss / len(valloader) <= val_loss_min:
        torch.save(model.state_dict(), './sentiment_lstm.pt')
        val_loss_min = val_loss / len(valloader)
        es_trigger = 0
    else:
        epochloop.write(f'[WARNING] Validation loss did not improve ({val_loss_min:.3f} --> {val_loss / len(valloader):.3f})')
        es_trigger += 1

    # Force early stop
    if es_trigger >= es_limit:
        epochloop.write(f'Early stopped at Epoch-{e+1}')
        # Update epochs history
        history['epochs'] = e+1
        break


Training:   0%|                    | 0/25 [43:55<?, ?it/s, Training batch 7/313]