# Mohit Yadav
## yadav171@umn.edu
## CSCI 5541 HW1

In [5]:
## Importing packages.
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import re
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sentencepiece as spm

import torchtext
torchtext.disable_torchtext_deprecation_warning()

OSError: /home/rpmdt05/mambaforge/envs/mohit-new-nlp/lib/python3.8/site-packages/torchtext/lib/libtorchtext.so: undefined symbol: _ZN5torch3jit17parseSchemaOrNameERKSs

In [2]:
## Set device to Cuda if available
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    # No CUDA runtime visible: fall back to the CPU.
    device = torch.device('cpu')

In [None]:
## Download IMDB data.
from torchtext.datasets import IMDB

# Request both splits in a single call; the results are unpacked in the same
# order as the names given in `split`.
train_iter, test_iter = IMDB(split=('train', 'test'))

We see a lot of html in the text, and there are symbols which don't have meaning for our work. So we will need to clean the data before using.

In [4]:
## Convert the data to a pandas df for easy processing.
# Each record yielded by a split iterator fills the two named columns.
train_df, test_df = (
    pd.DataFrame(split_iter, columns=['label', 'text'])
    for split_iter in (train_iter, test_iter)
)

Visualize the data.

In [None]:
# Column names, dtypes and non-null counts of the training split.
train_df.info()

In [None]:
# Same summary for the test split.
test_df.info()

In [None]:
## See the text stored in the dataframe.
# Only the row labelled 0 is printed, still raw (no cleaning has been applied yet).
print(train_df['text'][0])

There are a lot of symbols and HTML markup in the sample that need to be removed.

In [None]:
#Special thanks to https://www.kaggle.com/tanulsingh077 for this function
def clean_text(text):
    '''Normalise a raw review into lowercase plain words.

    Steps, in order: lowercase; drop text in square brackets; drop links;
    replace HTML tags with a space; drop punctuation; drop words containing
    digits; collapse every run of whitespace (newlines included) to one space.

    Tags and newlines are turned into a space instead of being deleted so the
    words on either side stay separate ("good.<br />bad" must not end up as
    "goodbad").

    Args:
        text: Raw review; coerced with ``str()`` so non-string values work too.

    Returns:
        The cleaned string. Leading/trailing whitespace is not stripped.
    '''
    text = str(text).lower()
    # Raw-string patterns avoid invalid-escape warnings for \[, \S, \w and \d.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Done last so the gaps left by the removals above are collapsed as well.
    text = re.sub(r'\s+', ' ', text)
    return text

In [9]:
## Clean the text and store in a txt file for training the tokenizer.
# Write UTF-8 explicitly: the platform default encoding may not be able to
# represent every character in the reviews, and this file is read back by the
# SentencePiece trainer.
with open("cleaned_text.txt", "w", encoding="utf-8") as f:
    for label, text in train_iter:
        # One cleaned review per line.
        f.write(clean_text(text).strip() + "\n")

In [None]:
### BPE Tokenization.
## Create spm model with our data and vocab size of 8000.
VOCAB_SIZE = 8000
spm.SentencePieceTrainer.train(
    input='./cleaned_text.txt',
    model_prefix='bpe_model',
    vocab_size=VOCAB_SIZE,
    model_type='bpe',
)

# Load the model file named after the `model_prefix` used above.
sp = spm.SentencePieceProcessor(model_file='bpe_model.model')

## Tokenization functions.
def tokenize_into_str(text):
    """Encode `text` with the global `sp` model and return its pieces joined by spaces."""
    pieces = sp.encode(text, out_type=str)
    return " ".join(pieces)

def tokenize_into_idx(text):
    """Encode `text` with the global `sp` model and return its ids as a space-separated string."""
    token_ids = sp.encode(text)
    return " ".join(map(str, token_ids))

In [None]:
## Add cleaned text to dataframes.
# Both splits get the same three derived columns: the cleaned text, then that
# text tokenized once into pieces and once into vocabulary ids.
for split_df in (train_df, test_df):
    split_df['cleaned_text'] = split_df['text'].apply(clean_text)
    split_df['tokenized_text'] = split_df['cleaned_text'].apply(tokenize_into_str)
    split_df['tokenized_idx'] = split_df['cleaned_text'].apply(tokenize_into_idx)

In [12]:
## Clean the dataframe for idx and nan values.
# An empty id string is first turned into NaN so that dropna can remove the row.
for split_df in (train_df, test_df):
    split_df.replace({"tokenized_idx": ""}, np.nan, inplace=True)
    split_df.dropna(subset=['tokenized_idx'], inplace=True)

In [13]:
## Add the size of tokens to df, will be used by model.
def count_tokens(text):
    """Return how many whitespace-separated tokens `text` contains."""
    tokens = text.split()
    return len(tokens)

# Store the per-review token count next to the id string in both splits.
for split_df in (train_df, test_df):
    split_df['num_tokens'] = split_df['tokenized_idx'].apply(count_tokens)

In [None]:
## See sample of data.
# First rows of the training split, including the derived token columns.
train_df.head()

In [None]:
# First rows of the test split, including the derived token columns.
test_df.head()

In [None]:
## Stats of number of tokens in data.
def _token_count_summary(frame):
    """Format mean/std/max/min of `frame['num_tokens']`, each to two decimals."""
    counts = frame['num_tokens']
    return f"mean={counts.mean():.2f}, std={counts.std():.2f}, max={counts.max():.2f}, min={counts.min():.2f}"

print(f"Train:\t {_token_count_summary(train_df)}")
print(f"Test:\t {_token_count_summary(test_df)}")

There is a lot of variation in token size as seen from the high std values.

In [17]:
## Remove non relevant data from the dfs for training.
# Only these three columns are read by the dataset class and the batching code.
model_columns = ['tokenized_idx', 'num_tokens', 'label']
train_df = train_df[model_columns]
test_df = test_df[model_columns]

In [18]:
## Make the labels 0 and 1.
# Shift every label down by one in both splits.
# NOTE(review): assumes the raw labels are 1 and 2, and re-running this cell
# subtracts again — confirm and run it once per kernel.
for split_df in (train_df, test_df):
    split_df.loc[:, 'label'] = split_df['label'] - 1

In [None]:
## Check to see appropriate conversion.
# pd.unique on the column lists the distinct label values left in each split.
print("Unique Labels in train data: ", pd.unique(train_df['label']))
print("Unique Labels in test data: ", pd.unique(test_df['label']))

In [None]:
# Training split after trimming to the model columns and shifting the labels.
train_df.head()

In [None]:
# Test split after trimming to the model columns and shifting the labels.
test_df.head()

In [22]:
## Create the DataLoader Class,
## I am not reading from disk as all the data was already loaded into RAM.
class imdbDataset(Dataset):
    """Map-style dataset over a DataFrame of pre-tokenized reviews.

    Rows are read positionally and must provide the columns 'tokenized_idx'
    (space-separated integer ids), 'num_tokens' and 'label'.
    """

    def __init__(self, data):
        # Only a reference to the frame is kept; nothing is copied or pre-parsed.
        self.dataset = data

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return (list of int token ids, token count, label) for row `idx`."""
        row = self.dataset.iloc[idx]
        text_indices = list(map(int, row['tokenized_idx'].split()))
        return text_indices, row['num_tokens'], row['label']

In [23]:
## Define hyperparameters
BATCH_SIZE = 32
EMBED_DIM = 32
NUM_CLASS = 2

## Set Seed
# Named so the seed used for the run is visible alongside the other settings.
SEED = 33
torch.manual_seed(SEED)
# Request deterministic cuDNN kernels and switch off its benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [24]:
### Creating a Single layer model.
class SLMLP(nn.Module):
    """Single linear layer on top of a pooled bag-of-tokens embedding."""

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        # No `mode` is passed, so EmbeddingBag pools with its library default.
        self.embedding_sum = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Re-draw both weight matrices from U(-0.5, 0.5) and zero the bias."""
        initrange = 0.5
        nn.init.uniform_(self.embedding_sum.weight, -initrange, initrange)
        nn.init.uniform_(self.fc.weight, -initrange, initrange)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """Pool the flat `text` ids into one vector per bag (bag starts in `offsets`), then classify."""
        pooled = self.embedding_sum(text, offsets)
        return self.fc(pooled)

In [25]:
## Create the multi layer model.
class MLP(nn.Module):
    """Bag-of-tokens embedding followed by one ReLU hidden layer and an output layer."""

    def __init__(self, vocab_size, embed_dim, hidden_size, num_class):
        super().__init__()
        # No `mode` is passed, so EmbeddingBag pools with its library default.
        self.embedding_sum = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc1 = nn.Linear(embed_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_class)
        self.relu = nn.ReLU()
        self.init_weights()

    def init_weights(self):
        """Re-draw all weight matrices from U(-0.5, 0.5) and zero both biases."""
        initrange = 0.5
        # Weights are drawn in the order embedding, fc1, fc2.
        for weight in (self.embedding_sum.weight, self.fc1.weight, self.fc2.weight):
            nn.init.uniform_(weight, -initrange, initrange)
        for bias in (self.fc1.bias, self.fc2.bias):
            nn.init.zeros_(bias)

    def forward(self, text, offsets):
        """Pool the flat `text` ids per bag (bag starts in `offsets`) and return class scores."""
        hidden = self.relu(self.fc1(self.embedding_sum(text, offsets)))
        return self.fc2(hidden)

In [26]:
## Create function to generate data in a batch.
def generate_batch(batch):
    """Collate (token_ids, token_count, label) samples into flat tensors.

    Returns:
        (batch_indices, batch_labels, offsets): all token ids concatenated into
        one long tensor, the labels as a long tensor, and the start position of
        each sample inside the concatenated ids.
    """
    samples = list(batch)
    flat_indices = [token for tokens, _, _ in samples for token in tokens]
    lengths = [count for _, count, _ in samples]
    labels = [label for _, _, label in samples]

    # Start of sample k is the running total of the lengths before it, so the
    # last length is never needed.
    starts = torch.tensor(([0] + lengths)[:-1]).cumsum(dim=0)
    return (
        torch.tensor(flat_indices, dtype=torch.long),
        torch.tensor(labels, dtype=torch.long),
        starts,
    )

In [27]:
## Define DataLoaders.
train_loader = DataLoader(imdbDataset(train_df), batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
# Evaluation gains nothing from shuffling; a fixed order keeps the test pass
# and the error-analysis samples drawn from this loader repeatable.
test_loader = DataLoader(imdbDataset(test_df), batch_size=BATCH_SIZE, shuffle=False, collate_fn=generate_batch)

In [28]:
## Define Training Function
def train_func(data_loader, model, criterion, optimizer, scheduler):
    """Run one training epoch and return (mean loss per sample, accuracy).

    Args:
        data_loader: Iterable of ``(text_indices, label, offsets)`` batches, in
            the order returned by ``generate_batch``.
        model: ``nn.Module`` called as ``model(text_indices, offsets)``.
        criterion: Loss that returns the batch-mean loss (the default reduction
            of ``nn.CrossEntropyLoss``).
        optimizer: Stepped once per batch.
        scheduler: Stepped once, at the end of the epoch.

    Returns:
        ``(loss, accuracy)``, both averaged over every sample seen.
    """
    # Send each batch to wherever the model's parameters live, so the function
    # works whether or not the model was moved to an accelerator.
    model_device = next(model.parameters()).device
    model.train()

    train_loss = 0
    train_acc = 0
    total_size = 0
    for text_indices, label, offsets in data_loader:
        batch_size = len(label)
        total_size += batch_size
        optimizer.zero_grad()
        text_indices = text_indices.to(model_device)
        offsets = offsets.to(model_device)
        label = label.to(model_device)

        # Forward pass.
        model_output = model(text_indices, offsets)

        ## Compute loss and accuracy.
        loss = criterion(model_output, label)
        # The criterion yields a per-batch mean; weight it by the batch size so
        # that dividing by total_size below gives a true per-sample mean.
        train_loss += loss.item() * batch_size
        loss.backward()
        optimizer.step()
        train_acc += (model_output.argmax(dim=1) == label).sum().item()
    scheduler.step()
    return train_loss / total_size, train_acc / total_size


## Define Validation Function, can be used on Test data too without any modification.
def val_func(data_loader, model, criterion):
    """Evaluate `model` on `data_loader` and return (mean loss per sample, accuracy).

    Args:
        data_loader: Iterable of ``(text_indices, label, offsets)`` batches, in
            the order returned by ``generate_batch``.
        model: ``nn.Module`` called as ``model(text_indices, offsets)``.
        criterion: Loss that returns the batch-mean loss (the default reduction
            of ``nn.CrossEntropyLoss``).

    Returns:
        ``(loss, accuracy)``, both averaged over every sample seen.
    """
    # Send each batch to wherever the model's parameters live.
    model_device = next(model.parameters()).device
    # Evaluate in eval mode, then put the model back in the mode it was in.
    was_training = model.training
    model.eval()

    val_loss = 0
    val_acc = 0
    total_size = 0
    try:
        with torch.no_grad():
            for text_indices, label, offsets in data_loader:
                batch_size = len(label)
                total_size += batch_size
                text_indices = text_indices.to(model_device)
                offsets = offsets.to(model_device)
                label = label.to(model_device)

                # Forward pass.
                model_output = model(text_indices, offsets)

                ## Compute loss and accuracy.
                loss = criterion(model_output, label)
                # Weight the batch mean by the batch size so the final division
                # by total_size gives a per-sample mean.
                val_loss += loss.item() * batch_size
                val_acc += (model_output.argmax(dim=1) == label).sum().item()
    finally:
        model.train(was_training)
    return val_loss / total_size, val_acc / total_size


## Single Layer MLP.

In [29]:
## Define model and training parameters.
# Move the model to `device`, as is done for the two-layer models, so that its
# parameters sit on the same device the batches are sent to during training.
modelSLMLP = SLMLP(VOCAB_SIZE, EMBED_DIM, NUM_CLASS).to(device)

## Define Training parameters
criterionSLMLP = nn.CrossEntropyLoss()
optimizerSLMLP = torch.optim.SGD(modelSLMLP.parameters(), lr=1.0)
# step_size=1 with gamma=0.9: the learning rate is multiplied by 0.9 on every scheduler step.
schedulerSLMLP = torch.optim.lr_scheduler.StepLR(optimizerSLMLP, 1, gamma=0.9)

### Train the Single layer model.

In [None]:
### Training Loop for single layer model.
# N_EPOCHS is defined here and reused by the later training loops.
N_EPOCHS = 31

# Per-epoch history; these lists are read by the plotting cells.
lossSLMLP = []
accuracySLMLP = []
test_accuracySLMLP = []

for epoch in range(N_EPOCHS):
    # One pass over the training loader, then an evaluation pass over the test loader.
    train_loss, train_acc = train_func(train_loader, modelSLMLP, criterionSLMLP, optimizerSLMLP, schedulerSLMLP)
    test_loss, test_acc = val_func(test_loader, modelSLMLP, criterionSLMLP)

    print(f'Epoch {epoch + 1} \tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Test. Acc: {test_acc*100:.2f}%')

    # test_loss is computed but neither printed nor stored.
    lossSLMLP.append(train_loss)
    accuracySLMLP.append(train_acc)
    test_accuracySLMLP.append(test_acc)

print(f"\nFinished Training, final accuracy on test data is : {test_accuracySLMLP[-1]*100:.2f}%")

In [None]:
## Plot the loss and accuracy
plt.xlabel("Epoch")
plt.ylabel("Normalized measure of loss/accuracy")
x_len = list(range(len(accuracySLMLP)))

plt.axis([0,max(x_len),0,1])
plt.title("Result from Single Layer MLP")
# Scale the loss by its maximum so it shares the 0-1 axis with the accuracies.
lossSLMLP = np.asarray(lossSLMLP)/max(lossSLMLP)
plt.plot(x_len, lossSLMLP, 'r', label='Training loss')
plt.plot(x_len, accuracySLMLP, 'b', label='Training acc')
plt.plot(x_len, test_accuracySLMLP, 'y', label='Test acc')
plt.grid(True)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.2)
# Place the annotation one step past the last plotted epoch. The position is
# derived from the history length instead of a hard-coded 31, so it stays
# beside the axes if the number of epochs changes.
plt.text(len(x_len), 0.7, f"Test Accuracy: {test_accuracySLMLP[-1]*100:.2f}%")
plt.show()


# Multi-Layer Perceptron with intermediate layer of size 100.

In [36]:
# Width of the hidden layer for the first two-layer model.
HIDDEN_SIZE = 100
modelMLP = MLP(VOCAB_SIZE, EMBED_DIM, HIDDEN_SIZE, NUM_CLASS).to(device)

## Define Training parameters
criterionMLP = nn.CrossEntropyLoss()
optimizerMLP = torch.optim.SGD(modelMLP.parameters(), lr=1.0)
# step_size=1 with gamma=0.9: the learning rate is multiplied by 0.9 on every scheduler step.
schedulerMLP = torch.optim.lr_scheduler.StepLR(optimizerMLP, 1, gamma=0.9)

In [None]:
### Training Loop for MLP with 100 dim. of intermediate layer.

# Per-epoch history; these lists are read by the plotting cells.
lossMLP = []
accuracyMLP = []
test_accuracyMLP = []

# N_EPOCHS comes from the single-layer training cell.
for epoch in range(N_EPOCHS):
    # One pass over the training loader, then an evaluation pass over the test loader.
    train_loss, train_acc = train_func(train_loader, modelMLP, criterionMLP, optimizerMLP, schedulerMLP)
    test_loss, test_acc = val_func(test_loader, modelMLP, criterionMLP)

    print(f'Epoch {epoch + 1} \tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Test. Acc: {test_acc*100:.2f}%')

    # test_loss is computed but neither printed nor stored.
    lossMLP.append(train_loss)
    accuracyMLP.append(train_acc)
    test_accuracyMLP.append(test_acc)
print(f"\nFinished Training, final accuracy on test data is : {test_accuracyMLP[-1]*100:.2f}%")

In [None]:
## Plot the loss and accuracy
plt.xlabel("Epoch")
plt.ylabel("Normalized measure of loss/accuracy")
x_len = list(range(len(accuracyMLP)))

plt.axis([0,max(x_len),0,1])
plt.title("Result from MLP with intermediate layer size 100")
# Scale the loss by its maximum so it shares the 0-1 axis with the accuracies.
lossMLP = np.asarray(lossMLP)/max(lossMLP)
plt.plot(x_len, lossMLP, 'r', label='Training loss')
plt.plot(x_len, accuracyMLP, 'b', label='Training acc')
plt.plot(x_len, test_accuracyMLP, 'y', label='Test acc')
plt.grid(True)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.2)
# Place the annotation one step past the last plotted epoch. The position is
# derived from the history length instead of a hard-coded 31, so it stays
# beside the axes if the number of epochs changes.
plt.text(len(x_len), 0.7, f"Test Accuracy: {test_accuracyMLP[-1]*100:.2f}%")
plt.show()


## Comparison between the two-layer MLP and the single-layer MLP.

In [None]:
## Comparison between two-layer MLP and a single layer MLP.

plt.xlabel("Epoch")
# Only test accuracies are drawn in this figure, so label the axis accordingly.
plt.ylabel("Test accuracy")
x_len = list(range(len(accuracyMLP)))

plt.axis([0,max(x_len),0,1])
plt.title("Comparison between two-layer MLP to a single layer MLP")
plt.plot(x_len, test_accuracySLMLP, 'b', label='Test Accuracy of Single Layer MLP')
plt.plot(x_len, test_accuracyMLP, 'g', label='Test Accuracy of Two Layer MLP')
plt.grid(True)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.2)
# Place the annotation one step past the last plotted epoch (derived from the
# history length instead of a hard-coded 31).
plt.text(len(x_len), 0.7, f"Test Accuracy Single Layer MLP: {test_accuracySLMLP[-1]*100:.2f}%\n\n Test Accuracy Two Layer MLP: {test_accuracyMLP[-1]*100:.2f}%")
plt.show()


From the plot, we can see that the test accuracy for the two-layer MLP is slightly lower than that for the single-layer MLP, possibly due to overfitting on the training data. We observe that with the two-layer MLP, the training loss decreases more than it does for the single-layer MLP, but this improvement doesn't carry over to the test data, indicating a problem of overfitting. Since the multi-layer model has significantly more parameters, it can fit the training data more closely to minimize the training loss, at the cost of generalization.

# Multi-Layer Perceptron with intermediate layer of size 200.

In [40]:
# Width of the hidden layer for the second two-layer model.
HIDDEN_SIZE2 = 200
modelMLP2 = MLP(VOCAB_SIZE, EMBED_DIM, HIDDEN_SIZE2, NUM_CLASS).to(device)

## Define Training parameters
criterionMLP2 = nn.CrossEntropyLoss()
optimizerMLP2 = torch.optim.SGD(modelMLP2.parameters(), lr=1.0)
# step_size=1 with gamma=0.9: the learning rate is multiplied by 0.9 on every scheduler step.
schedulerMLP2 = torch.optim.lr_scheduler.StepLR(optimizerMLP2, 1, gamma=0.9)

In [None]:
### Training Loop for MLP with intermediate layer size 200.

# Per-epoch history; these lists are read by the plotting cells.
lossMLP2 = []
accuracyMLP2 = []
test_accuracyMLP2 = []

# N_EPOCHS comes from the single-layer training cell.
for epoch in range(N_EPOCHS):
    # One pass over the training loader, then an evaluation pass over the test loader.
    train_loss, train_acc = train_func(train_loader, modelMLP2, criterionMLP2, optimizerMLP2, schedulerMLP2)
    test_loss, test_acc = val_func(test_loader, modelMLP2, criterionMLP2)

    print(f'Epoch {epoch + 1} \tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Test. Acc: {test_acc*100:.2f}%')

    # test_loss is computed but neither printed nor stored.
    lossMLP2.append(train_loss)
    accuracyMLP2.append(train_acc)
    test_accuracyMLP2.append(test_acc)
print(f"\nFinished Training, final accuracy on test data is : {test_accuracyMLP2[-1]*100:.2f}%")

In [None]:
## Plot the loss and accuracy
plt.xlabel("Epoch")
plt.ylabel("Normalized measure of loss/accuracy")
x_len = list(range(len(accuracyMLP2)))

plt.axis([0,max(x_len),0,1])
plt.title("Result from MLP with intermediate layer size 200")
# Scale the loss by its maximum so it shares the 0-1 axis with the accuracies.
lossMLP2 = np.asarray(lossMLP2)/max(lossMLP2)
plt.plot(x_len, lossMLP2, 'r', label='Training loss')
plt.plot(x_len, accuracyMLP2, 'b', label='Training acc')
plt.plot(x_len, test_accuracyMLP2, 'y', label='Test acc')
plt.grid(True)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.2)
# Place the annotation one step past the last plotted epoch. The position is
# derived from the history length instead of a hard-coded 31, so it stays
# beside the axes if the number of epochs changes.
plt.text(len(x_len), 0.7, f"Test Accuracy: {test_accuracyMLP2[-1]*100:.2f}%")
plt.show()


In [None]:
## Comparison between MLP with intermediate layer size 100 and 200.

plt.xlabel("Epoch")
# Only test accuracies are drawn in this figure, so label the axis accordingly.
plt.ylabel("Test accuracy")
x_len = list(range(len(accuracyMLP2)))

plt.axis([0,max(x_len),0,1])
plt.title("Comparison between MLP with intermediate layer size 100 and 200")
plt.plot(x_len, test_accuracyMLP, 'b', label='Test Accuracy of MLP with intermediate layer size 100')
plt.plot(x_len, test_accuracyMLP2, 'g', label='Test Accuracy of MLP with intermediate layer size 200')
plt.grid(True)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.2)
# Place the annotation one step past the last plotted epoch (derived from the
# history length instead of a hard-coded 31).
plt.text(len(x_len), 0.7, f"Test Accuracy of MLP with intermediate layer size 100: {test_accuracyMLP[-1]*100:.2f}%\n\n Test Accuracy MLP with intermediate layer size 200: {test_accuracyMLP2[-1]*100:.2f}%")
plt.show()

We observe a similar trend as in the comparison between the single-layer MLP and the dual-layer MLP. The MLP with an intermediate layer size of 200 fluctuated more than the one with a size of 100, possibly due to overfitting. Since the model with a 200-sized intermediate layer has greater flexibility to fit the training set, it attempts to reduce training errors excessively, leading to overfitting. That being said, there is no significant difference in the final test accuracies between the models with intermediate sizes of 100 and 200.

## Description of the task, dataset, and hardware used.

The task in this assignment was to classify movie reviews as positive or negative using a Bag of Words approach with multi-layer perceptrons.
We used the IMDB dataset from the torchtext library.
For training the models I used my personal M1 MacBook Air without any GPU. As the models were light, I was able to train them on my local machine in less than 10 minutes each.

## Test Accuracies

The models achieved following test accuracies on the dataset after training.
Single-layer MLP:
Double-layer MLP with intermediate layer of size 100: 
Double-layer MLP with intermediate layer of size 200: 

## Hyperparameters

I have declared all the hyperparameters as global variables, they are listed below too with values.

BATCH_SIZE = 32            ## Tuned to make the model run fast on my local machine.

EMBED_DIM = 32             ## Kept same as in the tutorial

VOCAB_SIZE = 8000          ## Kept same as in the tutorial

HIDDEN_SIZE = 100 or 200   ## as given in problem statement.

N_EPOCHS = 31              ## Manually tuned, as I observed that training was almost stagnant after about 25 epochs.

Learning Rate, lr=1.0      ## Kept same as in the tutorial

Scheduler Gamma, gamma = 0.9    ## Kept same as in the tutorial

## Error Analysis on the test set samples.

In [44]:
## make a function to get 5 incorrect predictions.
def get_two_incorrect_predictions(data, model, max_samples=5):
    """Collect the first misclassified samples found while iterating `data`.

    Despite its name, the function returns up to `max_samples` samples
    (5 by default).

    Args:
        data: Iterable of ``(text_indices, label, offsets)`` batches, in the
            order returned by ``generate_batch``.
        model: Callable as ``model(text_indices, offsets)`` returning one row
            of class scores per sample.
        max_samples: Stop as soon as this many wrong predictions are collected.

    Returns:
        List of ``(token_ids, actual_label, predicted_label)`` tuples, where
        ``token_ids`` is the slice of `text_indices` belonging to the sample
        and both labels are 0-dim tensors.
    """
    # Run the model where its parameters live; a plain callable without
    # parameters receives the batch tensors unchanged.
    try:
        model_device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        model_device = None

    incorrect_predictions = []
    if max_samples <= 0:
        return incorrect_predictions

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for text_indices, labels, offsets in data:
            if model_device is None:
                scores = model(text_indices, offsets)
            else:
                scores = model(text_indices.to(model_device), offsets.to(model_device))
            # Compare on the CPU, where the batch labels are.
            predicted = scores.argmax(dim=1).cpu()

            # Sample k spans [offsets[k], offsets[k+1]); the last sample runs
            # to the end of the flat id tensor.
            starts = offsets.tolist()
            ends = starts[1:] + [len(text_indices)]
            for sample_idx, (start, end) in enumerate(zip(starts, ends)):
                if predicted[sample_idx] != labels[sample_idx]:
                    incorrect_predictions.append(
                        (text_indices[start:end], labels[sample_idx], predicted[sample_idx])
                    )
                    if len(incorrect_predictions) >= max_samples:
                        return incorrect_predictions
    return incorrect_predictions


In [45]:
## Get up to five incorrect predictions from each of the three models.

incorrect_predictionsSLMLP = get_two_incorrect_predictions(test_loader, modelSLMLP)
incorrect_predictionsMLP = get_two_incorrect_predictions(test_loader, modelMLP)
incorrect_predictionsMLP2 = get_two_incorrect_predictions(test_loader, modelMLP2)

# Concatenated in model order: single-layer, hidden size 100, hidden size 200.
incorrect_predictions = incorrect_predictionsSLMLP + incorrect_predictionsMLP + incorrect_predictionsMLP2

In [None]:
## Convert the incorrect predictions to text and print them out.
print(" '0' means negative and '1' means positive.")
for text_indice, actual_label, predicted_label in incorrect_predictions:
    decoded_text = sp.decode(text_indice.tolist())
    # The labels are 0-dim tensors; int(...) makes them print as 0/1 instead of tensor(0)/tensor(1).
    print("Actual Label: {} | Predicted Label: {} | Text: {}".format(int(actual_label), int(predicted_label), decoded_text))

### Analysis of samples with wrong predictions.

The samples with incorrect predictions seem a bit unclear to me as well. For instance, in one sample, the user spends a long time discussing all the good aspects of the movie at the beginning, only pointing out the negative parts at the very end. This might explain why the model predicted it as a positive review.

Some labels seem ambiguous, as users might write mostly good things about the movie, but the label is still negative, as seen in some samples.

A common observation is that many reviews with incorrect predictions describe the movie in great detail rather than focusing on the actual user opinion, which could be the source of the error. A movie’s storyline might include various negative words, but the movie could still be good, or vice-versa.