# Individual Project (MSc) - Methods for Compressing Different Types of Neural Networks

This file has the full code for the project.

## Preparing the Dataset
The dataset was obtained from Kaggle, https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [None]:
# Mount the Google Drive account at /content/gdrive so the files stored there are accessible
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Load the raw IMDb reviews CSV (UTF-8 encoded) from Google Drive into a pandas DataFrame
df = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/IP_MSC/IMDB Dataset.csv', encoding='utf-8')

# transformation function: encode the 'positive' sentiment as one and anything else as zero
def lbl_transform(label):
    """Return 1 when ``label`` equals the string 'positive', otherwise 0."""
    return int(label == 'positive')

# Register tqdm with pandas so that `progress_apply` (apply with a progress bar) is available
# reference: https://www.kdnuggets.com/2022/09/progress-bars-python-tqdm-fun-profit.html
# https://towardsdatascience.com/progress-bars-in-python-and-pandas-f81954d33bae
tqdm.pandas()

# Build the numeric 'label' column by applying lbl_transform to every value of the 'sentiment' column
df['label'] = df['sentiment'].progress_apply(lbl_transform)

# Preview the first rows to check that the new column has been added
df.head()

### Cleaning up the Dataset
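The review text needs the following clean-up steps:
- Remove HTML markup, as well as square brackets and other punctuation symbols in the text
- Handle contracted words (apostrophes are replaced by spaces)
- Collapse extra white space
- Reduce words to their base form (the code below uses lemmatisation rather than stemming)
- Remove stopwords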

References used: 

https://towardsdatascience.com/nlp-building-text-cleanup-and-preprocessing-pipeline-eba4095245a0

https://www.kdnuggets.com/2018/03/text-data-preprocessing-walkthrough-python.html

https://lzone.de/examples/Python%20re.sub

https://medium.com/@yashj302/text-cleaning-using-regex-python-f1dded1ac5bd

In [None]:
# downloading the required NLTK resources
nltk.download('punkt')
# Newer NLTK releases load the tokenizer data from 'punkt_tab' instead of 'punkt';
# requesting both should keep word_tokenize working on either
# (TODO confirm against the installed NLTK version).
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# defining stopwords for English
# The corpus is reached through its fully qualified name because this assignment
# rebinds the module-level name `stopwords` to a plain set. With the original
# `stopwords.words(...)` form, running this cell a second time fails because a
# set has no `.words` attribute.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [None]:
# removing hyperlinks and URLs
def remove_links(text):
    """Strip http(s):// URLs and www.-prefixed links from ``text``."""
    link_pattern = re.compile(r'https?://\S+|www\.\S+')
    return link_pattern.sub('', text)

# removing HTML tags in text
def remove_html(text):
    """Delete every ``<...>`` tag from ``text``, keeping the content between tags."""
    tag_pattern = re.compile(r'<[^>]+>')
    return tag_pattern.sub('', text)

# removing regular punctuations
def remove_punctuations(text):
    """Replace each symbol in the punctuation class with a single space.

    The characters . , ! ? and - are not part of the class and are left untouched.
    """
    symbol_class = r'[\"\#\$\%\&\'\(\)\*\+\/\:\;\<\=\>\@\[\\\]\^\_\`\{\|\}\~]'
    return re.compile(symbol_class).sub(' ', text)

# add spacing around punctuation marks
def spacing4punctuations(text):
    """Surround each of . , ! ? - with spaces, then collapse whitespace runs to one space."""
    spaced = re.sub(r'([.,!?-])', r' \1 ', text)
    return re.sub(r'\s{2,}', ' ', spaced)

# removing any additional white spaces
def remove_spacing(text):
    """Collapse every run of space characters in ``text`` into a single space."""
    space_run = re.compile(r' +')
    return space_run.sub(' ', text)

# removing number in text
def remove_numbers(text):
    """Delete all digit sequences from ``text``."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# removing NONASCII characters
def remove_nonascii(text):
    """Drop every character outside the ASCII range (code points above 0x7F)."""
    non_ascii = re.compile(r'[^\x00-\x7f]')
    return non_ascii.sub(r'', text)

# removing emojis from text
def remove_emoji(text):
    """Remove every character that falls inside the code-point ranges listed below.

    Note that the last range (U+24C2 to U+1F251) is very wide and therefore also
    removes many non-emoji characters located between those two code points.
    """
    code_point_ranges = (
        '\U0001F600-\U0001F64F',  # emoticons
        '\U0001F300-\U0001F5FF',  # miscellaneous symbols and pictographs
        '\U0001F680-\U0001F6FF',  # transport and map symbols
        '\U0001F1E0-\U0001F1FF',  # regional indicator symbols (flags)
        '\U00002702-\U000027B0',  # dingbats
        '\U000024C2-\U0001F251',  # wide catch-all range
    )
    matcher = re.compile('[' + ''.join(code_point_ranges) + ']+', flags=re.UNICODE)
    return matcher.sub(r'', text)

# shortens repeated characters to two occurrences - e.g. 'heeellllooo' will be 'heelloo'
def auto_correct(text):
    """Collapse any run of two or more identical characters to exactly two.

    The pattern matches any character except a newline, so runs of spaces and
    punctuation are shortened as well.
    """
    return re.sub(r'(.)\1+', lambda run: run.group(1) * 2, text)

# tokenize the text
def tokenize(text):
    """Split ``text`` into tokens using ``word_tokenize`` and return them."""
    tokens = word_tokenize(text)
    return tokens

# removing stopwords
def remove_stopwords(text):
    """Return the tokens of ``text`` whose lower-cased form is not in ``stopwords``.

    ``text`` is an iterable of tokens; the result is a new list and the original
    casing of the surviving tokens is kept.
    """
    return list(filter(lambda token: token.lower() not in stopwords, text))

# lemmatize text
def lemmatize(text):
    """Lemmatize each token in ``text`` and drop any result that is a stopword."""
    wordnet = WordNetLemmatizer()
    base_forms = list(map(wordnet.lemmatize, text))
    # a second stopword pass catches tokens that only became stopwords after lemmatization
    return remove_stopwords(base_forms)

# function to cleaning up the text with all the function defined above
def txt_cleanup(text):
    """Run ``text`` through every cleaning step, in order, and return the cleaned string."""
    # the order matters: each step receives the output of the previous one
    cleaning_steps = (
        remove_links,
        remove_html,
        spacing4punctuations,
        remove_punctuations,
        remove_numbers,
        remove_spacing,
        remove_nonascii,
        remove_emoji,
        auto_correct,
    )
    cleaned = text
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned

# function to preprocess the text
def txt_processing(text):
    """Tokenize ``text``, remove stopwords, lemmatize, and re-join the tokens with spaces."""
    return ' '.join(lemmatize(remove_stopwords(tokenize(text))))

# Clean every raw review with txt_cleanup and store the result in the 'clean' column
df['clean'] = df['review'].progress_apply(txt_cleanup)

# Tokenize, filter and lemmatize the cleaned text; the result goes into the 'preprocessed' column
df['preprocessed'] = df['clean'].progress_apply(txt_processing)

# Preview the first rows to inspect the two new columns
df.head()

In [None]:
# Save the preprocessed text and its label as a new CSV file.
# Only these two columns are kept, so a single write is enough: writing the full
# dataframe to the same path first would immediately be overwritten by this call.
# NOTE(review): the file is written to the current working directory, while the
# loading cell further down reads IMDb_processed.csv from the Google Drive folder;
# confirm the file is copied there.
df[['preprocessed', 'label']].to_csv('./IMDb_processed.csv', index=False, header=True)

## Comprehensive Text Data Processing and Encoding

I have created a pipeline for processing the IMDB text data in this section. It prepares IMDB reviews by combining preprocessed reviews, segmenting the text into words, and building a vocabulary. Words are mapped to integers to encode reviews into numerical representations, and a padding token is introduced. The padding ensures that all reviews have the same length.

In [None]:
# load the processed CSV file (preprocessed text + label) from Google Drive
data = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/IP_MSC/IMDb_processed.csv')

# number of whitespace-separated tokens in each processed review
data['review_length'] = data['preprocessed'].map(lambda review: len(review.split()))

# histogram of the review lengths, drawn with the 'bmh' matplotlib style
with plt.style.context(style='bmh'):
    data['review_length'].hist(figsize=(15, 5))
    plt.title(label='Feature Length of Reviews in the IMDb dataset')
    plt.show()

In [None]:
# all processed reviews as an array of strings
processed_reviews = data['preprocessed'].values

# flatten the whole corpus into one list of tokens
words = ' '.join(processed_reviews).split()

# token frequencies over the whole corpus
counter = Counter(words)

# vocabulary ordered from the most frequent token to the least frequent one
vocab = [word for word, _ in counter.most_common()]

# NOTE: despite its name, `convert2integer` maps integer index -> word.
# Real tokens start at index 1 so that index 0 stays free for the padding token.
convert2integer = {index: word for index, word in enumerate(vocab, start=1)}
convert2integer[0] = '<PAD>'

# NOTE: despite its name, `convert2word` maps word -> integer index (inverse of the mapping above)
convert2word = {word: index for index, word in convert2integer.items()}

# encode every review as the list of integer indices of its tokens
reviews_encoded = [
    [convert2word[token] for token in review.split()]
    for review in tqdm(processed_reviews)
]

In [None]:
# Padding sequences function
# the encoded reviews do not share one length, so each is padded or trimmed to a fixed sequence length
def pad_features(processed_reviews, pad_id, sequence_length=128):
    """Return an integer matrix with one fixed-length row per encoded review.

    Args:
        processed_reviews: sequence of encoded reviews (each a sequence of integer ids).
        pad_id: value used to fill the positions after the end of a short review.
        sequence_length: number of columns; longer reviews are cut to this length.

    Returns:
        NumPy array of shape (len(processed_reviews), sequence_length) with dtype int.
        Reviews are left-aligned, so padding is placed at the end of each row.
    """
    padded = np.full((len(processed_reviews), sequence_length), pad_id, dtype=int)
    for row_idx, encoded in enumerate(processed_reviews):
        # keep at most `sequence_length` ids, then write them at the start of the row
        clipped = np.array(encoded)[:sequence_length]
        padded[row_idx, :len(clipped)] = clipped
    return padded

# fixed length (in tokens) of every padded review
sequence_length = 128

# index that represents the padding token in the word-to-index mapping
pad_token_id = convert2word['<PAD>']

# pad or trim every encoded review to `sequence_length`
features = pad_features(reviews_encoded, pad_id=pad_token_id, sequence_length=sequence_length)

# sanity checks: one row per review, and each row has exactly `sequence_length` columns
assert features.shape[0] == len(reviews_encoded)
assert features[0].shape[0] == sequence_length

# show the top-left 10x10 corner of the feature matrix for inspection
print(features[:10, :10])

In [None]:
# labels column as a NumPy array
labels = data['label'].to_numpy()

## Splitting the Dataset to Training and Testing

We divide the dataset into three parts: training, validation, and test sets. The first 70% of the data is allocated for training; of the remaining 30%, the first 30% (about 9% of the whole dataset) becomes the validation set and the rest (about 21%) becomes the test set. We separate the features and labels accordingly, print their shapes, and display the class distribution within each set to understand how the binary classes are balanced. This process is crucial for reliable model development and assessment.

In [None]:
# split proportions
train_size = 0.7  # fraction of the whole dataset used for training
val_size = 0.3    # fraction of the remaining data used for validation (the rest is the test set)

# training set: the first `split_id` rows; everything after it is the remainder
split_id = int(train_size * len(features))
X_train, Y_train = features[:split_id], labels[:split_id]
X_remainder, Y_remainder = features[split_id:], labels[split_id:]

# validation set: the first `split_val_id` rows of the remainder; the test set is what is left
split_val_id = int(val_size * len(X_remainder))
X_validation, Y_validation = X_remainder[:split_val_id], Y_remainder[:split_val_id]
X_test, Y_test = X_remainder[split_val_id:], Y_remainder[split_val_id:]

# shapes of the three feature matrices
print('Feature Shapes:')
print('===============')
print('Training set: {}'.format(X_train.shape))
print('Validation set: {}'.format(X_validation.shape))
print('Testing set: {}'.format(X_test.shape))

# number of samples of each class in every split
for heading, split_labels in (
    ("Class Distribution in Training Set:", Y_train),
    ("\nClass Distribution in Validation Set:", Y_validation),
    ("\nClass Distribution in Test Set:", Y_test),
):
    negatives = int((split_labels == 0).sum())
    positives = int((split_labels == 1).sum())
    print(heading)
    print(f"Class 0: {negatives}, Class 1: {positives}")

Now, we use a **batch size of 64** for efficient training and build three PyTorch TensorDatasets: **training_set**, **validation_set**, and **testing_set**. Three DataLoader instances are initialised with this batch size, with shuffling enabled for each of them.

The code then draws a sample batch from the training loader to provide insight into the structure of the processed dataset.

In [None]:
# number of samples per batch
batch_size = 64


def _to_tensor_dataset(inputs, targets):
    # wrap a (features, labels) pair of NumPy arrays as a TensorDataset
    return TensorDataset(torch.from_numpy(inputs), torch.from_numpy(targets))


# one tensor dataset per split
training_set = _to_tensor_dataset(X_train, Y_train)
validation_set = _to_tensor_dataset(X_validation, Y_validation)
testing_set = _to_tensor_dataset(X_test, Y_test)

# one data loader per split; all three shuffle their samples
loader_options = {'shuffle': True, 'batch_size': batch_size}
training_loader = DataLoader(training_set, **loader_options)
validation_loader = DataLoader(validation_set, **loader_options)
testing_loader = DataLoader(testing_set, **loader_options)

# draw one batch from the training loader for inspection
dataiteration = iter(training_loader)
x, y = next(dataiteration)

# show the size and content of the sampled inputs and labels
print('Sample Batch Information:')
print('-------------------------')
print('Input Size: ', x.size())   # size of the input batch
print('Input Batch:\n', x)        # content of the input batch
print()
print('Label Size: ', y.size())   # size of the label batch
print('Label Batch:\n', y)        # content of the label batch

## LSTM-based model for Sentiment Analysis



In [None]:
# Pick the training device: the GPU when PyTorch can use CUDA, otherwise the CPU
# ref: https://stackoverflow.com/questions/50560395/how-to-install-cuda-in-google-colab-gpus
cuda_ready = torch.cuda.is_available()  # True if PyTorch can use a GPU, otherwise False
device = torch.device("cuda") if cuda_ready else torch.device("cpu")
print(device)

In [None]:
# LSTM Model architecture for sentiment analysis
# reference used: https://galhever.medium.com/sentiment-analysis-with-pytorch-part-3-cnn-model-7bb30712abd7
class Sentiment_LSTM(nn.Module):
    """Embedding -> multi-layer LSTM -> dropout -> linear -> sigmoid classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        output_size: number of output units of the final linear layer.
        hidden_size: number of hidden units in each LSTM layer.
        embedding_size: dimensionality of the embedding vectors.
        n_layers: number of stacked LSTM layers.
        dropout: dropout rate passed to ``nn.LSTM``. The dropout layer placed
            before the linear layer uses a fixed rate of 0.3.
    """

    def __init__(self, vocab_size, output_size, hidden_size=128, embedding_size=400, n_layers=2, dropout=0.2):
        super().__init__()

        # token id -> dense vector
        self.embedding = nn.Embedding(vocab_size, embedding_size)

        # batch_first=True: inputs and outputs are laid out as (batch, sequence, features)
        self.lstm = nn.LSTM(
            embedding_size,
            hidden_size,
            n_layers,
            dropout=dropout,
            batch_first=True,
        )

        # regularization applied to the LSTM output before classification
        self.dropout = nn.Dropout(0.3)

        # maps the LSTM output to the requested number of output units
        self.fc = nn.Linear(hidden_size, output_size)

        # squashes the output into the (0, 1) range for binary classification
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs of shape (batch, output_size) for a batch of token ids."""
        # the embedding layer needs integer ('long') indices
        embedded = self.embedding(x.long())

        # only the per-step outputs are used; the final hidden/cell states are discarded
        lstm_out, _ = self.lstm(embedded)

        # keep the output at the last position of every sequence
        last_step = lstm_out[:, -1, :]

        # dropout, then the linear layer, then the sigmoid activation
        scores = self.fc(self.dropout(last_step))
        return self.sigmoid(scores)

In [None]:
# Model hyperparameters
vocab_size = len(convert2word)  # size of the vocabulary: number of entries in the word-to-index mapping `convert2word` (this includes the '<PAD>' token)
output_size = 1 # size of the output: a single unit for binary classification (positive or negative sentiment)
embedding_size = 256 # dimensionality of the embedding vectors used to represent each token in the input
hidden_size = 512 # number of hidden units in each LSTM layer
n_layers = 2 # number of stacked layers in the LSTM
dropout = 0.25 # dropout rate passed to the LSTM; regularization that randomly drops units during training to reduce overfitting

In [None]:
# Build the model from the hyperparameters defined above and show its layers
model = Sentiment_LSTM(
    vocab_size=vocab_size,
    output_size=output_size,
    hidden_size=hidden_size,
    embedding_size=embedding_size,
    n_layers=n_layers,
    dropout=dropout,
)
print(model)

In [None]:
# Training configuration
lr = 0.001  # Learning rate
criterion = nn.BCELoss()  # Loss function: Binary Cross Entropy Loss for binary classification
optim = Adam(model.parameters(), lr=lr)  # Optimizer: Adam optimizer with the specified learning rate
grad_clip = 5  # Gradient clipping threshold to prevent large gradients
epochs = 10  # Number of times the entire dataset is passed through the model during training
print_every = 1  # Frequency of printing training progress information (every 'print_every' epochs)
history = {
    'training_loss': [],          # List to store training loss over epochs
    'training_accuracy': [],      # List to store training accuracy over epochs
    'validation_loss': [],        # List to store validation loss over epochs
    'validation_accuracy': [],    # List to store validation accuracy over epochs
    'epochs': epochs              # Total number of epochs
}
es_limit = 5  # Early stopping limit; Maximum consecutive epochs without improvement in validation loss

# Training loop

# Move the model to the specified device (GPU or CPU)
model = model.to(device)

# Create a tqdm progress bar over the range of epochs
# - 'position=0': Display the progress bar at the top
# - 'desc='Training'': Description to display in the progress bar
# - 'leave=True': Leave the progress bar displayed after completion
epochloop = tqdm(range(epochs), position=0, desc='Training', leave=True)

# Early stop trigger
es_trigger = 0

# Minimum validation loss initialized to positive infinity
# Used to track the lowest validation loss encountered during training
# (`np.inf` is used because the `np.Inf` alias was removed in NumPy 2.0)
validation_loss_min = np.inf

In [None]:
# Training and Validation Loop
# reference used: https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel
for e in epochloop:

    # ---- Training phase ----
    # train() puts layers such as dropout into their training behaviour
    model.train()

    # Running sums of the per-batch loss and accuracy for this epoch
    training_loss = 0
    training_accuracy = 0

    # `batch_idx` is used instead of `id` so the built-in id() is not shadowed at module level
    for batch_idx, (feature, target) in enumerate(training_loader):
        # Add epoch meta info
        epochloop.set_postfix_str(f'Training batch {batch_idx}/{len(training_loader)}')

        # Move data to the specified device (GPU or CPU)
        feature, target = feature.to(device), target.to(device)

        # Reset optimizer gradients
        optim.zero_grad()

        # Forward pass
        out = model(feature)

        # Drop only the trailing output dimension: (batch, 1) -> (batch,).
        # A bare squeeze() would also drop the batch dimension of a size-1 batch,
        # which makes BCELoss fail because input and target shapes differ.
        probs = out.squeeze(1)

        # Threshold at 0.5 as one tensor operation instead of a Python loop over elements
        predicted = (probs > 0.5).long()
        acc = (predicted == target).float().mean()
        training_accuracy += acc.item()

        # Calculate loss and perform backpropagation
        loss = criterion(probs, target.float())
        training_loss += loss.item()
        loss.backward()

        # Clip gradients to prevent exploding gradients
        nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        # Update the model parameters
        optim.step()

        # Free some memory
        del feature, target, predicted

    # Per-epoch training metrics: mean of the per-batch values
    epoch_training_loss = training_loss / len(training_loader)
    epoch_training_accuracy = training_accuracy / len(training_loader)
    history['training_loss'].append(epoch_training_loss)
    history['training_accuracy'].append(epoch_training_accuracy)

    # ---- Validation phase ----
    # eval() switches dropout off; gradient tracking is disabled by torch.no_grad() below
    model.eval()

    # Running sums of the per-batch loss and accuracy for this epoch
    validation_loss = 0
    validation_accuracy = 0

    with torch.no_grad():
        for batch_idx, (feature, target) in enumerate(validation_loader):
            # Add epoch meta info
            epochloop.set_postfix_str(f'Validation batch {batch_idx}/{len(validation_loader)}')

            # Move data to the specified device (GPU or CPU)
            feature, target = feature.to(device), target.to(device)

            # Forward pass, flattened to (batch,) as in the training phase
            out = model(feature)
            probs = out.squeeze(1)

            # Calculate accuracy
            predicted = (probs > 0.5).long()
            acc = (predicted == target).float().mean()
            validation_accuracy += acc.item()

            # Calculate loss
            loss = criterion(probs, target.float())
            validation_loss += loss.item()

            # Free some memory
            del feature, target, predicted

    # Per-epoch validation metrics: mean of the per-batch values
    epoch_validation_loss = validation_loss / len(validation_loader)
    epoch_validation_accuracy = validation_accuracy / len(validation_loader)
    history['validation_loss'].append(epoch_validation_loss)
    history['validation_accuracy'].append(epoch_validation_accuracy)

    # Reset model to training mode
    model.train()

    # Add epoch meta info
    epochloop.set_postfix_str(f'Validation Loss: {epoch_validation_loss:.3f} | Validation Accuracy: {epoch_validation_accuracy:.3f}')

    # Print epoch information.
    # The bar is not advanced manually here: iterating over `epochloop` already
    # advances it once per epoch, so an extra update() would count each epoch twice.
    if (e+1) % print_every == 0:
        epochloop.write(f'Epoch {e+1}/{epochs} | Training Loss: {epoch_training_loss:.3f} Training Accuracy: {epoch_training_accuracy:.3f} | Val Loss: {epoch_validation_loss:.3f} Val Acc: {epoch_validation_accuracy:.3f}')

    # Save the model whenever the validation loss is at least as good as the best so far
    if epoch_validation_loss <= validation_loss_min:
        torch.save(model.state_dict(), './sentiment_lstm.pt')
        validation_loss_min = epoch_validation_loss
        es_trigger = 0
    else:
        epochloop.write(f'[WARNING] Validation loss did not improve ({validation_loss_min:.3f} --> {epoch_validation_loss:.3f})')
        es_trigger += 1

    # Force early stop after `es_limit` consecutive epochs without improvement
    if es_trigger >= es_limit:
        epochloop.write(f'Early stopping at Epoch-{e+1}')
        # Record how many epochs actually ran so the plots use the right x-axis length
        history['epochs'] = e+1
        break

In [None]:
# Plot the training and validation accuracy recorded for each epoch
epoch_axis = range(history['epochs'])
plt.figure(figsize=(6, 8))
for history_key, curve_label in (
    ('training_accuracy', 'Training Accuracy'),
    ('validation_accuracy', 'Validation Accuracy'),
):
    plt.plot(epoch_axis, history[history_key], label=curve_label)
plt.legend()
plt.show()

In [None]:
# Evaluate the model on the test set, i.e. data that was not used for training
# NOTE(review): this evaluates the weights currently held by `model` (those of the
# last training epoch), not the best checkpoint written to ./sentiment_lstm.pt by
# the training loop - confirm this is intended.
model.eval()

# Running sums of the per-batch loss and accuracy
test_loss = 0
test_acc = 0

# Lists to store true target values and predicted values for later evaluation
all_target = []
all_predicted = []

# tqdm progress bar for the test loop
testloop = tqdm(testing_loader, leave=True, desc='Inference')
with torch.no_grad():
    for feature, target in testloop:
        # Move data to the specified device (GPU or CPU)
        feature, target = feature.to(device), target.to(device)

        # Forward pass
        out = model(feature)

        # Drop only the trailing output dimension: (batch, 1) -> (batch,).
        # A bare squeeze() would also drop the batch dimension of a size-1 batch,
        # which makes BCELoss fail because input and target shapes differ.
        probs = out.squeeze(1)

        # Threshold at 0.5 as one tensor operation instead of a Python loop over elements
        predicted = (probs > 0.5).long()
        acc = (predicted == target).float().mean()
        test_acc += acc.item()

        # Calculate loss
        loss = criterion(probs, target.float())
        test_loss += loss.item()

        # Extend lists with true and predicted values
        all_target.extend(target.cpu().numpy())
        all_predicted.extend(predicted.cpu().numpy())

    # Print overall accuracy and loss for the test set (mean of the per-batch values)
    print(f'Accuracy: {test_acc/len(testing_loader):.4f}, Loss: {test_loss/len(testing_loader):.4f}')

# Print out classification report.
# scikit-learn expects (y_true, y_pred); passing them the other way round swaps
# precision and recall in the report.
print(classification_report(all_target, all_predicted))

In [None]:
# Plot confusion matrix
# scikit-learn expects (y_true, y_pred): rows are the true classes and columns the
# predicted classes. Passing them the other way round transposes the matrix.
cm = confusion_matrix(all_target, all_predicted)
plt.figure(figsize=(5,5))
sns.heatmap(cm, annot=True, fmt='g')
# the heatmap draws matrix rows on the y-axis and matrix columns on the x-axis
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix')
plt.show()