# Teacher's Assignment - Extra Credit #2

***Author:*** *Ofir Paz* $\qquad$ ***Version:*** *22.07.2024* $\qquad$ ***Course:*** *22961 - Deep Learning* \
***Extra Assignment Course:*** *20999 - Extra Assignment 4*

Welcome to the second question of the extra assignment #2 as part of the course *Deep Learning*. \
In this question we will train an auto encoder to denoise a language dataset and afterwards use transfer learning on the trained model for a classification task.

## Imports

In [1]:
import random
import torch  # pytorch.
import torch.nn as nn  # neural network module.
import torch.nn.functional as F  # neural network functional module.
from torch.utils.data import DataLoader, Dataset  # data handling.
import torchtext; torchtext.disable_torchtext_deprecation_warning()
from torchtext.vocab import build_vocab_from_iterator  # vocabulary builder.
import matplotlib.pyplot as plt  # plotting module.
import datasets as ds  # public dataset module.
from base_model import BaseModel  # base model class.

# Type hinting.
from torch import Tensor
from torchtext.vocab import Vocab
from typing import Tuple

## Adding Noise

To add noise to a language dataset, I thought of Four options:
1. Make duplicates of random words that appear in the sentence. For example: 
$$\text{"The princess is beautiful"} \rightarrow \text{"The princess is is beautiful"}$$
2. Add a random word somewhere in the sentence. For example:
$$\text{"The princess is beautiful"} \rightarrow \text{"The house princess is beautiful"}$$
3. Changing the order of words in the sentence. For example:
$$\text{"The princess is beautiful"} \rightarrow \text{"The beautiful is princess"}$$
4. Changing a random token in the sentence to the unknown token. For example:
$$\text{"The princess is beautiful"} \rightarrow \text{"The <unk> is beautiful"}$$

The simplest option and the one that seems like it would work best in language processing is option 4, so I will implement that only.

In [2]:
def add_noise(sentence_tokens: Tensor, vocab: Vocab, noise_term: float = 0.1) -> Tensor:
    """
    Add noise to the sentence tokens.

    Args:
        sentence_tokens (Tensor): Sentence tokens.
        vocab (Vocab): Vocabulary.
        noise_term (float): Noise term.

    Returns:
        Tensor: Noisy sentence tokens.
    """
    return sentence_tokens.clone().detach().apply_(lambda token: token if torch.rand(1) > noise_term 
                                                   else vocab["<unk>"])

## The Denoiser Architecture

In [3]:
class DenoiserAutoEncoder(BaseModel):
    def __init__(self, vocab: Vocab, embed_dim: int, hidden_dim: int, 
                 encoder_num_layers: int = 1, **kwargs) -> None:
        super().__init__(**kwargs)
        self.encoder = DenoiserEncoder(len(vocab), embed_dim, hidden_dim, encoder_num_layers)
        self.decoder = DenoiserDecoder(vocab, embed_dim, hidden_dim)

    def forward(self, tokens: Tensor) -> Tensor:
        if len(tokens.size()) == 1:
            tokens = tokens.unsqueeze(0)
        elif tokens.size(0) > 1:
            raise ValueError("Can only process one sentence at a time.")
        context = self.encoder(tokens)
        output = self.decoder(context, tokens.size(1))
        return output


class DenoiserEncoder(nn.Module):
    def __init__(self, vocab_size: int, embed_dim: int, hidden_dim: int, num_layers: int = 1) -> None:
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.RNN_stack = nn.LSTM(embed_dim, hidden_dim // 2, num_layers, batch_first=True, bidirectional=True)

    def forward(self, tokens: Tensor) -> Tensor:
        embedded = self.embedding(tokens)
        output, _ = self.RNN_stack(embedded)
        context = output[:, -1, :]
        return context


class DenoiserDecoder(nn.Module):
    def __init__(self, vocab: Vocab, embed_dim: int, hidden_dim: int) -> None:
        super().__init__()
        self.vocab = vocab
        self.embedding = nn.Embedding(len(vocab), embed_dim)
        self.RNNCell = DecoderRNN(len(vocab), embed_dim, hidden_dim)

    def forward(self, context: Tensor, num_tokens: int) -> Tensor:
        self.RNNCell.hidden_state = context
        output = []
        previous_token = torch.zeros(self.vocab["<sos>"], dtype=torch.long, 
                                     device=self.embedding.weight.device)  # Default
        for _ in range(num_tokens):
            embedded_token = self.embedding(previous_token)
            logits = self.RNNCell(embedded_token)
            previous_token = torch.argmax(logits, dim=1)
            output.append(logits.squeeze(0))
        return torch.stack(output)


class DecoderRNN(nn.Module):
    def __init__(self, vocab_size: int, embed_dim: int, hidden_dim: int) -> None:
        super().__init__()
        self.hidden_state = torch.zeros(1, hidden_dim)  # Assume batch size of 1.
        self.RNN_cell = nn.RNNCell(embed_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, vocab_size)
    
    def forward(self, embedded_token: Tensor) -> Tensor:
        self.hidden_state = self.RNN_cell(embedded_token, self.hidden_state)
        output = self.fc(self.hidden_state)
        return output

## Loading & Pre-Processing

In [4]:
# Load a dataset to try to fit on.
full_dataset: ds.DatasetDict = ds.load_dataset("glue", "sst2", keep_in_memory=True)  # type: ignore

big_train_dataset = full_dataset["train"]
big_validation_dataset = full_dataset["validation"]
train_dataset = big_train_dataset.select(range(500))  # small dataset for testing.
validation_dataset = big_validation_dataset.select(range(100))  # small dataset for testing.

In [5]:
# Create the vocabulary.
train_sentence_list = train_dataset["sentence"]
vocab = build_vocab_from_iterator(map(str.split, train_sentence_list), specials=["<unk>", "<sos>"])
vocab.set_default_index(vocab["<unk>"])

Now, after loading the datasets and creating the vocabulary we can show examples of adding noise to the data.

In [6]:
def get_random_normal_and_noisy_sentence(dataset: ds.Dataset, vocab: Vocab) -> Tuple[str, str]:
    random_sentence = random.choice(dataset)["sentence"]
    random_sentence_tokens = torch.tensor(vocab(random_sentence.split()), dtype=torch.long)
    random_sentence_noisy_tokens = list(add_noise(random_sentence_tokens, vocab))
    random_sentence_noisy = " ".join(vocab.lookup_tokens(random_sentence_noisy_tokens))

    return random_sentence, random_sentence_noisy

random_sentence, random_sentence_noisy = get_random_normal_and_noisy_sentence(train_dataset, vocab)
print(f"Original sentence: \"{random_sentence}\"")
print(f"Noisy sentence: \"{random_sentence_noisy}\"")  

Original sentence: "the package in which this fascinating -- and timely -- content comes wrapped is disappointingly generic . "
Noisy sentence: "the package <unk> which this fascinating -- and timely -- content comes wrapped is disappointingly generic ."


In [7]:
# Create the SST-2 dataset.
class NoisySST2Dataset(Dataset):
    def __init__(self, dataset: ds.Dataset, vocab: Vocab, noise_term: float = 0.1) -> None:
        self.sentences = list(map(lambda seq: torch.tensor(vocab(seq.split()), dtype=torch.long), 
                                  dataset["sentence"]))
        self.vocab = vocab
        self.noise_term = noise_term

    def __len__(self) -> int:
        return len(self.sentences)

    def __getitem__(self, idx) -> Tuple[Tensor, Tensor]:
        return add_noise(self.sentences[idx], self.vocab, self.noise_term), self.sentences[idx]

In [8]:
# Create the dataloaders.
train_set = NoisySST2Dataset(train_dataset, vocab)
validation_set = NoisySST2Dataset(validation_dataset, vocab)

train_loader = DataLoader(train_set, batch_size=1, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=1, shuffle=False)

In [9]:
# Create the model and train the denoiser autoencoder.
denoiser = DenoiserAutoEncoder(vocab, 64, 128, 1, job_type="single-sentence-autoencoder")
denoiser.fit(train_loader, validation_loader, num_epochs=7, lr=0.001)
denoiser.fit(train_loader, validation_loader, num_epochs=6, lr=0.0001)

Using CUDA for training.
[epoch: 01/07] [Train loss: 7.176482  Train Accuracy: 0.039]  [Val loss: 7.165845  Val Accuracy: 0.042]
[epoch: 02/07] [Train loss: 6.432540  Train Accuracy: 0.055]  [Val loss: 7.144907  Val Accuracy: 0.042]
[epoch: 03/07] [Train loss: 6.054338  Train Accuracy: 0.063]  [Val loss: 7.339632  Val Accuracy: 0.058]
[epoch: 04/07] [Train loss: 5.726097  Train Accuracy: 0.078]  [Val loss: 7.497063  Val Accuracy: 0.054]
[epoch: 05/07] [Train loss: 5.408067  Train Accuracy: 0.094]  [Val loss: 7.604751  Val Accuracy: 0.077]
[epoch: 06/07] [Train loss: 5.106171  Train Accuracy: 0.118]  [Val loss: 7.662863  Val Accuracy: 0.077]
[epoch: 07/07] [Train loss: 4.809315  Train Accuracy: 0.147]  [Val loss: 7.881088  Val Accuracy: 0.064]
Using CUDA for training.
[epoch: 08/13] [Train loss: 4.343102  Train Accuracy: 0.212]  [Val loss: 7.638634  Val Accuracy: 0.075]
[epoch: 09/13] [Train loss: 4.223228  Train Accuracy: 0.227]  [Val loss: 7.681630  Val Accuracy: 0.077]
[epoch: 10/13]

In [10]:
# Example of using the model.
noisy_sentence, sentence = train_set[3]

# Denoise the sentence.
with torch.no_grad():
    denoiser.eval()
    denoised_sentence_logits = denoiser(noisy_sentence.unsqueeze(0))
    denoised_sentence = torch.argmax(denoised_sentence_logits, dim=1)
denoiser.train()

string_sentence = " ".join(vocab.lookup_tokens(list(sentence)))
string_noisy_sentence = " ".join(vocab.lookup_tokens(list(noisy_sentence)))
string_denoised_sentence = " ".join(vocab.lookup_tokens(list(denoised_sentence)))

# Print the sentences.
print(f"Original sentence:\n\t\"{string_sentence}\"\n")
print(f"Noisy sentence:\n\t\"{string_noisy_sentence}\"\n")
print(f"Denoised sentence:\n\t\"{string_denoised_sentence}\"\n")

Original sentence:
	"remains utterly satisfied to remain the same throughout"

Noisy sentence:
	"remains <unk> satisfied to remain the same throughout"

Denoised sentence:
	"ugly n't and to and . , ."



## Transfer Learning

We will now use the trained model for a classification task. We will take the encoder part of the auto-denoiser, and we will add a linear layer on top of the encoder to classify the data.
We are basically going to classify from the encoder's representation of the data that was learned during the denoising task.

In [11]:
# Create the classification dataset.
class SST2Dataset(Dataset):
    def __init__(self, dataset: ds.Dataset, vocab: Vocab) -> None:
        self.sentences = list(map(lambda seq: torch.tensor(vocab(seq.split())), dataset["sentence"]))
        self.labels = torch.tensor(dataset["label"], dtype=torch.long)

    def __len__(self) -> int:
        return len(self.sentences)

    def __getitem__(self, idx) -> Tuple[Tensor, Tensor]:
        return self.sentences[idx], self.labels[idx]

In [12]:
classification_train_set = SST2Dataset(train_dataset, vocab)
classification_validation_set = SST2Dataset(validation_dataset, vocab)
classification_train_loader = DataLoader(classification_train_set, batch_size=1, shuffle=True)
classification_validation_loader = DataLoader(classification_validation_set, batch_size=1, shuffle=False)

In [13]:
# Create the classification model.
class TransferClassifer(BaseModel):
    def __init__(self, encoder: nn.Module, hidden_dim: int, num_classes: int, **kwargs) -> None:
        super().__init__(**kwargs)
        self.encoder = encoder
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, tokens: Tensor) -> Tensor:
        if len(tokens.size()) == 1:
            tokens = tokens.unsqueeze(0)
        elif tokens.size(0) > 1:
            raise ValueError("Can only process one sentence at a time.")
        self.encoder.eval()
        with torch.no_grad():
            context = self.encoder(tokens)
        output = self.fc(context)
        return output

In [14]:
# Create and train the model.
transfer_classifer = TransferClassifer(denoiser.encoder, 128, 2, job_type="classification")
transfer_classifer.fit(classification_train_loader, classification_validation_loader, 
                         num_epochs=10, lr=0.0025, wd=0.0001, try_cuda=False)
transfer_classifer.fit(classification_train_loader, classification_validation_loader, 
                         num_epochs=5, lr=0.0005, wd=0.0001, try_cuda=False)

Using CPU for training.
[epoch: 01/10] [Train loss: 0.733546  Train Accuracy: 0.480]  [Val loss: 0.762116  Val Accuracy: 0.520]
[epoch: 02/10] [Train loss: 0.696875  Train Accuracy: 0.576]  [Val loss: 0.845932  Val Accuracy: 0.520]
[epoch: 03/10] [Train loss: 0.677201  Train Accuracy: 0.556]  [Val loss: 0.740179  Val Accuracy: 0.490]
[epoch: 04/10] [Train loss: 0.668494  Train Accuracy: 0.612]  [Val loss: 0.888941  Val Accuracy: 0.490]
[epoch: 05/10] [Train loss: 0.660632  Train Accuracy: 0.588]  [Val loss: 0.744503  Val Accuracy: 0.490]
[epoch: 06/10] [Train loss: 0.647558  Train Accuracy: 0.632]  [Val loss: 0.740986  Val Accuracy: 0.450]
[epoch: 07/10] [Train loss: 0.647347  Train Accuracy: 0.626]  [Val loss: 0.723115  Val Accuracy: 0.500]
[epoch: 08/10] [Train loss: 0.638600  Train Accuracy: 0.640]  [Val loss: 0.751004  Val Accuracy: 0.510]
[epoch: 09/10] [Train loss: 0.630565  Train Accuracy: 0.656]  [Val loss: 0.834213  Val Accuracy: 0.480]
[epoch: 10/10] [Train loss: 0.635711  Tr