# HW4P1: Language Modelling

Welcome to the final part 1 hw of this course. This is the only part 1 in which you have PyTorch training (Yay). You will be working on training language models and evaluating them on the task of prediction and generation.<br>
The model you will code in this HW is very similar to the Speller module from HW4P2.

# Get modules and datasets

In [None]:
!pip install torchsummaryX==1.1.0
!pip install wandb --quiet
!pip install matplotlib

!pip install transformers -U
!pip install tokenizers

In [None]:
# TODO: Import drive if you are using Colab

In [None]:
pwd

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
ls drive/MyDrive/IDL.Fall2024/HWs/HW4/HW4P1/handout/hw4p1_handout

In [None]:
pwd

In [None]:
import sys
path = "YOUR_PATH"# "YOUR_PATH" # TODO: Add path to handout. For example D:/IDL/hw4/hw4p1_handout/handout
sys.path.append(path)
%cd {path}

# Imports

In [None]:
%matplotlib inline

import torch

import os

import time
import math
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import torchsummaryX
import torch.nn as nn
import torch.nn.functional as F
import math
import gc
import glob
import wandb
import yaml
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import get_linear_schedule_with_warmup


# Importing necessary modules from hw4
from hw4p1.tests_hw4 import get_prediction_nll, make_generation_text

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device: ", DEVICE)


# Load datasets

In [None]:
# Define the vocabulary. Try printing it to see the token order.
# A token's position in VOCAB is its integer id.
VOCAB = [
   "<sos>", "<eos>",
    "A",   "B",    "C",    "D",
    "E",   "F",    "G",    "H",
    "I",   "J",    "K",    "L",
    "M",   "N",    "O",    "P",
    "Q",   "R",    "S",    "T",
    "U",   "V",    "W",    "X",
    "Y",   "Z",    "'",    " ", "<pad>"
]

# Reverse lookup: token string -> integer id.
VOCAB_MAP = {token: index for index, token in enumerate(VOCAB)}
# <sos> and <eos> are already part of the vocabulary here.
# In real life you would add them explicitly if they were not provided.
PAD_TOKEN = VOCAB_MAP["<pad>"]
SOS_TOKEN = VOCAB_MAP["<sos>"]
EOS_TOKEN = VOCAB_MAP["<eos>"]

print(f"Length of Vocabulary    : {len(VOCAB)}")
print(f"VOCAB                   : {VOCAB}")
print(f"PAD_TOKEN               : {PAD_TOKEN}")
print(f"SOS_TOKEN               : {SOS_TOKEN}")
print(f"EOS_TOKEN               : {EOS_TOKEN}")




df = pd.read_csv("dataset/train-clean-100/transcripts.csv")

df.head()


In [None]:
# Strip any literal "<sos>"/"<eos>" markers from each transcript, split the
# remaining text into single characters, then map every character to its
# integer id through VOCAB_MAP. No start/end tokens are added here.
transcripts = [
    np.array(list(text.replace("<sos>", "").replace("<eos>", "")))
    for text in df['transcripts']
]
dataset = [[VOCAB_MAP[char] for char in transcript] for transcript in transcripts]

# Configuration

In [None]:
%%writefile config.yaml

###### Dataloader Configuration -------------------------------------------------
batch_size: 64          # Number of samples per batch during training
sequence_length : 500    # Maximum sequence length for input data
shuffle : True           # Whether to shuffle the data at the beginning of each epoch
drop_last : True        # If True, drops the last incomplete batch if dataset size is not divisible by batch_size

###### Model Architecture Configuration ----------------------------------------
weight_initialization : "kaiming_normal"  # Method for weight initialization. Options: 'kaiming_normal' or 'None' for default initialization
ff : 2048                               # Size of the feed-forward hidden layer in Transformer (typically larger than d_model)
d_model : 512                           # Dimensionality of model embeddings (size of input/output in multi-head attention)
num_layers: 2                           # Number of Transformer decoder layers (stacked)
num_heads: 2                            # Number of heads in multi-head attention mechanism
dropout: 0.1                            # Dropout rate applied during training
max_length: 1000                        # Maximum input sequence length (useful for positional encoding)

###### Optimizer Configuration -------------------------------------------------
learning_rate: 0.0005                   # Initial learning rate for the optimizer
optimizer: "AdamW"                       # Optimizer used for training. Options: 'Adam' or 'AdamW'

###### Scheduler Configuration -------------------------------------------------
scheduler: "CosineAnnealing"            # Learning rate scheduler type. Options: 'Cosine', 'ReduceLR', 'CosineAnnealing'

# If 'ReduceLROnPlateau' scheduler is chosen:
factor: 0.5                             # Factor by which the learning rate is reduced when a plateau is encountered (used for 'ReduceLROnPlateau')
patience: 1                             # Number of epochs with no improvement before reducing the learning rate (used for 'ReduceLROnPlateau')

###### Experiment Configuration ------------------------------------------------
num_epochs: 100                         # Total number of epochs for training the model

TA: "Name"                              # Placeholder for the name of the teaching assistant or person running the experiment


In [None]:
import yaml
with open("config.yaml") as file:
    config = yaml.safe_load(file)

In [None]:

config

# Custom DataLoader

In [None]:
class DataLoaderForLanguageModeling(torch.utils.data.DataLoader): # Inherit from torch.utils.data.DataLoader
    """Batch a concatenated token stream into fixed-length next-token-prediction pairs.

    The per-article token sequences are optionally shuffled once at
    construction, concatenated into a single 1-D stream and cut into windows of
    ``sequence_length`` tokens. Each yielded batch is a pair of NumPy arrays of
    shape ``(batch_size, sequence_length)``; the target array is the input
    array shifted one token to the left.

    Note: ``torch.utils.data.DataLoader.__init__`` is not called; this class
    provides its own ``__len__`` and ``__iter__``.

    Args:
        dataset: Sequence of token-id sequences (one per article).
        batch_size: Number of windows per batch.
        sequence_length: Number of tokens per window.
        shuffle: If True, the article order is shuffled once, at construction.
        drop_last: If True, the trailing incomplete batch is dropped; otherwise
            the stream is padded with ``PAD_TOKEN`` to fill the last batch.
    """

    def __init__(self, dataset, batch_size, sequence_length=3, shuffle= True, drop_last= False):

        self.batch_size = batch_size
        self.shuffle    = shuffle
        self.drop_last  = drop_last
        self.sequence_length = sequence_length

        # Shuffle a shallow copy so the caller's list is not reordered in place.
        articles = list(dataset)
        if self.shuffle:
            np.random.shuffle(articles)

        # Concatenate the articles into one token stream.
        self.dataset = np.concatenate(articles)
        tokens_per_batch = self.sequence_length * self.batch_size
        self.num_batches = int(np.ceil(len(self.dataset) / tokens_per_batch))

        if self.drop_last:
            # One token beyond the last input is needed as its target, so only
            # ceil(n / k) - 1 batches are complete. Clamp so a tiny stream
            # gives zero batches rather than a negative count.
            self.num_batches = max(self.num_batches - 1, 0)
        else:
            # Pad with PAD_TOKEN so the last batch is full; the extra +1 gives
            # the final input token a target.
            pad_width = (self.num_batches * tokens_per_batch) - len(self.dataset) + 1
            self.dataset = np.pad(self.dataset, (0, pad_width), mode='constant', constant_values=PAD_TOKEN)

    def __len__(self):
        """Return the number of batches produced by one pass over the loader."""
        return self.num_batches

    def __iter__(self):
        """Yield ``(input_batch, target_batch)`` NumPy arrays, one pair per batch."""
        divisible_len = self.num_batches * self.sequence_length * self.batch_size

        # Targets are the inputs shifted by one token (next-token prediction).
        input_dataset = self.dataset[:divisible_len]
        target_dataset = self.dataset[1:divisible_len+1]

        # Final layout: (num_batches, batch_size, sequence_length).
        batch_shape = (self.num_batches, self.batch_size, self.sequence_length)
        input_dataset = input_dataset.reshape(batch_shape)
        target_dataset = target_dataset.reshape(batch_shape)

        for input_batch, target_batch in zip(input_dataset, target_dataset):
            yield input_batch, target_batch


In [None]:
class updated_DataLoaderForLanguageModeling(torch.utils.data.DataLoader): # Inherit from torch.utils.data.DataLoader
    """Batch a token stream into windows wrapped with start/end tokens.

    On every pass the articles are optionally reshuffled, concatenated and cut
    into windows of ``sequence_length`` tokens. For each window the input is
    the window with token id 0 prepended and the target is the same window
    with token id 1 appended, so both have ``sequence_length + 1`` columns and
    ``target[:, t] == input[:, t + 1]``. Ids 0 and 1 are the positions of
    ``<sos>`` and ``<eos>`` in ``VOCAB``.

    Note: ``torch.utils.data.DataLoader.__init__`` is not called; this class
    provides its own ``__len__`` and ``__iter__``.

    Args:
        dataset: Sequence of token-id sequences (one per article).
        batch_size: Number of windows per batch.
        sequence_length: Number of tokens per window (before <sos>/<eos>).
        shuffle: If True, the article order is reshuffled on every iteration.
        drop_last: If True, a trailing incomplete batch is dropped; otherwise
            the stream is padded with ``PAD_TOKEN`` to fill the last batch.
    """

    def __init__(self, dataset, batch_size, sequence_length=3, shuffle= True, drop_last= False):

        # Keep a shallow copy so per-epoch shuffling never reorders the
        # caller's list.
        self.dataset    = list(dataset)
        self.batch_size = batch_size
        self.shuffle    = shuffle
        self.drop_last  = drop_last
        self.sequence_length = sequence_length

    def _count_batches(self, total_tokens):
        """Return how many batches a stream of ``total_tokens`` tokens yields."""
        tokens_per_batch = self.sequence_length * self.batch_size
        if self.drop_last:
            # Inputs and targets come from the same window, so every complete
            # batch is usable: floor division drops only a partial batch.
            return total_tokens // tokens_per_batch
        return int(np.ceil(total_tokens / tokens_per_batch))

    def __len__(self):
        """Return the number of batches produced by one pass over the loader."""
        dataset_len = sum(len(article) for article in self.dataset)
        return self._count_batches(dataset_len)

    def __iter__(self):
        """Yield ``(input_batch, target_batch)`` NumPy arrays, one pair per batch."""
        if self.shuffle:
            np.random.shuffle(self.dataset)

        # Concatenate the articles into one token stream.
        self.concatenated_dataset = np.concatenate(self.dataset)
        self.num_batches = self._count_batches(len(self.concatenated_dataset))
        divisible_len = self.num_batches * self.sequence_length * self.batch_size

        if not self.drop_last:
            # Pad with PAD_TOKEN so the last batch is full.
            pad_width = divisible_len - len(self.concatenated_dataset)
            self.concatenated_dataset = np.pad(self.concatenated_dataset, (0, pad_width), mode='constant', constant_values=PAD_TOKEN)

        # Final layout: (num_batches, batch_size, sequence_length).
        windows = self.concatenated_dataset[:divisible_len].reshape(
            self.num_batches, self.batch_size, self.sequence_length)

        # Column of 0s (<sos>) for the inputs, column of 1s (<eos>) for the targets.
        sos_column = np.zeros((self.batch_size, 1), dtype=np.int64)
        eos_column = np.ones((self.batch_size, 1), dtype=np.int64)

        for window_batch in windows:
            input_batch = np.concatenate([sos_column, window_batch], axis=1)
            target_batch = np.concatenate([window_batch, eos_column], axis=1)
            yield input_batch, target_batch


In [None]:
dl = DataLoaderForLanguageModeling(
    dataset     = dataset,
    batch_size  = config['batch_size'],
    shuffle     = config['shuffle'],
    drop_last   = config['drop_last'],
    sequence_length=config['sequence_length']
    # Input Extra parameters here if needed
)

In [None]:
# Some sanity checks

inputs, targets = next(iter(dl))
print(inputs.shape, targets.shape)

for x, y in dl:
    print("x: ", [VOCAB[i] for i in x[0, :]])
    print("y: ", [VOCAB[i] for i in y[0, :]])
    break

In [None]:
# Loading the fixtures for validation and test - prediction
fixtures_pred       = np.load('fixtures/prediction.npz')        # validation
fixtures_pred_test  = np.load('fixtures/prediction_test.npz')   # test

print("Validation shapes    : ", fixtures_pred['inp'].shape, fixtures_pred['out'].shape)
print("Test shapes          : ", fixtures_pred_test['inp'].shape)

In [None]:
# Loading the test fixtures for generation
fixtures_gen_test   = np.load('fixtures/generation_test.npz')   # test

print("Test Gen Shapes          :", fixtures_gen_test['inp'].shape)

In [None]:

# Example Prediction Dev Input and Output
# Optional TODO: You can try printing a few samples from the validation set which has both inputs and outputs

# Causal Language Model

Causal language models predict the probability of a word based on the preceding words in the sentence. This differs from bidirectional models, which consider both previous and following context. Here, we use a Transformer-based decoder, leveraging its attention mechanism to focus only on earlier parts of the sequence to predict the next word. This type of modeling is suitable for tasks such as text generation where the sequence order is crucial.


**Link to HuggingFace Documentation**: [Causal Language Model](https://huggingface.co/docs/transformers/en/tasks/language_modeling)

The following image can be a helpful aid in visualizing the flow of information in a causal language model, highlighting how each word in a sequence is used to predict the next word.

<img src="https://github.com/christianversloot/machine-learning-articles/blob/main/images/causal-1024x445.png?raw=true" width="60%">

This figure shows three matrices: the attention scores between sequence elements, the causal mask with zeros allowing attention and negative infinity blocking future attention, and the resultant matrix after applying the causal mask. The negative infinity values in the causal mask prevent the model from using future tokens in its predictions, reinforcing the sequence's order. This visualization shows how transformers can be used for causal language modeling where future input information must not influence current predictions.

<img src="https://github.com/christianversloot/machine-learning-articles/raw/main/images/Diagram-20-1024x282.png" width="80%">


# Masking Functions in Attention

In sequence models, particularly with Transformers, masks are crucial for managing padding and controlling the flow of information during attention. These functions help create masks for different purposes in sequence modeling. Below is a brief explanation of each mask:

## 1. `create_mask_1`: Mask to Identify Non-Padding Positions

```python
def create_mask_1(padded_input, input_lengths=None, pad_idx=None)
```  


## Purpose:
This function generates a mask to identify **non-padding positions** in a padded input sequence. It marks positions with 1 if they are actual data points and 0 if they are padding tokens.

## Usage:
- **Input:**
  - `padded_input`: Tensor of shape `(N, T, ...)` or `(N, T)` where `N` is batch size and `T` is sequence length.
  - `input_lengths`: (Optional) Actual lengths of each sequence before padding.
  - `pad_idx`: (Optional) Index representing the padding token.

- **Output:**
  - A mask of shape `(N, T, 1)` where non-padding positions are marked with 1 and padding positions are marked with 0.




In [None]:
def create_mask_1(padded_input, input_lengths=None, pad_idx=None):
    """ Create a mask to identify non-padding positions.

    When ``input_lengths`` is given it takes precedence: positions at or beyond
    each sequence's length are marked as padding. Otherwise padding is detected
    by comparing the (N, T) token tensor against ``pad_idx``.

    Args:
        padded_input: The input tensor with padding, shape (N, T, ...) or (N, T).
        input_lengths: Optional, the actual lengths of each sequence before padding, shape (N,).
        pad_idx: Optional, the index used for padding tokens.

    Returns:
        A mask tensor with a trailing singleton axis (shape (N, T, 1) for
        (N, T) or (N, T, D) inputs), where non-padding positions are 1 and
        padding positions are 0. With ``input_lengths`` the mask has the dtype
        of ``padded_input``; with ``pad_idx`` only, it is float.

    Raises:
        AssertionError: If neither ``input_lengths`` nor ``pad_idx`` is given,
            or if the ``pad_idx`` path receives a tensor that is not 2-D.
    """

    assert input_lengths is not None or pad_idx is not None

    # Create a mask based on input_lengths
    if input_lengths is not None:
        N = padded_input.size(0)        # padded_input : (N x T x ...)

        # Drop the trailing feature axis when there is one; a 2-D (N, T) input
        # has no feature axis, so its full shape is the mask shape.
        if padded_input.dim() > 2:
            mask_shape = padded_input.size()[:-1]
        else:
            mask_shape = padded_input.size()
        non_pad_mask = padded_input.new_ones(mask_shape)

        # Padding positions are always 0 in the mask, whatever pad_idx is.
        for i in range(N):
            non_pad_mask[i, input_lengths[i]:] = 0

    elif pad_idx is not None:             # padded_input : N x T

        assert padded_input.dim() == 2

        # 1.0 where the token differs from pad_idx, 0.0 where it equals pad_idx
        non_pad_mask = padded_input.ne(pad_idx).float()

    return non_pad_mask.unsqueeze(-1)   # unsqueeze(-1) for broadcasting

## 2. `create_mask_2`: Mask for Preventing Attention to Subsequent Positions


```python
def create_mask_2(seq, pad_idx=None)
```


## Purpose:
This function creates a **subsequent mask** that prevents attention from attending to future positions in the sequence. It ensures that each position can only attend to previous positions (as in causal language modeling).

## Usage:
- **Input:**
  - `seq`: Tensor of shape `(batch_size, sequence_length)` representing the input sequence.
  - `pad_idx`: (Optional) Padding index for masking padding positions.

- **Output:**
  - A mask of shape `(batch_size, sequence_length, sequence_length)` where the upper triangular portion is filled with 1s to prevent attention to future positions.


In [None]:


def create_mask_2(seq, pad_idx=None):
    """ Create a mask to prevent positions from attending to subsequent positions.

    Args:
        seq: The input sequence tensor, shape (batch_size, sequence_length).
        pad_idx: Optional, the index used for padding tokens. When given, key
            positions equal to ``pad_idx`` are masked for every query as well.

    Returns:
        A boolean tensor with shape (batch_size, sequence_length, sequence_length),
            indexed as [batch, query, key]. True marks a key the query must NOT
            attend to (a future position, or a padding key when ``pad_idx`` is given).
    """

    sz_b, len_s = seq.size()

    # Strictly-upper-triangular matrix: True above the diagonal (future keys,
    # disallowed), False on and below it (current and past keys, allowed).
    future_mask = torch.triu(
        torch.ones((len_s, len_s), device=seq.device, dtype=torch.uint8), diagonal=1).gt(0)

    # Same causal pattern for every sequence in the batch.
    mask = future_mask.unsqueeze(0).expand(sz_b, -1, -1)  # b x ls x ls

    if pad_idx is None:
        # Return an independent tensor rather than an expanded view.
        return mask.contiguous()

    # True where the key token is padding, broadcast over all query positions.
    padding_mask = seq.eq(pad_idx).unsqueeze(1).expand(-1, len_s, -1)  # b x lq x lk

    # A key is blocked if it is in the future OR it is padding.
    return mask | padding_mask


## Helper function for attention visualization

In [None]:
def plot_attention(attention):
    """Draw `attention` as a seaborn heatmap on a cleared figure and show it.

    A well-trained model is expected to produce a roughly diagonal pattern.
    """
    plt.clf()  # clear whatever was drawn on the current figure
    sns.heatmap(data=attention, cmap='GnBu')
    plt.show()

def visualize_attention(attention_weights, index=0):
    """Show the attention map stored at `attention_weights[index]` as an image.

    The selected entry is detached, moved to the CPU and converted to NumPy
    before plotting; rows are labelled as query positions and columns as key
    positions.
    """
    plt.figure(figsize=(10, 8))
    sample_weights = attention_weights[index].detach().cpu().numpy()
    plt.imshow(sample_weights, cmap='viridis')
    plt.colorbar()
    plt.title(f'Attention Weights for Sample {index}')
    plt.xlabel('Key Position')
    plt.ylabel('Query Position')
    plt.show()


# Transformer Decoder Components

We will use these components in the Transformer decoder. These include positional encoding, feed-forward networks, scaled dot-product attention, and multi-head attention. Each of these components plays a vital role in processing input sequences and computing attention in the Transformer model.

---

## 1. **Positional Encoding (`PositionalEncoding`)**
Transformers do not inherently capture the order of sequences, so positional encodings are used to introduce sequence order into the model.

- **Purpose**: Adds information about the position of each token in the input sequence.
- **Mechanism**: Uses a combination of sine and cosine functions of different frequencies to generate positional encodings.
- **Parameters**:
  - `projection_size`: The size of the input embeddings (i.e., `d_model`).
  - `max_seq_len`: The maximum length of the input sequence (default: 1000).
- **Output**: The input embedding enriched with positional information, which is passed through a dropout layer for regularization.

---


In [None]:
class PositionalEncoding(torch.nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    Even feature columns hold ``sin(pos * w_i)`` and odd columns hold
    ``cos(pos * w_i)`` with ``w_i = 10000 ** (-2i / projection_size)``. The
    table is precomputed for ``max_seq_len`` positions and stored as the
    buffer ``pe`` with shape ``(1, max_seq_len, projection_size)``.

    Args:
        projection_size: Embedding dimension of the inputs (odd sizes are supported).
        max_seq_len: Number of positions to precompute.
        dropout: Dropout probability applied to the sum of input and encoding.
    """

    def __init__(self, projection_size, max_seq_len= 1000, dropout=0.1):
        super().__init__()
        self.dropout                = torch.nn.Dropout(dropout)

        pe              = torch.zeros(max_seq_len, projection_size)
        position        = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
        div_term        = torch.exp(torch.arange(0, projection_size, 2).float() * (-math.log(10000.0) / projection_size))
        angles          = position * div_term       # (max_seq_len, ceil(projection_size / 2))
        pe[:, 0::2]     = torch.sin(angles)
        # There are only floor(projection_size / 2) odd columns, so trim the
        # last frequency when projection_size is odd.
        pe[:, 1::2]     = torch.cos(angles[:, :projection_size // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``dropout(x + pe[:, :T])`` for ``x`` of shape (batch, T, projection_size)."""
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            # Fail with an explicit message instead of an opaque broadcast error.
            raise RuntimeError(
                f"Sequence length {seq_len} exceeds the positional encoding table "
                f"size {self.pe.size(1)}; increase max_seq_len."
            )
        return self.dropout(x + self.pe[:, :seq_len])





## 2. **Feed-Forward Network (`FeedForward`)**
The feed-forward network is a fully connected layer applied independently to each position in the sequence after the attention layers.

- **Purpose**: Projects the intermediate representations to a higher-dimensional space and back to the original model dimension.
- **Mechanism**: Consists of two linear layers with a GeLU activation function and dropout in between.
- **Parameters**:
  - `d_model`: The input and output dimensionality of the model.
  - `d_ff`: The dimensionality of the hidden layer in the feed-forward network (default: 2048).
  - `dropout`: Dropout rate applied after the GeLU activation (default: 0.1).
- **Output**: The transformed input sequence passed through two linear transformations with non-linear activation in between.

---

In [None]:

class FeedForward(torch.nn.Module):
    """Position-wise feed-forward block: Linear -> GELU -> Dropout -> Linear.

    Expands the last dimension from ``d_model`` to ``d_ff`` and projects it
    back to ``d_model``.
    """

    def __init__(self, d_model, d_ff=2048, dropout=0.1):
        super().__init__()

        self.linear_1   = torch.nn.Linear(d_model, d_ff)      # expand
        self.dropout    = torch.nn.Dropout(dropout)
        self.linear_2   = torch.nn.Linear(d_ff, d_model)      # project back

    def forward(self, x):
        # Expand to d_ff, apply the GELU non-linearity, then regularise.
        hidden = self.linear_1(x)
        hidden = torch.nn.functional.gelu(hidden)
        hidden = self.dropout(hidden)

        # Project back down to d_model.
        return self.linear_2(hidden)




## 3. **Scaled Dot-Product Attention (`ScaledDotProductAttention`)**
This module computes the attention score for each query-key pair in the input sequence using the scaled dot-product mechanism.

- **Purpose**: To compute attention scores and generate weighted outputs based on the input query, key, and value matrices.
- **Mechanism**:
  - Calculates the dot product of queries and keys, scales by the square root of the dimension, and applies a softmax to generate attention weights.
  - Uses dropout for regularization.
- **Parameters**:
  - `temperature`: Scaling factor for the dot product.
  - `attn_dropout`: Dropout rate for attention weights (default: 0.1).
- **Output**: Returns the weighted sum of the values and the attention weights.

---



In [None]:
class ScaledDotProductAttention(torch.nn.Module):
    """Compute ``softmax(q @ k^T / temperature) @ v`` with optional masking and dropout."""

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature    = temperature                       # divisor applied to the raw scores
        self.dropout        = torch.nn.Dropout(attn_dropout)    # dropout on the attention weights
        self.softmax        = torch.nn.Softmax(dim=-1)           # normalise over the key axis

    def forward(self, q, k, v, mask=None):
        # Raw query-key similarity, scaled by the temperature.
        scores = torch.matmul(q, k.transpose(-2, -1)) / self.temperature

        # Positions where mask is True get -inf so softmax assigns them zero weight.
        if mask is not None:
            scores = scores.masked_fill(mask, float('-inf'))

        # Normalise to attention weights, then apply dropout to them.
        weights = self.dropout(self.softmax(scores))

        # Weighted sum of the values; the (post-dropout) weights are returned too.
        return torch.matmul(weights, v), weights


## 4. **Multi-Head Attention (`MultiHeadAttention`)**
This module implements multi-head attention, where multiple sets of attention heads are computed in parallel, and their outputs are concatenated.

- **Purpose**: To allow the model to jointly attend to different positions in the input sequence from different representation subspaces.
- **Mechanism**:
  - Projects the input query, key, and value matrices into multiple smaller subspaces (heads).
  - Computes scaled dot-product attention for each head in parallel.
  - Concatenates the outputs of all heads and applies a final linear transformation to project the result back to the original model dimension.
- **Parameters**:
  - `n_head`: Number of attention heads.
  - `d_model`: Dimensionality of the input and output representations.
  - `dropout`: Dropout rate applied to the attention output (default: 0.1).
- **Output**: Returns the concatenated output of all attention heads and the averaged attention weights.

---

In [None]:
class MultiHeadAttention(torch.nn.Module):
    ''' Multi-Head Attention Module

    Projects queries, keys and values into ``n_head`` subspaces of size
    ``d_model // n_head``, runs scaled dot-product attention in every head in
    parallel, concatenates the heads and projects back to ``d_model``.

    Args:
        n_head: Number of attention heads.
        d_model: Dimensionality of the input and output representations.
        dropout: Dropout probability for the attention weights and for the
            projected output.
    '''

    def __init__(self, n_head, d_model, dropout=0.1):
        super().__init__()

        self.n_head = n_head # Number of attention heads
        self.d_k    = d_model // n_head     # per-head query/key size
        self.d_v    = d_model // n_head     # per-head value size

        # Linear layers for projecting the input query, key, and value to multiple heads
        self.w_qs   = torch.nn.Linear(d_model, n_head * self.d_k)
        self.w_ks   = torch.nn.Linear(d_model, n_head * self.d_k)
        self.w_vs   = torch.nn.Linear(d_model, n_head * self.d_v)

        # Initialize the weights of the projection layers
        torch.nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + self.d_k)))
        torch.nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + self.d_k)))
        torch.nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + self.d_v)))

        # Attention core; scores are divided by sqrt(d_k)
        self.attention = ScaledDotProductAttention(
            temperature=np.power(self.d_k, 0.5), attn_dropout=dropout)

        # Final linear layer to project the concatenated outputs of the attention heads back to the model dimension
        self.fc = torch.nn.Linear(n_head * self.d_v, d_model)
        # NOTE(review): this draws fc.weight from N(0, 1), which is much wider
        # than the scaled init used for the projections above — confirm this
        # is intended (or that a later weight-initialization pass overrides it).
        torch.nn.init.normal_(self.fc.weight)

        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, q, k, v, mask=None):
        ''' Apply multi-head attention.

        Args:
            q: Queries, shape (batch, len_q, d_model).
            k: Keys, shape (batch, len_k, d_model).
            v: Values, shape (batch, len_k, d_model).
            mask: Optional boolean mask of shape (batch, len_q, len_k); True
                marks positions that must not be attended to. The same mask is
                applied to every head.

        Returns:
            output: Tensor of shape (batch, len_q, d_model).
            attn_weights: Attention weights as returned by the attention core,
                averaged over batch and heads, shape (len_q, len_k).
        '''

        d_k, d_v, n_head    = self.d_k, self.d_v, self.n_head
        sz_b, len_q, _      = q.size()
        len_k               = k.size(1)
        len_v               = v.size(1)

        # Project, split the feature axis into heads, and move the head axis
        # forward: (batch, len, d_model) -> (batch, n_head, len, d_head)
        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k).transpose(1, 2)
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k).transpose(1, 2)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v).transpose(1, 2)

        # Add a singleton head axis; masked_fill broadcasts it over all heads,
        # so the mask does not have to be physically repeated.
        if mask is not None:
            mask = mask.unsqueeze(1)

        # Apply scaled dot-product attention to the projected query, key, and value
        output, attn    = self.attention(q, k, v, mask=mask)

        # (batch, n_head, len_q, d_v) -> (batch, len_q, n_head * d_v).
        # The output has one row per QUERY position, so reshape with len_q.
        output = output.transpose(1, 2).contiguous().view(sz_b, len_q, -1)

        output          = self.dropout(self.fc(output))

        # Average over the batch and head axes for visualization
        attn_weights = attn.mean(dim=(0, 1))

        return output, attn_weights


# Transformer Decoder Layers

The `DecoderLayer1`, `DecoderLayer2`, and `DecoderLayer3` are modular components of the Transformer decoder. Each layer is designed to handle a specific function: self-attention, cross-attention, and feed-forward processing.

## 1. `DecoderLayer1`: Self-Attention Layer
- **Purpose**: Implements self-attention, where the decoder attends to its own inputs, combined with residual connections and layer normalization.
- **Components**:
  - `MultiHeadAttention`: Applies self-attention to the target sequence.
  - `LayerNorm`: Normalizes the input before self-attention (pre-norm); the residual connection is added afterwards.
  - `Dropout`: Regularization to prevent overfitting.

## 2. `DecoderLayer2`: Cross-Attention Layer
- This layer implements cross-attention, where the decoder attends to the output of an encoder. However, for this homework, we will not use `DecoderLayer2` during the pretraining phase because we do not have an encoder in our setup. We will describe its functionality in part 2 of this homework.

## 3. `DecoderLayer3`: Feed-Forward Layer
- **Purpose**: Implements a feed-forward neural network for further transformation of the decoder's intermediate representations.
- **Components**:
  - `FeedForward`: A two-layer fully connected network with non-linearity.
  - `LayerNorm`: Applied to the input before the feed-forward network (pre-norm); the residual connection is added afterwards.
  - `Dropout`: Regularization to avoid overfitting.



In [None]:
class DecoderLayer1(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        """
        DecoderLayer (attention and layer norm) in the Transformer architecture.

        Args:
            d_model (int): The number of expected features in the input (embedding dimension).
            num_heads (int): Number of attention heads.
            d_ff (int): Dimension of the feedforward network model. Accepted for a
                uniform layer signature; not used by this layer.
            dropout (float): Dropout probability.
        """
        super().__init__()

        self.self_attn = MultiHeadAttention(num_heads, d_model, dropout=dropout)
        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tgt, attn_mask=None, key_padding_mask=None):
        """
        Pre-norm self-attention sub-layer with a residual connection.

        Args:
            tgt: Input tensor; it is normalized and used as query, key and value.
            attn_mask: Optional mask forwarded to the attention module.
            key_padding_mask: Accepted for interface compatibility; not used here.

        Returns:
            The input plus the dropped-out attention output, and the attention
            weights returned by the attention module.
        """
        # Normalize first, then attend to the normalized sequence itself.
        normed = self.layer_norm(tgt)
        attended, attn_weights = self.self_attn(normed, normed, normed, attn_mask)

        # The residual connection uses the un-normalized input.
        tgt = tgt + self.dropout(attended)
        return tgt, attn_weights


class DecoderLayer3(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        """
        Pre-norm feed-forward sub-layer of the Transformer decoder.

        Args:
            d_model (int): Embedding dimension.
            num_heads (int): Number of attention heads (not used by this sub-layer).
            d_ff (int): Dimension of the feedforward network.
            dropout (float): Dropout probability.
        """
        super().__init__()
        self.ffn = FeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tgt):
        """Return `tgt` plus the dropped-out feed-forward output of its layer-normed value."""
        residual = tgt
        hidden = self.ffn(self.layer_norm(tgt))
        return residual + self.dropout(hidden)


# Causal Language Model

This module implements a Transformer-based decoder for causal language modeling (CLM). It consists of several components, including embedding layers, positional encoding, self-attention layers, and feed-forward layers. It supports various generation strategies such as beam search and sampling.

### Key Components:
- **Embedding Layer**: Converts input tokens into dense vector representations.
- **Positional Encoding**: Adds position information to input tokens, helping the model understand the order of tokens.
- **Decoder Layers**: Composed of:
  - `DecoderLayer1`: Implements self-attention and layer normalization.
  - `DecoderLayer3`: Implements a feed-forward network with residual connections.
- **Output Linear Layer**: Projects the hidden states to the vocabulary size to generate output probabilities.

### Key Methods:
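- **`forward`**: Runs the input through the decoder layers and returns the output logits over the vocabulary together with the attention weights of every layer.
- **`predict_beam`**: Implements beam search for sequence generation, selecting the most likely sequence.
- **`predict_beam_sampling`**: Uses beam search with probabilistic sampling for sequence generation.
- **`predict_nucleus_sampling`**: Samples each next token from the top-p (nucleus) subset of the output distribution.
- **`generate`**: Generates a sequence step-by-step (greedy) for a given input sequence.
- **`predict`**: Greedily extends the input for a number of timesteps and returns the output distribution (logits) produced at each step.
- **`vec2text`**: Converts predicted distributions to human-readable text by taking the argmax token at each step.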


In [None]:
# Here comes the main portion of this HW.


class CausalLanguageModel(nn.Module):
    def __init__(self, vocab_size=31, d_model=256, num_layers=2, num_heads=2, d_ff=512, dropout=0.1, max_length=1000):

        """
        Decoder module in the Transformer architecture.
        Initializes embeddings, multiple decoder layers, and an output linear layer.

        Args:
            vocab_size (int): Size of the vocabulary.
            d_model (int): The number of expected features in the input (embedding dimension).
            num_layers (int): Number of decoder layers.
            num_heads (int): Number of attention heads.
            d_ff (int): Dimension of the feedforward network model.
            dropout (float): Dropout probability.
            max_length (int): Maximum length of input sequences.
        """

        super(CausalLanguageModel, self).__init__()

        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx = PAD_TOKEN)

        self.pos_encoder = PositionalEncoding(d_model, max_length, dropout)
        self.num_layers = num_layers
        self.dec_layers1 = nn.ModuleList([DecoderLayer1(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.dec_layers3 = nn.ModuleList([DecoderLayer3(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.ln = nn.LayerNorm(d_model)
        self.fc = nn.Linear(d_model, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """
        Initialise Linear and Embedding weights from N(0, 0.02^2); zero Linear biases.

        The embedding row at `padding_idx` is reset to zero after the random
        init, because `normal_` would otherwise overwrite the all-zero padding
        vector that `nn.Embedding` sets up.
        """
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.padding_idx is not None:
                with torch.no_grad():
                    module.weight[module.padding_idx].fill_(0)

    @staticmethod
    def _as_token_tensor(x):
        """Return `x` as a long tensor on DEVICE (no extra copy if it already is one)."""
        return torch.as_tensor(x, dtype=torch.long, device=DEVICE)

    @staticmethod
    def _beam_score(hypothesis):
        """Ranking key for a beam hypothesis: its log-probability averaged over the batch."""
        return hypothesis[0].mean().item()

    def forward(self, inp):
        """
        Run token ids through the decoder stack.

        Args:
            inp: Token ids (the code indexes it as (batch_size, seq_len)).

        Returns:
            tuple: (output of the final linear layer, i.e. unnormalised scores of
            shape (batch_size, seq_len, vocab_size); the per-layer attention
            weights stacked along a new leading dimension).
        """
        # Both masks are built from the raw token ids before embedding.
        # NOTE(review): create_mask_2 / create_mask_1 are defined elsewhere; presumably the
        # causal (+ padding) attention mask and the padding mask respectively - confirm.
        attn_mask = create_mask_2(inp, pad_idx=PAD_TOKEN)
        key_padding_mask = create_mask_1(inp, pad_idx=PAD_TOKEN)

        inp = self.embedding(inp) * math.sqrt(self.embedding.embedding_dim) # ( batch_size, seq_len, d_model )
        inp = self.pos_encoder(inp)

        attention_weights_list = []

        for i in range(self.num_layers):
            inp, attn_weights = self.dec_layers1[i](inp, attn_mask, key_padding_mask)
            inp = self.dec_layers3[i](inp)
            attention_weights_list.append(attn_weights)

        output = self.fc(self.ln(inp))

        stacked_attention_weights = torch.stack(attention_weights_list, dim=0)

        return output, stacked_attention_weights

    def predict_beam(self, x, timesteps, beam_width=20):
        """
        Beam search over the next `timesteps` tokens.

        Hypotheses are ranked by their log-probability averaged over the batch,
        so the whole batch follows one shared beam rather than one beam per sample.

        Args:
            x: Prompt token ids, (batch_size, seq_len).
            timesteps (int): Number of tokens to generate.
            beam_width (int): Number of hypotheses kept after every step.

        Returns:
            tuple: (per-sample log-likelihood of the best hypothesis, shape
            (batch_size,); generated token ids without the prompt, shape
            (batch_size, timesteps)).
        """
        x = self._as_token_tensor(x)
        batch_size, seq_len = x.shape

        # Start from ONE hypothesis. Seeding the beam with `beam_width` identical copies
        # makes the deterministic top-k expansion produce duplicate candidates, which then
        # fill the whole beam with the same sequence (i.e. greedy search at beam_width x cost).
        beam = [(torch.zeros(batch_size, device=DEVICE), x)]

        with torch.inference_mode():
            for _ in range(timesteps):
                candidates = []
                for log_prob, seq in beam:
                    logits, _ = self.forward(seq)
                    # log_softmax is numerically safer than log(softmax(...)).
                    log_probs = F.log_softmax(logits[:, -1, :], dim=-1)

                    # topk cannot return more entries than the vocabulary holds.
                    k = min(beam_width, log_probs.size(-1))
                    top_log_probs, top_indices = log_probs.topk(k, dim=-1)

                    for i in range(k):
                        next_token = top_indices[:, i].unsqueeze(1)
                        candidates.append((
                            log_prob + top_log_probs[:, i],
                            torch.cat((seq, next_token), dim=1),
                        ))

                # Keep the `beam_width` best hypotheses.
                beam = sorted(candidates, key=self._beam_score, reverse=True)[:beam_width]

        log_likelihood, best_seq = max(beam, key=self._beam_score)

        # Extract only the predicted part, removing the original input
        return log_likelihood, best_seq[:, seq_len:]

    def predict_beam_sampling(self, x, timesteps, beam_width=20):
        """
        Stochastic beam search: every hypothesis proposes `beam_width` sampled continuations.

        Args:
            x: Prompt token ids, (batch_size, seq_len).
            timesteps (int): Number of tokens to generate.
            beam_width (int): Samples drawn per hypothesis and hypotheses kept per step.

        Returns:
            tuple: (per-sample log-likelihood of the best hypothesis, shape
            (batch_size,); generated token ids without the prompt, shape
            (batch_size, timesteps)).
        """
        x = self._as_token_tensor(x)
        batch_size, seq_len = x.shape

        # Unlike deterministic beam search, identical starting hypotheses are kept here:
        # each copy draws its own samples, so they diverge after the first step.
        beam = [(torch.zeros(batch_size, device=DEVICE), x) for _ in range(beam_width)]

        with torch.inference_mode():
            for _ in range(timesteps):
                candidates = []
                for log_prob, seq in beam:
                    logits, _ = self.forward(seq)
                    last_logits = logits[:, -1, :]
                    probs = F.softmax(last_logits, dim=-1)
                    log_probs = F.log_softmax(last_logits, dim=-1)

                    # Sample `beam_width` continuations per batch item (with replacement).
                    sampled = torch.multinomial(probs, beam_width, replacement=True)
                    sampled_log_probs = log_probs.gather(1, sampled)

                    for i in range(beam_width):
                        candidates.append((
                            log_prob + sampled_log_probs[:, i],
                            torch.cat((seq, sampled[:, i:i + 1]), dim=1),
                        ))

                # Keep the `beam_width` best hypotheses.
                beam = sorted(candidates, key=self._beam_score, reverse=True)[:beam_width]

        log_likelihood, best_seq = max(beam, key=self._beam_score)

        # Extract only the predicted part, removing the original input
        return log_likelihood, best_seq[:, seq_len:]

    def predict_nucleus_sampling(self, x, timesteps, p=0.95):
        """
        Nucleus (top-p) sampling for `timesteps` tokens.

        At every step the next token is sampled from the smallest set of most
        probable tokens whose cumulative probability reaches `p`.

        Args:
            x: Prompt token ids, (batch_size, seq_len).
            timesteps (int): Number of tokens to generate.
            p (float): Cumulative probability mass of the nucleus.

        Returns:
            tuple: (per-sample sum of log-probabilities of the sampled tokens under
            the full, untruncated distribution, shape (batch_size,); generated
            token ids without the prompt, shape (batch_size, timesteps)).
        """
        x = self._as_token_tensor(x)
        batch_size, seq_len = x.shape

        seq = x
        log_prob = torch.zeros(batch_size, device=DEVICE)

        with torch.inference_mode():
            for _ in range(timesteps):
                logits, _ = self.forward(seq)
                last_logits = logits[:, -1, :]
                probs = F.softmax(last_logits, dim=-1)
                log_probs = F.log_softmax(last_logits, dim=-1)

                sorted_probs, sorted_indices = torch.sort(probs, descending=True, dim=-1)
                cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

                # A token belongs to the nucleus if the mass accumulated BEFORE it is still
                # below p; this also keeps the token that crosses the threshold, so the
                # retained mass is >= p (using `cumulative <= p` would drop that token).
                nucleus_mask = (cumulative_probs - sorted_probs) < p
                nucleus_mask[:, 0] = True  # Ensure at least one token is selected

                # Zero out everything outside the nucleus and renormalise.
                nucleus_probs = sorted_probs * nucleus_mask
                nucleus_probs = nucleus_probs / nucleus_probs.sum(dim=-1, keepdim=True)

                # Sample a rank in the sorted order, then map it back to a token id.
                sampled_rank = torch.multinomial(nucleus_probs, 1)
                next_token = sorted_indices.gather(1, sampled_rank)

                seq = torch.cat((seq, next_token), dim=1)
                log_prob = log_prob + log_probs.gather(1, next_token).squeeze(1)

        # Extract only the predicted part, removing the original input
        return log_prob, seq[:, seq_len:]

    def generate(self, x, timesteps):
        """
        Greedily generate `timesteps` tokens that continue the prompt `x`.

        Refer to section 1.2.4 of the writeup.
        Important Note: We do not draw <eos> from the distribution unlike the writeup.

        Args:
            x: Prompt token ids, (batch_size, seq_len).
            timesteps (int): Number of tokens to generate.

        Returns:
            Tensor of generated token ids, shape (batch_size, timesteps).
        """
        x = self._as_token_tensor(x)
        prompt_len = x.size(1)

        with torch.inference_mode():
            # One forward pass per generated token; the first pass consumes the prompt
            # itself, so exactly `timesteps` tokens are produced (zero is handled too).
            for _ in range(timesteps):
                logits, _ = self.forward(x)
                next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
                x = torch.cat((x, next_token), dim=1)

        # Everything after the prompt is what was generated.
        return x[:, prompt_len:]

    def predict(self, x, timesteps=20):
        """
        Collect the model output for the next `timesteps` positions under greedy decoding.

        Refer to Section 1.2.6 of the writeup.

        Args:
            x: Prompt token ids, (batch_size, seq_len).
            timesteps (int): Number of future positions to predict.

        Returns:
            Tensor of shape (batch_size, timesteps, vocab_size) holding the
            last-position output of `forward` at every step.
        """
        x = self._as_token_tensor(x)
        prob_dists = []  # List to store each output distribution
        with torch.inference_mode():
            for _ in range(timesteps):
                y, _ = self.forward(x)
                last_prob = y[:, -1, :]  # Get the last time step output
                prob_dists.append(last_prob)

                # Feed the argmax token back in as the next input position.
                next_token = last_prob.argmax(dim=-1, keepdim=True)
                x = torch.cat((x, next_token), dim=1)

        # Stack all collected distributions into a tensor
        return torch.stack(prob_dists, dim=1)

    def vec2text(self, inp, pred):
        """
        Turn the first two predictions of a batch into text.

        Args:
            inp: Batch of input sequences; only the first two are used.
            pred: Predicted distributions, (batch size, seq length, vocab).

        Returns:
            Whatever `make_generation_text` returns for the first two inputs and
            their argmax-decoded predictions.
        """
        pred = torch.as_tensor(pred)[:2]
        inp = inp[:2]

        with torch.inference_mode():
            # Argmax over the vocabulary at every timestep -> (2, seq length).
            generated_sequence = pred.argmax(dim=-1)

        # convert to text
        generated_texts_test = make_generation_text(inp, generated_sequence, VOCAB)

        return generated_texts_test



# Model, Loss, Optimizer, and Scheduler Definition

In [None]:
# Define the model
model = CausalLanguageModel(
    vocab_size = len(VOCAB),
    d_model    = config['d_model'],
    num_layers = config['num_layers'],
    num_heads  = config['num_heads'],
    d_ff       = config['ff'],
    dropout    = config['dropout'],
    max_length = config['max_length']
).to(DEVICE)

# Define the dataloader
loader = updated_DataLoaderForLanguageModeling(
    dataset=dataset,
    batch_size=config['batch_size'],
    sequence_length=config['sequence_length'],
    shuffle=config['shuffle'],
    drop_last=config['drop_last']
)

# Define the criterion (CrossEntropyLoss with label smoothing); padded targets are ignored
criterion = torch.nn.CrossEntropyLoss(
    ignore_index=PAD_TOKEN,
    label_smoothing=0.1
)

# Define the optimizer based on config (Adam or AdamW).
# Fail fast on an unknown name instead of leaving `optimizer` undefined.
if config["optimizer"] == "Adam":
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=config['learning_rate'],
        weight_decay=2e-6
    )
elif config["optimizer"] == "AdamW":
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config['learning_rate'],
        weight_decay=1e-5
    )
else:
    raise ValueError(
        f"Unsupported optimizer {config['optimizer']!r}; expected 'Adam' or 'AdamW'."
    )

# Define the learning rate scheduler.
# Fail fast on an unknown name instead of leaving `scheduler` undefined.
if config["scheduler"] == "ReduceLR":
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        factor=config["factor"],
        patience=config["patience"],
        min_lr=1E-8,
        verbose=True
    )
elif config["scheduler"] == "CosineAnnealing":
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=config["num_epochs"],
        eta_min=1E-8
    )
else:
    raise ValueError(
        f"Unsupported scheduler {config['scheduler']!r}; expected 'ReduceLR' or 'CosineAnnealing'."
    )

# Define the scaler for mixed precision training
scaler = torch.cuda.amp.GradScaler()

# Print the model architecture and parameter summary
print(model)

# Summarize the model with `torchsummaryX` using `inputs` (defined in an earlier cell) as sample input
summary = torchsummaryX.summary(model.to(DEVICE), x=torch.tensor(inputs).to(DEVICE))





# Trainer Class

In [None]:
# Unlike all the P2s, we are using a Trainer class for this HW.
# Many researchers also use classes like this for training. You may have encountered them in your project as well.
# You dont have to complete everything in this class, you only need to complete the train function.
# However, its good to go through the code and see what it does.



class Trainer:
    def __init__(self, model, loader, optimizer, criterion, scheduler, scaler, max_epochs= 1, run_id= 'exp'):
        """
            Use this class to train your model

            Args:
                model: Model to train; called as `model(inputs)` and expected to return
                    `(output, attention_weights)`.
                loader: Iterable yielding `(inputs, targets)` batches.
                optimizer: Optimizer stepped through `scaler`.
                criterion: Loss taking `(N, vocab)` scores and `(N,)` targets.
                scheduler: Learning-rate scheduler; stored but not stepped by this class.
                scaler: Gradient scaler used for mixed precision training.
                max_epochs (int): Total number of epochs, used for progress messages.
                run_id (str): Name of the sub-directory of `hw4p1/experiments` written by `save`.
        """
        # feel free to add any other parameters here
        self.model      = model
        self.loader     = loader
        self.optimizer  = optimizer
        self.criterion  = criterion
        self.scheduler  = scheduler
        self.scaler     = scaler

        self.train_losses           = []
        self.val_losses             = []
        self.prediction_probs       = []
        self.prediction_probs_test  = []
        self.generated_texts_test   = []
        self.generated_texts_test_beam = []
        self.generated_texts_test_beam_random = []

        self.log_likelihood_beam = []
        self.log_likelihood_beam_random = []

        self.epochs                 = 0
        self.max_epochs             = max_epochs
        self.run_id                 = run_id


    def calculate_loss(self, out, target):
        """
        Compute the criterion over every token position of the batch.

        Args:
            out: Model output of shape (B, T, Vocab_size).
            target: Target token ids of shape (B, T).

        Returns:
            The value returned by `self.criterion` for the flattened batch.
        """
        # CrossEntropyLoss expects (N, C) scores and (N,) class ids, so the batch and
        # time dimensions are merged: N = B * T tokens. The class count is read from
        # the output itself instead of a global vocabulary.
        out     = out.reshape(-1, out.size(-1))
        targets = target.reshape(-1)
        loss    = self.criterion(out, targets)

        return loss


    def train(self):
        """
        Train the model for one epoch with mixed precision.

        Returns:
            tuple: (mean loss over the epoch's batches, current learning rate of the
            first parameter group, attention weights returned for the last batch).

        Raises:
            RuntimeError: If the loader yields no batches.
        """
        self.model.train() # set to training mode
        self.model.to(DEVICE)
        epoch_loss   = 0.0
        num_batches  = 0
        attn_weights = None

        for inputs, targets in tqdm(self.loader):

            inputs = torch.as_tensor(inputs, dtype=torch.long, device=DEVICE)
            targets = torch.as_tensor(targets, dtype=torch.long, device=DEVICE)

            self.optimizer.zero_grad()

            with torch.cuda.amp.autocast():
                out, attn_weights  = self.model(inputs)
                loss = self.calculate_loss(out=out, target=targets)

            epoch_loss += loss.item()
            num_batches += 1

            self.scaler.scale(loss).backward() # This is a replacement for loss.backward()
            self.scaler.step(self.optimizer) # This is a replacement for optimizer.step()
            self.scaler.update() # This is something added just for FP16

        if num_batches == 0:
            # Without this guard the averaging below would fail on an unbound variable.
            raise RuntimeError("Trainer.train(): the data loader produced no batches.")

        epoch_loss = epoch_loss / num_batches
        self.epochs += 1
        print('[TRAIN] \tEpoch [%d/%d] \tLoss: %.4f \tLr: %.6f'
                    % (self.epochs, self.max_epochs, epoch_loss, self.optimizer.param_groups[0]['lr']))
        self.train_losses.append(epoch_loss)

        return (epoch_loss, self.optimizer.param_groups[0]['lr'], attn_weights)



    def test(self): # Don't change this function
        """Evaluate prediction NLL on the validation fixtures and record generations; returns the NLL."""

        self.model.eval() # set to eval mode
        prediction_probs     = self.model.predict(fixtures_pred['inp']).detach().cpu().numpy() # get predictions
        self.prediction_probs.append(prediction_probs)

        generated_indexes_test   = self.model.generate(fixtures_gen_test['inp'], 30).detach().cpu().numpy() # greedy generation, 30 timesteps requested

        nll                   = get_prediction_nll(prediction_probs, fixtures_pred['out'])

        log_likelihood, predicted_beam_indices = self.model.predict_beam(fixtures_gen_test['inp'], 30)
        predicted_beam_indices = predicted_beam_indices.detach().cpu().numpy()
        log_likelihood_random, predicted_beam_random_indices = self.model.predict_beam_sampling(fixtures_gen_test['inp'], 30)
        predicted_beam_random_indices = predicted_beam_random_indices.detach().cpu().numpy()


        self.log_likelihood_beam.append(log_likelihood)
        self.log_likelihood_beam_random.append(log_likelihood_random)
#         print(predicted_beam_indices.shape)
#         print(generated_indexes_test.shape)

        predicted_beam_text = make_generation_text(fixtures_gen_test['inp'], predicted_beam_indices, VOCAB)
        predicted_beam_random_text = make_generation_text(fixtures_gen_test['inp'], predicted_beam_random_indices, VOCAB)

        generated_texts_test  = make_generation_text(fixtures_gen_test['inp'], generated_indexes_test, VOCAB)

        self.val_losses.append(nll)

        self.generated_texts_test.append(generated_texts_test)
        self.generated_texts_test_beam.append(predicted_beam_text)
        self.generated_texts_test_beam_random.append(predicted_beam_random_text)

        # generate predictions for test data
        prediction_probs_test = self.model.predict(fixtures_pred_test['inp']).detach().cpu().numpy() # get predictions
        self.prediction_probs_test.append(prediction_probs_test)

        print('[VAL] \tEpoch [%d/%d] \tLoss: %.4f'
                      % (self.epochs, self.max_epochs, nll))

        return nll


    def save(self): # Don't change this function
        """Write the model weights, prediction arrays and generated texts of the latest epoch to the run directory."""

        model_path = os.path.join('hw4p1/experiments', self.run_id, 'model-{}.pkl'.format(self.epochs))
        torch.save({'state_dict': self.model.state_dict()}, model_path)
        np.save(os.path.join('hw4p1/experiments', self.run_id, 'prediction-probs-{}.npy'.format(self.epochs)), self.prediction_probs[-1])
        np.save(os.path.join('hw4p1/experiments', self.run_id, 'prediction-probs-test-{}.npy'.format(self.epochs)), self.prediction_probs_test[-1])

        with open(os.path.join('hw4p1/experiments', self.run_id, 'generated-texts-{}-test.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated_texts_test[-1])


        #########################################
        # Beam-search generations, one line per sample with its log-likelihood appended.
        generated_beam = []

        for line, score  in zip(self.generated_texts_test_beam[-1].split("\n"), self.log_likelihood_beam[-1]):
            generated_beam += [line +  f"\t Score:{score.item():0.4f}"]


        with open(os.path.join('hw4p1/experiments', self.run_id, 'generated-texts-beam-{}-test.txt'.format(self.epochs)), 'w') as fw:
            fw.write("\n".join(generated_beam))


        # Same layout for the sampled beam-search generations.
        generated_beam = []

        for line, score  in zip(self.generated_texts_test_beam_random[-1].split("\n"), self.log_likelihood_beam_random[-1]):
            generated_beam += [line +  f"\t Score:{score.item():0.4f}"]


        with open(os.path.join('hw4p1/experiments', self.run_id, 'generated-texts-beam-random-{}-test.txt'.format(self.epochs)), 'w') as fw:
            fw.write("\n".join(generated_beam))




In [None]:
# Dont change this cell

# Use the current Unix timestamp (whole seconds) as the id of this run.
run_id = str(int(time.time()))
# Create the shared experiments directory on first use, then this run's own sub-directory.
if not os.path.exists('./hw4p1/experiments'):
    os.mkdir('./hw4p1/experiments')
os.mkdir('./hw4p1/experiments/%s' % run_id)
print("Saving models, prediction prbabilities, and generated texts to ./hw4p1/experiments/%s" % run_id)

# The object of the Trainer class takes in everything
trainer = Trainer(
    model       = model,
    loader      = loader,

    optimizer   = optimizer,
    criterion   = criterion,
    scheduler   = scheduler,
    scaler      = scaler,
    max_epochs  = config['num_epochs'],
    run_id      = run_id
)

# Wandb

In [None]:
# Use wandb? Resume Training?
USE_WANDB = True
RESUME_LOGGING = False

# Create your wandb run

run_name = '{}_d_model:_{}_ff:_{}'.format(
    config['TA'],
    config['d_model'],
    config['ff'],

)

if USE_WANDB:

    # NOTE(review): this key is stored in plain text in the notebook; consider reading it
    # from an environment variable if the notebook is shared outside the course.
    wandb.login(key="2ed342afb6f5e8a1ef3c71c50f0f4ecb7a2f6dbe") # IDL course key, please don't change

    if RESUME_LOGGING:
        # Kept separate from the experiment `run_id` so resuming a wandb run does not
        # overwrite the id of the local experiments directory.
        wandb_run_id = ''
        run = wandb.init(
            settings=wandb.Settings(symlink=False),
            id     = wandb_run_id, ### Insert specific run id here if you want to resume a previous run
            resume = "must", ### You need this to resume previous runs, but comment out reinit = True when using this
            project = "hw4p1-f24", ### Project should be created in your wandb account
        )
    else:
        run = wandb.init(
            name    = run_name, ### Wandb creates random run names if you skip this field, we recommend you give useful names
            reinit  = True, ### Allows reinitalizing runs when you re-run this cell
            project = "hw4p1-f24", ### Project should be created in your wandb account
            config  = config ### Wandb Config for your run
        )

        ### Save your model architecture as a string with str(model)
        model_arch  = str(model)
        ### Save it in a txt file (the context manager closes the file even if the write fails)
        with open("model_arch.txt", "w") as arch_file:
            arch_file.write(model_arch)

        ### log it in your wandb run with wandb.save()
        # wandb.save('model_arch.txt')

# Experiments

In [None]:
# Run the experiments loop.
# Each epoch won't take more than 2-3 min. If it's taking more time, it might be due to (but not limited to) the following:
#   * You might be overlapping batches
#       Eg. Input: "I had biryani for lunch today" and sequence length = 3,
#           --> "I had biryani", "for lunch today" are ideal examples for inputs
#           --> "I had biryani", "had biryani for", "biryani for lunch", ... is just redundant info :')
#   * Your length calculation in the dataloader might be wrong
# If you haven't had biryani, try it :D

# wandb calls are only valid when a run was created in the wandb cell.
if USE_WANDB:
    wandb.watch(model, log="all")

# torch.cuda.empty_cache()
gc.collect()

# %%time
best_nll = 1e30
for epoch in range(config['num_epochs']):
    train_loss, curr_lr,  attn_weights = trainer.train()
    print(attn_weights[-1].shape)
    plot_attention(attn_weights[-1].detach().cpu().numpy())
    visualize_attention(attn_weights)
    nll = trainer.test()
    if nll < best_nll:
        best_nll = nll
        print("Saving model, prediction probabilities and generated texts for epoch "+str(epoch+1)+" with NLL: "+ str(best_nll))

    # Artifacts are written every epoch, not only for the best one.
    trainer.save()

    if USE_WANDB:
        wandb.log({"train_loss":train_loss,
                   "nll": nll,
                   "learning_rate": curr_lr
                  })

    # Only ReduceLROnPlateau consumes a metric; other schedulers (e.g. CosineAnnealingLR)
    # would interpret a positional argument as the epoch index.
    if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
        scheduler.step(nll)
    else:
        scheduler.step()

### Finish your wandb run
if USE_WANDB:
    run.finish()



In [None]:
# Free memory after training: drop PyTorch's cached GPU blocks, then run Python's garbage collector.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Plot the per-epoch training loss and validation NLL recorded by the trainer.
epoch_axis = range(1, trainer.epochs + 1)
loss_curves = (
    (trainer.train_losses, 'Training losses'),
    (trainer.val_losses, 'Validation losses'),
)

plt.figure()
for curve, curve_label in loss_curves:
    plt.plot(epoch_axis, curve, label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('NLL')
plt.legend()
plt.show()

# Evaluating generations

 - Now that you have trained your model and got satisfactory validation NLL on the token prediction task, you can evaluate the generations you created too
 - We will use the perplexity metric to evaluate generations using a large language model available through the HuggingFace.
 - Run the cell below to get the perplexity.
 - You will submit this perplexity value for grading the generation component of this homework.
 - A perplexity of under **1400** will give you full credit on the generation part.

### Change only the **submission_run_id**, **submission_epoch**, and **api_key** in the following cell

In [None]:
# Log in to the Hugging Face Hub; replace the placeholder below with your own access token.
from huggingface_hub import login
login(token="<YOUR TOKEN>")

In [None]:
# DO NOT CHANGE THE CODE IN THIS CELL EXCEPT submission_run_id AND submission_epoch
# PLEASE BE HONEST IN REPORTING THE PERPLEXITY VALUE!
# WE WILL RANDOMLY CHECK SOME SUBMISSIONS USING THE SAME CODE AS THIS AND A BIG DIFFERENCE IN PERPLEXITY WILL RESULT IN AN AIV.


# Add your submission_run_id and submission_epoch here --------------------------------------------------
# Fill the run id and epoch number to be used for submission.
# You will use the same run id and epoch number to generate the handin.

submission_run_id = "1728150863" # TODO
submission_epoch = 1 # TODO

# --------------------------------------------------------------

# Number of lines the generated-texts file is required to contain (checked below).
n_tests = 128

# Read the greedy generations saved by Trainer.save() for the chosen run and epoch, one string per line.
with open(os.path.join('hw4p1/experiments', submission_run_id, 'generated-texts-{}-test.txt'.format(submission_epoch)), 'r', encoding='utf-8') as f:
    generated = list(f)

assert len(generated) == n_tests
for item in generated:
    assert type(item) is str

parsed_generated = []

for text in generated:
    # Keep only what follows the last ':' on the line and drop the "<sos>" marker.
    temp = text.split(":")[-1].replace("<sos>", "")
    # Remove the " | " separator and turn every "<eos>" into a newline.
    parsed_text = temp.replace(" | ", "")
    parsed_text = parsed_text.replace("<eos>", "\n")
    parsed_generated.append(parsed_text)



def calculate_perplexity(text, model, tokenizer):
    """Compute the perplexity of the provided text using a Hugging Face model.

    Args:
        text: Text to score.
        model: Causal language model; called with the token ids as both input and labels.
        tokenizer: Tokenizer matching `model`; called with `return_tensors='pt'`.

    Returns:
        float: exp(loss), where `loss` is the `.loss` attribute of the model output.
    """
    encodings = tokenizer(text, return_tensors='pt')
    # Put the token ids on the same device as the model.
    input_ids = encodings.input_ids.to(model.device)
    with torch.no_grad():
        # Passing the inputs as labels makes the model return a language-modelling loss
        # (presumably the mean per-token cross-entropy - confirm against the HF docs).
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss
        perplexity = torch.exp(loss)
    return perplexity.item()

# --------------------------------------------------------------
# Define the model and tokenizer

model_name = "gpt2-medium"


# NOTE: this rebinds the global `model`, which until now referred to the trained
# CausalLanguageModel; re-run the model definition cell before training again.
model = AutoModelForCausalLM.from_pretrained(model_name,
                                             torch_dtype="auto").to(DEVICE)
tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          torch_dtype="auto")
model.eval()  # Set the model to evaluation mode


# --------------------------------------------------------------


# Calculate perplexity for each generated sequence
perps = [calculate_perplexity(text, model, tokenizer) for text in tqdm(parsed_generated)]
# The reported score is the plain average of the per-sequence perplexities.
avg_perp = np.mean(perps)

# Report this number when running the makefile to create the handin
print("Your mean perplexity for generated sequences: {}".format(avg_perp))



# Create handin
Navigate to the handout directory to run the below cell. This command will create the handin with all the required files (including attention.py). So make sure you have the entire handout directory wherever you are running this notebook (local machine, Colab, AWS, etc.). This command requires that this completed notebook be in the hw4 folder inside the handout directory.

In [None]:
# TODO: Generate the handin to submit to autolab
# Use the same run id and epoch as submission_run_id / submission_epoch in the perplexity
# cell, and pass the mean perplexity that cell printed as `ppl`.

# For example:
# !make runid=1727189256 epoch=5 ppl=74.54703049361706

!make runid=1728150863 epoch=1 ppl=86.18322575837374