In [None]:
from datasets import load_dataset
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch
from torch import nn

### We will use the data below during this TP.

In [None]:
# Load the Shakespeare dataset by its Hugging Face Hub identifier.
# The returned object is indexed by split name below ('training', 'test').
dataset = load_dataset('notaphoenix/shakespeare_dataset')
dataset # a Dataset object from the datasets library

In [None]:
# Peek at the first example of the training split (later cells read its 'text' field).
dataset['training'][0]

### How many characters/words are there in total in the training split of the dataset ?

# 1. Byte-pair encoding
In this part, we will implement the BPE algorithm for tokenization.

In [None]:
# For convenience we use the training data as a list of str:
corpus = [elt['text'] for elt in dataset['training']]
corpus[0]

### 1.1 First step: build a list "tokenized_corpus" of same size as the corpus such that tokenized_corpus[i] is the list of characters in corpus[i] (just by splitting the str into its list of characters)
So tokenized_corpus starts by representing each corpus entry by a list of characters. As a reminder, Byte-pair encoding will iteratively refine it by merging the most frequent pairs of characters:
['a', 'b', 'c', 'a', 'b'] -> ['ab', 'c', 'ab']: this is the representation which we store in 'tokenized_corpus'.

### 1.2 Now build a dictionary "vocab" mapping each unique character in the corpus to a unique integer between 0 and the number of unique characters - 1

### 1.3 Now implement a method which, given 'tokenized_corpus', computes a dictionary 'pairs' in which:
- the keys are pairs of adjacent elements appearing in the corpus (if corpus contains ['c', ..., 'a', 'b', ...] then the tuple ('a', 'b') should appear as key in pairs)
- the values are the number of times each pair appears in the corpus

(The right way is to build the pairs dictionary by iterating over the tokenized_corpus)

In [None]:
def get_pairs(tokenized_corpus) -> dict:
    """Count how often each pair of adjacent tokens occurs in the corpus.

    Args:
        tokenized_corpus: iterable of token sequences (e.g. list of list of str).
            Pairs are only formed inside a sequence, never across two sequences.

    Returns:
        A dict mapping each (left_token, right_token) tuple to its number of
        occurrences. Sequences with fewer than two tokens contribute nothing,
        so an empty corpus yields an empty dict.
    """
    pairs = {}
    for tokens in tokenized_corpus:
        # zip(tokens, tokens[1:]) walks over every adjacent (left, right) couple.
        for pair in zip(tokens, tokens[1:]):
            pairs[pair] = pairs.get(pair, 0) + 1
    return pairs

### Now, given pairs, we can compute the most frequent pair of tokens via:

In [None]:
# `pairs` has to be computed in this cell: relying on a variable left over from
# an earlier interactive session breaks "Restart Kernel -> Run All".
# The statistics are taken on the character-level split of the corpus (step 1.1).
char_level_corpus = [list(text) for text in corpus]
pairs = get_pairs(char_level_corpus)

# The most frequent pair is the key with the largest count.
pair_to_merge = max(pairs, key=pairs.get)
pair_to_merge

### 1.4 Write a method which, given a pair_to_merge and the current tokenized_corpus, applies the merge operation to the tokenized_corpus
e.g. if the pair to merge is ('a', 'b') and some entry in tokenized_corpus contains ['c', ..., 'a', 'b'...], it should be mapped to ['c', ..., 'ab', ...]

In [None]:
def merge_pair(pair, tokenized_corpus):
    """Replace every occurrence of an adjacent token pair by the merged token.

    Args:
        pair: tuple (left, right) of two tokens (str) to merge.
        tokenized_corpus: list of token lists.

    Returns:
        A new list of token lists in which each non-overlapping occurrence of
        `left` immediately followed by `right` is replaced by `left + right`.
        Occurrences are matched left to right, so with pair ('a', 'a') the
        sequence ['a', 'a', 'a'] becomes ['aa', 'a']. The input is not modified.
    """
    left, right = pair
    merged_token = left + right

    merged_corpus = []
    for tokens in tokenized_corpus:
        merged_tokens = []
        i = 0
        while i < len(tokens):
            is_match = (
                i + 1 < len(tokens)
                and tokens[i] == left
                and tokens[i + 1] == right
            )
            if is_match:
                merged_tokens.append(merged_token)
                i += 2  # both members of the pair are consumed
            else:
                merged_tokens.append(tokens[i])
                i += 1
        merged_corpus.append(merged_tokens)
    return merged_corpus

### 1.5 Now write the full byte-pair encoding algorithm which, given an initial corpus (list of str), returns the vocabulary as well as the tokenized version of the corpus.
This function should take a parameter 'num_merges' indicating the number of merges we will do. 

What is the final length of the vocabulary ?

In [None]:
def byte_pair_encoding(corpus, num_merges: int = 10):
    """Learn a byte-pair-encoding vocabulary on `corpus`.

    The corpus is first split into characters; then, `num_merges` times, the
    most frequent pair of adjacent tokens is merged into a single new token.

    Args:
        corpus: list of str.
        num_merges: maximum number of merge operations. Training stops early
            when no adjacent pair is left to merge.

    Returns:
        A tuple (vocab, tokenized_corpus) where
        - vocab maps every token (str) to a unique int in [0, len(vocab) - 1];
          single characters come first (in sorted order), followed by the
          merged tokens in the order in which they were created;
        - tokenized_corpus[i] is the list of tokens (str) representing corpus[i].
    """
    tokenized_corpus = [list(text) for text in corpus]

    # Sorting makes the character ids reproducible from one run to the next.
    alphabet = sorted({char for tokens in tokenized_corpus for char in tokens})
    vocab = {char: idx for idx, char in enumerate(alphabet)}

    for _ in range(num_merges):
        # 1. Count adjacent pairs over the current tokenization.
        pair_counts = {}
        for tokens in tokenized_corpus:
            for pair in zip(tokens, tokens[1:]):
                pair_counts[pair] = pair_counts.get(pair, 0) + 1
        if not pair_counts:
            break  # every entry is a single token: nothing left to merge

        # 2. Pick the most frequent pair (ties: first pair encountered).
        left, right = max(pair_counts, key=pair_counts.get)
        merged_token = left + right

        # 3. Apply the merge to every entry of the corpus.
        next_corpus = []
        for tokens in tokenized_corpus:
            merged_tokens = []
            i = 0
            while i < len(tokens):
                if i + 1 < len(tokens) and tokens[i] == left and tokens[i + 1] == right:
                    merged_tokens.append(merged_token)
                    i += 2
                else:
                    merged_tokens.append(tokens[i])
                    i += 1
            next_corpus.append(merged_tokens)
        tokenized_corpus = next_corpus

        # 4. Register the new token; two different pairs may concatenate to the
        #    same string, in which case the existing id is kept.
        vocab.setdefault(merged_token, len(vocab))

    return vocab, tokenized_corpus

# Learn BPE on the training corpus with 50 merges; the result is unpacked as (vocab, tokenized_corpus).
vocab, tokenized_corpus = byte_pair_encoding(corpus, num_merges=50)

### 1.6 Now, given the obtained 'vocab' write a 'tokenize_to_str_list' method which applies the tokenization to an input string s, returns the list of tokens as a list of str
Verify your result on dataset['test'][0]

In [None]:
def tokenize_to_str_list(s: str, vocab):
    """Split `s` into BPE tokens using a learned vocabulary.

    The string is first split into characters. Then every multi-character
    token of `vocab` is applied as a merge, in the iteration order of `vocab`:
    two adjacent tokens are fused whenever their concatenation equals the
    vocabulary token. `vocab` is therefore expected to list merged tokens in
    the order in which they were learned (dict insertion order).

    Args:
        s: text to tokenize.
        vocab: dict mapping token (str) -> id (int).

    Returns:
        The list of tokens (str). Characters that are not in `vocab` are kept
        as single-character tokens. An empty string yields an empty list.
    """
    tokens = list(s)
    for candidate in vocab:
        # Single characters are never the result of a merge, and a merged
        # token can only be produced if it occurs in the text.
        if len(candidate) < 2 or candidate not in s:
            continue
        merged_tokens = []
        i = 0
        while i < len(tokens):
            if i + 1 < len(tokens) and tokens[i] + tokens[i + 1] == candidate:
                merged_tokens.append(candidate)
                i += 2
            else:
                merged_tokens.append(tokens[i])
                i += 1
        tokens = merged_tokens
    return tokens

# Check the tokenizer on the text of the first example of the test split.
tokenize_to_str_list(dataset['test'][0]['text'], vocab)

### 1.7 Now, given the obtained 'vocab', write a 'tokenize' method which applies the tokenization to an input string s and returns the list of integers corresponding to each symbol (the symbols are the keys of vocab, the integers are its values!)
Verify your result on dataset['test'][0]

In [None]:
def _bpe_split(s: str, vocab):
    """Split `s` into characters, then replay the merges stored in `vocab` (in dict order)."""
    tokens = list(s)
    for candidate in vocab:
        # Only multi-character entries that occur in the text can be produced by a merge.
        if len(candidate) < 2 or candidate not in s:
            continue
        merged_tokens = []
        i = 0
        while i < len(tokens):
            if i + 1 < len(tokens) and tokens[i] + tokens[i + 1] == candidate:
                merged_tokens.append(candidate)
                i += 2
            else:
                merged_tokens.append(tokens[i])
                i += 1
        tokens = merged_tokens
    return tokens


def tokenize(s: str, vocab):
    """Encode `s` as a list of integer token ids.

    Args:
        s: text to tokenize.
        vocab: dict mapping token (str) -> id (int); merged tokens are expected
            to appear in the order in which they were learned.

    Returns:
        The list of ids (values of `vocab`) of the BPE tokens of `s`.
        An empty string yields an empty list.

    Raises:
        KeyError: if `s` contains a character that is not in `vocab`.
    """
    return [vocab[token] for token in _bpe_split(s, vocab)]

# Check the integer encoding on the text of the first example of the test split.
tokenize(dataset['test'][0]['text'], vocab)

### => Yay, we are now able to map any input text to a list of integers, not too long, not too short, whose symbols are not too rare, and which preserves (at least partially) the structure of the words
### 1.8 What are the current limitations of our implementation of BPE ?

## 2. Dataset preprocessing


## We Tokenize the full training and test splits of the datasets, to obtain test_ids and train_ids lists
A good way to proceed is to use the '.map' method see https://huggingface.co/docs/datasets/en/process#map

In [None]:
def _add_token_ids(example):
    """Return the new 'ids' column for one dataset row: the token ids of its 'text'."""
    return {'ids': tokenize(example['text'], vocab)}


# Tokenize both splits with the same function and keep only the 'ids' column.
train_ids = dataset['training'].map(_add_token_ids)['ids']
test_ids = dataset['test'].map(_add_token_ids)['ids']
train_ids[0]

### 2.1 We will later on work on **batches** of data as is always the case when training deep learning models. (Why ?)
#### The problem here is that we cannot just stack the 'ids' as they have variable length:

In [None]:
dataloader = DataLoader(train_ids, batch_size=2, shuffle=True) # produces batch by 'collating' individual samples

# The default collation cannot stack sequences of different lengths. The error is
# the point of this cell, so we catch and display it instead of letting it abort
# a "Restart Kernel -> Run All".
try:
    for elt in dataloader:
        print(elt)
except RuntimeError as collate_error:
    print(f"Default collation failed: {collate_error}")

#### To solve this issue, we add a 'padding token' to the vocabulary

In [None]:
# Register the padding token under the next free id (only once, so the cell can
# be re-run safely), then read its id back from the vocabulary itself.
# Using len(vocab) *after* the insertion would be off by one, and
# len(vocab) - 1 is only correct while '<PAD>' is the last entry added.
if '<PAD>' not in vocab:
    vocab['<PAD>'] = len(vocab)
pad_id = vocab['<PAD>']

### 2.3 Implement a 'collate_fn' which pads the input sequences using the pad_id, and returns the padded ids together with the non-padding mask
For instance if input contains
[[1, 2, 3], [5, 6, 7, 8]]
you should return a torch tensor of type long [[1, 2, 3, pad_id], [5, 6, 7, 8]] as 'ids' and
a tensor of type long [[1, 1, 1, 0], [1, 1, 1, 1]] as mask


In [None]:
def collate_fn(batch, pad_value=None): # here batch is just the list of lists of token_ids.
    """Pad a batch of variable-length id sequences to a common length.

    Args:
        batch: list of lists of token ids (int).
        pad_value: id used for padding. When None (the default, which is how
            DataLoader calls this function), the notebook-level `pad_id` is used.

    Returns:
        A dict with two long tensors of shape (len(batch), longest_sequence):
        - 'ids': the token ids, right-padded with the padding id;
        - 'mask': 1 at real-token positions, 0 at padding positions.
    """
    if pad_value is None:
        pad_value = pad_id

    max_len = max((len(seq) for seq in batch), default=0)
    ids = torch.full((len(batch), max_len), pad_value, dtype=torch.long)
    mask = torch.zeros((len(batch), max_len), dtype=torch.long)

    for row, seq in enumerate(batch):
        length = len(seq)
        if length == 0:
            continue  # the row stays fully padded
        ids[row, :length] = torch.as_tensor(seq, dtype=torch.long)
        mask[row, :length] = 1

    return {'ids': ids, 'mask': mask}

### Verify that this now works:

In [None]:
# Same data and batch size as the failing loader above, but each batch is now
# assembled by our own collate_fn instead of the default collation.
dataloader = DataLoader(train_ids, batch_size=2, shuffle=True, collate_fn=collate_fn) # produces batch by 'collating' individual samples

# Iterate over the whole training set and print every collated batch.
for elt in dataloader: 
    print(elt) 

### Ok now we have a dataloader which yields batches of padded tokenized texts. We are ready to start implementing the transformers architecture. We start with the attention layer.

## 3. Attention layer
Remember the attention layer takes as input a (batched) sequence of hidden states (shape (B, T, H)), and returns a tensor of same dimension exactly containing the attention values.
We will start with an example. 
Let's assume we have this input x to the layer and a head_size of 4 (head_size is dimension of query/keys/values)

In [None]:
# Fix the random seed so that the example tensor (and every value derived from it
# in the following cells) is identical on every run of the notebook.
torch.manual_seed(0)

B, T, H = 2, 7, 16
x = torch.randn((B, T, H)) # batch  of 2 sequences of 7 hidden states of dimension 16 (B, T, H) = (2, 7, 16)
head_size = 4  # dimension of the queries / keys / values of one attention head

### 3.1 Declare key, query and values pytorch modules to produce, from this x, the keys, queries and values. Apply these to get (B, T, head_size) tensors of K, Q, V
### We want the values to have shape (B, T, head_size)
hint: in torch, matrices are represented as bias-less Linear modules https://pytorch.org/docs/stable/generated/torch.nn.Linear.html

In [None]:
# TODO
# At the end of this cell, you should have v, q, k computed from x.

### 3.2 Q and K are (B, T, head_size) matrices. To compute the attention scores, we just need to compute the batch matrix multiplication of Q and K.transpose(1, 2): this op will do: (B, T, head_size) x (B, head_size, T) -> (B, T, T): one (unnormalized) attention score for each pair of tokens. Implement this using torch.matmul.

In [None]:
# TODO

### 3.3 Now we need to apply the causal masking so that past tokens do not attend to future tokens, but future tokens do attend to past tokens. To do this, we create a lower-triangular mask for each element of the batch:

In [None]:
# Lower-triangular (T, T) matrix of ones, expanded over the batch dimension and cast to bool:
# mask[b, i, j] is True iff j <= i, i.e. position i is allowed to look at position j.
mask = torch.tril(torch.ones(T, T)).expand(B, T, T).bool()
mask

### Use this mask to set the attention_scores to -float('inf') wherever the mask is 0 (False). 

In [None]:
# TODO

### 3.4 Apply the softmax to normalize row-wise the masked attention scores. Why did we set attention scores to -float('inf') outside of the mask ?

In [None]:
# TODO

### 3.5 With another matrix multiplication between the attention_scores (B, T, T) and the values (B, T, head_size), obtain the output values as a (B, T, head_size) tensor.

In [None]:
# TODO

### Now gather it all to create an Attention nn.Module object implementing this attention operation.

In [None]:
class Attention(nn.Module):
    """ one head of causal self-attention

    Maps a batch of hidden states of shape (B, T, hidden_state_dim) to a batch
    of attention values of shape (B, T, head_size). Position t only attends to
    positions <= t.
    """
    def __init__(self, hidden_state_dim, head_size):
        """
        Args:
            hidden_state_dim: size H of each input hidden state.
            head_size: size of the keys, queries and values.
        """
        super().__init__()
        self.head_size = head_size
        # Plain matrices, i.e. bias-less linear maps H -> head_size.
        self.key = nn.Linear(hidden_state_dim, head_size, bias=False)
        self.query = nn.Linear(hidden_state_dim, head_size, bias=False)
        self.value = nn.Linear(hidden_state_dim, head_size, bias=False)

    def forward(self, x):
        """Compute the attention values for x of shape (B, T, hidden_state_dim)."""
        seq_len = x.size(-2)
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        v = self.value(x)  # (B, T, head_size)

        # (B, T, head_size) x (B, head_size, T) -> (B, T, T).
        # Scores are divided by sqrt(head_size) (standard scaled dot-product
        # attention) so that the softmax does not saturate for large heads.
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.head_size ** -0.5

        # Causal mask: -inf becomes exactly 0 after the softmax, so future
        # positions receive no weight.
        causal_mask = torch.tril(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device)
        )
        scores = scores.masked_fill(~causal_mask, float('-inf'))

        weights = torch.softmax(scores, dim=-1)  # each row sums to 1
        return torch.matmul(weights, v)          # (B, T, head_size)

attention = Attention(hidden_state_dim=16, head_size = 4)

### 3.6 Now create a multi-head attention layer that, given some x, computes n_head attention separately, each with a head size of (hidden_state_dim / n_heads). Each head produces a (B, T, head_size) set of values. The multi head attention layer should concatenate them into a (B, T, head_size*n_heads=hidden_dim) and apply a final 'projection' layer which is just a linear map.
You can assume that n_heads divides hidden_state_dim.

In [None]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of causal self-attention in parallel

    The hidden state is projected to keys, queries and values of size
    hidden_state_dim, which are split into n_heads chunks of size
    head_size = hidden_state_dim // n_heads. Each chunk is processed as an
    independent attention head; the head outputs are concatenated back to
    (B, T, hidden_state_dim) and passed through a final linear projection.
    """

    def __init__(self, hidden_state_dim, n_heads):
        """
        Args:
            hidden_state_dim: size H of each hidden state (input and output).
            n_heads: number of attention heads; must divide hidden_state_dim.

        Raises:
            ValueError: if n_heads does not divide hidden_state_dim.
        """
        super().__init__()
        if hidden_state_dim % n_heads != 0:
            raise ValueError(
                f"n_heads ({n_heads}) must divide hidden_state_dim ({hidden_state_dim})"
            )
        self.n_heads = n_heads
        self.head_size = hidden_state_dim // n_heads
        # One H -> H matrix per role; rows [h * head_size, (h + 1) * head_size)
        # of each output belong to head h.
        self.key = nn.Linear(hidden_state_dim, hidden_state_dim, bias=False)
        self.query = nn.Linear(hidden_state_dim, hidden_state_dim, bias=False)
        self.value = nn.Linear(hidden_state_dim, hidden_state_dim, bias=False)
        self.proj = nn.Linear(hidden_state_dim, hidden_state_dim)

    def _split_heads(self, t):
        """Reshape (B, T, H) into (B, n_heads, T, head_size)."""
        batch, seq_len, _ = t.shape
        return t.view(batch, seq_len, self.n_heads, self.head_size).transpose(1, 2)

    def forward(self, x):
        """Apply multi-head causal self-attention to x of shape (B, T, hidden_state_dim)."""
        batch, seq_len, hidden = x.shape
        k = self._split_heads(self.key(x))
        q = self._split_heads(self.query(x))
        v = self._split_heads(self.value(x))

        # (B, n_heads, T, head_size) x (B, n_heads, head_size, T) -> (B, n_heads, T, T)
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.head_size ** -0.5

        # The (T, T) causal mask broadcasts over the batch and head dimensions.
        causal_mask = torch.tril(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device)
        )
        scores = scores.masked_fill(~causal_mask, float('-inf'))
        weights = torch.softmax(scores, dim=-1)

        heads = torch.matmul(weights, v)  # (B, n_heads, T, head_size)
        # Concatenate the heads along the feature dimension: back to (B, T, H).
        concatenated = heads.transpose(1, 2).reshape(batch, seq_len, hidden)
        return self.proj(concatenated)
    
# Sanity check on the example input x: with hidden_state_dim=16 and 4 heads,
# the output size should be the same as the input size.
mha = MultiHeadAttention(16, 4)
mha(x).size()