In [1]:
from datasets import load_dataset
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch
from torch import nn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset('notaphoenix/shakespeare_dataset')
dataset

DatasetDict({
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 429
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1072
    })
    training: Dataset({
        features: ['text', 'label'],
        num_rows: 3859
    })
})

In [3]:
dataset['training'][0]

{'text': 'How well my comfort is revived by this !\n', 'label': 1}

### How many characters/words are there in total in the train split of the dataset ?

In [4]:
n_chars = sum(len(elt['text']) for elt in dataset['training'])
n_words = sum(len(elt['text'].split(' ')) for elt in dataset['training'])
n_chars, n_words

(179380, 40229)

# 1. Byte-pair encoding

In [5]:
# For convenience we use the training data as a list of str:
corpus = [elt['text'] for elt in dataset['training']]
corpus[0]

'How well my comfort is revived by this !\n'

### 1.1 First step: build a list "tokenized_corpus" of same size as the corpus such that tokenized_corpus[i] is the list of characters in corpus[i] (just by splitting the str into its list of characters)
So tokenized_corpus starts by representing each corpus entry by a list of characters. As a reminder, Byte-pair encoding will iteratively refine it by merging the most frequent pairs of characters:
['a', 'b', 'c', 'a', 'b'] -> ['ab', 'c', 'ab']: this is the representation which we store in 'tokenized_corpus'.

In [6]:
tokenized_corpus = [list(word) for word in corpus]

### 1.2 Now build a dictionary "vocab" mapping each unique character in the corpus to a unique integer between 0 and the total number of characters - 1

In [7]:
vocab = {char: i for i, char in enumerate(set("".join(corpus)))}
vocab.items()

dict_items([('G', 0), (';', 1), (',', 2), ('h', 3), ('F', 4), ('D', 5), ('i', 6), ('b', 7), ('r', 8), ('o', 9), ('g', 10), ('K', 11), ('N', 12), ('x', 13), ('\n', 14), ('T', 15), ('C', 16), ('Y', 17), ('W', 18), ('f', 19), ('z', 20), ('H', 21), ('v', 22), ('B', 23), ('J', 24), ('.', 25), (':', 26), ('Z', 27), ('(', 28), ('q', 29), ('k', 30), ('s', 31), ('"', 32), ('a', 33), ('m', 34), ('V', 35), ('t', 36), ('j', 37), ('-', 38), ('Q', 39), ('O', 40), ('S', 41), ('A', 42), ('R', 43), ('w', 44), ('y', 45), (')', 46), ('?', 47), ('P', 48), ('E', 49), ('e', 50), ('u', 51), ('I', 52), ('M', 53), ('l', 54), ('n', 55), ("'", 56), ('p', 57), ('0', 58), (' ', 59), ('U', 60), ('!', 61), ('L', 62), ('d', 63), ('c', 64)])

### 1.3 Now implement a method which, given 'tokenized_corpus' computes a dictionary 'pairs' of which:
- the keys are pairs of elements appearing in the corpus (if corpus contains ['c', ..., 'a', 'b', ...] then the tuple ('a', 'b') should appear as key in pairs)
- the values are the number of times each pair appears in the corpus

(The right way is to build the pairs dictionary by iterating over the tokenized_corpus)

In [8]:
def get_pairs(tokenized_corpus):
    """Get the frequency of adjacent pairs in the words."""
    pairs = {}
    for word in tokenized_corpus:
        for i in range(len(word) - 1):
            if (word[i], word[i + 1]) not in pairs:
                pairs[(word[i], word[i + 1])] = 0
            pairs[(word[i], word[i + 1])] += 1
    return pairs

pairs = get_pairs(tokenized_corpus)
pairs.items()

dict_items([(('H', 'o'), 80), (('o', 'w'), 609), (('w', ' '), 515), ((' ', 'w'), 1722), (('w', 'e'), 461), (('e', 'l'), 650), (('l', 'l'), 1193), (('l', ' '), 1293), ((' ', 'm'), 2017), (('m', 'y'), 516), (('y', ' '), 2372), ((' ', 'c'), 972), (('c', 'o'), 490), (('o', 'm'), 739), (('m', 'f'), 16), (('f', 'o'), 506), (('o', 'r'), 1106), (('r', 't'), 337), (('t', ' '), 3782), ((' ', 'i'), 1329), (('i', 's'), 1235), (('s', ' '), 3386), ((' ', 'r'), 402), (('r', 'e'), 1626), (('e', 'v'), 188), (('v', 'i'), 179), (('i', 'v'), 244), (('v', 'e'), 1096), (('e', 'd'), 623), (('d', ' '), 2690), ((' ', 'b'), 1422), (('b', 'y'), 150), ((' ', 't'), 3839), (('t', 'h'), 3279), (('h', 'i'), 1123), ((' ', '!'), 402), (('!', '\n'), 379), (('B', 'u'), 105), (('u', 't'), 596), (('i', 'f'), 197), (('f', ' '), 776), ((' ', 'I'), 761), (('I', ' '), 909), ((' ', 's'), 2087), (('s', 'h'), 474), (('h', 'a'), 1557), (('a', 'k'), 279), (('k', 'e'), 442), (('e', ' '), 6284), (('i', 't'), 882), ((' ', 'u'), 275), 

### Now, given pairs, we can compute the most frequent pair of tokens via:

In [9]:
pair_to_merge = max(pairs, key=pairs.get)
pair_to_merge

('e', ' ')

### 1.4 Write a method which, given a pair_to_merge and the current tokenized_corpus and applies the merge operation to the tokenized_corpus
e.g. if the pair to merge is ('a', 'b') and some entry in tokenized_corpus contains ['c', ..., 'a', 'b'...], it should be mapped to ['c', ..., 'ab', ...]

In [10]:
def merge_pair(pair, tokenized_corpus):
    """Merge the most frequent pair in all tokenized_corpus."""
    new_words = []
    for word in tokenized_corpus:
        new_word = []
        i = 0
        while i < len(word):
            if i < len(word) - 1 and (word[i], word[i + 1]) == pair:
                new_word.append(word[i] + word[i + 1])  # Merge pair
                i += 2  # Skip next character since it's merged
            else:
                new_word.append(word[i])
                i += 1
        new_words.append(new_word)
    return new_words

tokenized_corpus = merge_pair(pair_to_merge, tokenized_corpus)
tokenized_corpus[0]

['H',
 'o',
 'w',
 ' ',
 'w',
 'e',
 'l',
 'l',
 ' ',
 'm',
 'y',
 ' ',
 'c',
 'o',
 'm',
 'f',
 'o',
 'r',
 't',
 ' ',
 'i',
 's',
 ' ',
 'r',
 'e',
 'v',
 'i',
 'v',
 'e',
 'd',
 ' ',
 'b',
 'y',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 '!',
 '\n']

### 1.5 Now write the full byte-pair encoding algorithm which, given an initial corpus (list of str) returns its tokenized version as well as the vocabulary.
This function should take a parameter 'n_merge' indicating the number of merges we will do. 

What is the final length of the vocabulary ?

In [11]:
def byte_pair_encoding(corpus, num_merges: int = 10):
    """Perform BPE on a given corpus."""
    tokenized_corpus = [list(word) for word in corpus]  # Start with character tokens
    vocab = {char: i for i, char in enumerate(set("".join(corpus)))}  # Initial vocab
    
    for _ in range(num_merges):
        pairs = get_pairs(tokenized_corpus)
        if not pairs:
            break
        best_pair = max(pairs, key=pairs.get)
        tokenized_corpus = merge_pair(best_pair, tokenized_corpus)
        new_token = best_pair[0] + best_pair[1]
        vocab[new_token] = len(vocab)  # Assign new token an index
    
    return vocab, tokenized_corpus

vocab, tokenized_corpus = byte_pair_encoding(corpus, num_merges=500)

### 1.6 Now, given the obtained 'vocab' write a 'tokenize_to_str_list' method which applies the tokenization to an input string s, returns the list of tokens as a list of str
Verify your result on dataset['test'][0]

In [12]:
def tokenize_to_str_list(s: str, vocab):
    """Tokenize a given string based on the trained BPE vocabulary."""
    tokens = list(s)  # Start with character tokens
    
    while True:
        pairs = [(tokens[i], tokens[i + 1]) for i in range(len(tokens) - 1)]
        valid_pairs = [pair for pair in pairs if pair[0] + pair[1] in vocab]
        
        if not valid_pairs:
            break
        
        best_pair = max(valid_pairs, key=lambda p: vocab.get(p[0] + p[1], float('-inf')))
        merged_token = best_pair[0] + best_pair[1]
        
        new_tokens = []
        i = 0
        while i < len(tokens):
            if i < len(tokens) - 1 and (tokens[i], tokens[i + 1]) == best_pair:
                new_tokens.append(merged_token)
                i += 2
            else:
                new_tokens.append(tokens[i])
                i += 1
        
        tokens = new_tokens
    
    return tokens    

tokenize_to_str_list(dataset['test'][0]['text'], vocab)

['L',
 'ea',
 've ',
 'me ',
 'a',
 'lo',
 'ne',
 ' ',
 'for ',
 'a ',
 'min',
 'u',
 'te ',
 '.\n']

### 1.7 Now, given the obtained 'vocab' write a 'tokenize' method which applies the tokenization to an input string s, returns the list of integers corresponding to each symbol (the keys of vocab!)
Verify your result on dataset['test'][0]

In [13]:
def tokenize(s: str, vocab):
    tokens = tokenize_to_str_list(s, vocab)
    return [vocab[elt] for elt in tokens]

tokenize(dataset['test'][0]['text'], vocab)

[62, 81, 98, 88, 33, 165, 283, 59, 163, 100, 427, 51, 296, 70]

### => Yay, we are not able to map any input text to list of integers, not too long, not too short, of which representatens are not too rare, and which preserves (at least partially) the structure of the words
### 1.8 What are the current limitations of our implementation of BPE ?

- multiple redundant tokenizations e.g. dog! and dog?
- we need to handle unknown characters to not run into errors later on

## 2. Dataset preprocessing


### 2.1 Tokenize the full training and test splits of the datasets, to obtain test_ids and train_ids lists
A good way to proceed is to the '.map' method see https://huggingface.co/docs/datasets/en/process#map

In [14]:
train_ids = dataset['training'].map(lambda s: {'ids': tokenize(s['text'], vocab)})['ids']
test_ids = dataset['test'].map(lambda s: {'ids': tokenize(s['text'], vocab)})['ids']
train_ids[0]

Map: 100%|██████████| 3859/3859 [00:01<00:00, 3041.16 examples/s]
Map: 100%|██████████| 1072/1072 [00:00<00:00, 3050.86 examples/s]


[379, 322, 112, 247, 230, 66, 85, 168, 244, 22, 105, 231, 162, 127]

### 2.2 We will later on work on **batches** of data as is always the case when training deep learning models.
#### The problem here is that we cannot just stack the 'ids' as they have variable length:

In [15]:
dataloader = DataLoader(train_ids, batch_size=2, shuffle=True) # produces batch by 'collating' individual samples
for elt in dataloader: # will raise an error !
    print(elt) 

[tensor([441,  86]), tensor([216, 216]), tensor([213, 168]), tensor([ 84, 363]), tensor([ 45, 359]), tensor([36, 87]), tensor([119, 322]), tensor([ 55, 560]), tensor([93, 86]), tensor([136, 374]), tensor([207, 146]), tensor([106,  69]), tensor([ 54, 211]), tensor([ 66, 452]), tensor([185, 385]), tensor([258,  70])]


RuntimeError: each element in list of batch should be of equal size

#### To solve this issue, we add a 'padding token' to the vocabulary

In [16]:
if '<PAD>' not in vocab:
    vocab['<PAD>'] = len(vocab)
    pad_id = len(vocab)
else:
    pad_id = len(vocab) - 1

### 2.3 implement a 'collate_batch_fn' which pads the sequences found as input using the pad_id, and returns the non-padding mask
For instance if input contains
[[1, 2, 3], [5, 6, 7, 8]]
you should return a torch tensor of type long [[1, 2, 3, pad_id], [5, 6, 7, 8]] as 'ids' and
a tensor of type long [[1, 1, 1, 0], [1, 1, 1, 1]] as mask


In [17]:
def collate_fn(batch):
    ids = [torch.tensor(item, dtype=torch.long) for item in batch]
    
    # maximum length found in the batch: we'll pad to this length 
    max_len = max(len(seq) for seq in ids)
    
    # Forming the output tensors (empty for now)
    padded_ids = pad_id * torch.ones((len(ids), max_len), dtype=torch.long)
    mask = torch.zeros((len(ids), max_len), dtype=torch.long)

    for i, seq in enumerate(ids):
        length = len(seq)
        padded_ids[i, :length] = seq  # Copy original sequence
        mask[i, :length] = 1  # Mark real tokens as 1

    return {
        'ids': padded_ids,
        'mask': mask
    }

### Verify that this now works:

In [18]:
dataloader = DataLoader(train_ids, batch_size=2, shuffle=True, collate_fn=collate_fn) # produces batch by 'collating' individual samples

for elt in dataloader: 
    print(elt) 

{'ids': tensor([[172, 157, 284,  54,  66, 151, 238,   9, 554, 239, 279,  70, 566, 566,
         566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566, 566],
        [277, 448,  71,   3, 403,  59,   7, 220,  75, 106, 290, 103, 214, 106,
         434, 368,   3, 403,  59, 178, 105, 361, 183,  88,   3, 275, 496, 127]]), 'mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}
{'ids': tensor([[ 41, 133, 159,  19,   8, 220,  59,  70],
        [476, 171, 405,  31, 140, 477,  96, 566]]), 'mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0]])}
{'ids': tensor([[ 17,  72, 380, 146, 140, 232, 249, 237, 442, 523, 162, 390, 137, 188,
         347,  36, 238, 502],
        [287, 506, 185, 453, 324,  66,   9,  64,  64, 412,   6, 130, 163,  56,
         406, 566, 566, 566]]), 'mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1,

### Ok now we have a dataloader which yields batches of padded tokenized texts. We are ready to start implementing the transformers architecture. We start with the attention layer.

## 3. Attention layer
Remember the attention layer takes as input a (batched) sequence of hidden states (shape (B, T, H)), and returns a tensor of same dimension exactly containing the attention values.
We will start with an example. 
Let's assume we have this input x to the layer and a head_size of 4 (head_size is dimension of query/keys/values)

In [19]:
x = torch.randn((2, 7, 16)) # batch  of 2 sequences of 7 hidden states of dimension 16 (B, T, H) = (2, 7, 16)
B, T, H = 2, 7, 16
head_size = 4

### 3.1 Declare key, query and values pytorch modules to produce, from this x, the keys, queries and values. Apply these to get (B, T, head_size) tensors of K, Q, V
### We want the values to have shape (B, T, head_size)
hint: in torch, matrices are represented as bias-less Linear modules https://pytorch.org/docs/stable/generated/torch.nn.Linear.html

In [20]:
key_matrix = nn.Linear(16, head_size, bias=False)
query_matrix = nn.Linear(16, head_size, bias=False)
value_matrix = nn.Linear(16, head_size, bias=False)

k = key_matrix(x)
q = query_matrix(x)
v = value_matrix(x)
k.size(), q.size(), v.size()

(torch.Size([2, 7, 4]), torch.Size([2, 7, 4]), torch.Size([2, 7, 4]))

### 3.2 Q and K are (B, T, head_size) matrices. To compute the attention scores, we just need to compute the batch matrix multiplication of Q and K.transpose(1, 2): this op will do: (B, T, head_size) x (B, head_size, T) - (B, T, T): one (unnormalized) attention score for each pair of tokens. Implement this using torch.matmul.

In [21]:
attention_scores = torch.matmul(q, k.transpose(1, 2))
attention_scores

tensor([[[-3.8208e-01, -6.6470e-02, -9.8356e-02,  1.3089e+00,  1.4015e-03,
           4.7517e-01, -9.5601e-01],
         [-3.1781e-01, -4.5300e-01, -8.0042e-01,  4.7143e-01,  8.5078e-02,
           7.0118e-02, -2.1299e-01],
         [-4.6926e-01, -6.8085e-03, -1.0894e-01, -4.2977e-01,  3.1460e-01,
          -3.8486e-01, -7.1973e-02],
         [-1.5252e+00,  1.7831e-01,  3.7032e-02, -3.6010e-01,  4.6869e-01,
           4.6872e-01,  5.4208e-01],
         [-5.4550e-01, -2.3830e-01, -5.0006e-01, -7.5354e-02,  2.3396e-01,
          -7.7827e-02,  3.8722e-02],
         [-2.1981e-01, -2.1243e-01, -3.9093e-01,  1.4063e-01,  8.8902e-02,
          -2.9983e-02, -9.9398e-02],
         [-4.8447e-01, -1.3916e-01, -3.4478e-01, -4.8812e-01,  2.3998e-01,
          -1.6260e-01,  3.5382e-01]],

        [[-1.3976e+00, -5.7669e-01,  4.4070e-01, -2.5164e-01, -1.2157e+00,
           3.6723e-01,  1.0607e+00],
         [ 1.2372e+00,  1.6251e-01, -4.9181e-01, -2.3032e-01,  1.9205e+00,
          -4.2598e-01, -1.1

### 3.3 Now we need to apply the causal masking so that past tokens do not attend future tokens, but future tokens do attend past tokens. To do this, we create a triangular inferior mask for each element of the batch:

In [22]:
mask = torch.tril(torch.ones(T, T)).expand(B, T, T).bool()
mask

tensor([[[ True, False, False, False, False, False, False],
         [ True,  True, False, False, False, False, False],
         [ True,  True,  True, False, False, False, False],
         [ True,  True,  True,  True, False, False, False],
         [ True,  True,  True,  True,  True, False, False],
         [ True,  True,  True,  True,  True,  True, False],
         [ True,  True,  True,  True,  True,  True,  True]],

        [[ True, False, False, False, False, False, False],
         [ True,  True, False, False, False, False, False],
         [ True,  True,  True, False, False, False, False],
         [ True,  True,  True,  True, False, False, False],
         [ True,  True,  True,  True,  True, False, False],
         [ True,  True,  True,  True,  True,  True, False],
         [ True,  True,  True,  True,  True,  True,  True]]])

### Use this mask to set to -float('inf') the attention_scores where the mask if 0. 

In [23]:
attention_scores[~mask] = -float('inf')
attention_scores

tensor([[[-0.3821,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
         [-0.3178, -0.4530,    -inf,    -inf,    -inf,    -inf,    -inf],
         [-0.4693, -0.0068, -0.1089,    -inf,    -inf,    -inf,    -inf],
         [-1.5252,  0.1783,  0.0370, -0.3601,    -inf,    -inf,    -inf],
         [-0.5455, -0.2383, -0.5001, -0.0754,  0.2340,    -inf,    -inf],
         [-0.2198, -0.2124, -0.3909,  0.1406,  0.0889, -0.0300,    -inf],
         [-0.4845, -0.1392, -0.3448, -0.4881,  0.2400, -0.1626,  0.3538]],

        [[-1.3976,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
         [ 1.2372,  0.1625,    -inf,    -inf,    -inf,    -inf,    -inf],
         [ 0.3874, -0.0169, -0.5161,    -inf,    -inf,    -inf,    -inf],
         [ 0.7296,  1.4199,  0.0923, -1.2964,    -inf,    -inf,    -inf],
         [-0.6125,  0.6582,  1.4963, -1.4848,  0.1884,    -inf,    -inf],
         [ 0.4595,  1.3367, -0.3314, -0.7331,  0.7217, -0.3854,    -inf],
         [ 0.2499,  0.6146, -0.3289,

### 3.4 Apply the softmax to normalize row-wise the masked attention scores. Why did we set attention scores to -float('inf') outside of the mask ?

In [24]:
attention_scores = torch.softmax(attention_scores, dim=1)
attention_scores

tensor([[[0.1605, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1712, 0.1202, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1471, 0.1877, 0.2284, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0512, 0.2259, 0.2643, 0.2058, 0.0000, 0.0000, 0.0000],
         [0.1363, 0.1489, 0.1545, 0.2736, 0.3483, 0.0000, 0.0000],
         [0.1888, 0.1528, 0.1723, 0.3395, 0.3013, 0.5331, 0.0000],
         [0.1449, 0.1645, 0.1804, 0.1811, 0.3504, 0.4669, 1.0000]],

        [[0.0232, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3236, 0.0847, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1383, 0.0708, 0.0786, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1948, 0.2980, 0.1444, 0.1441, 0.0000, 0.0000, 0.0000],
         [0.0509, 0.1391, 0.5878, 0.1193, 0.2751, 0.0000, 0.0000],
         [0.1487, 0.2742, 0.0945, 0.2530, 0.4689, 0.4481, 0.0000],
         [0.1205, 0.1332, 0.0947, 0.4836, 0.2560, 0.5519, 1.0000]]],
       grad_fn=<SoftmaxBackward0>)

### 3.5 With another matrix multiplication between the attention_scores (B, T, T) and the values (B, T, head_size), obtain the values as as (B, T, head_size) tensor.

In [25]:
values = torch.matmul(attention_scores, v)
values.size()

torch.Size([2, 7, 4])

### Now gather it all to create an Attention nn.Module object implementing this attention operation.

In [26]:
class Attention(nn.Module):
    """ one head of self-attention """
    def __init__(self, hidden_state_dim, head_size):
        super().__init__()
        self.key = nn.Linear(hidden_state_dim, head_size, bias=False)
        self.query = nn.Linear(hidden_state_dim, head_size, bias=False)
        self.value = nn.Linear(hidden_state_dim, head_size, bias=False)

    def forward(self, x):
        B,T,H = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        v = self.value(x)
        # compute attention scores ("affinities")
        attention_scores = torch.matmul(q, k.transpose(1, 2))
        mask = torch.tril(torch.ones(T, T)).expand(B, T, T).bool()
        attention_scores[~mask] = -float('inf')
        attention_scores = torch.softmax(attention_scores, dim=1)
        # perform the weighted aggregation of the values
        values = torch.matmul(attention_scores, v)
        return values
    
attention = Attention(hidden_state_dim=16, head_size = 4)

### 3.6 Now create a multi-head attention layer that, given some x, computes n_head attention in parallel, each with a head size of (hidden_state_dim / n_heads). Each head produces a (B, T, head_size) set of values. The multi head attention layer should concatenate them into a (B, T, head_size*n_heads=hidden_dim) and apply a final 'projection' layer which is just a linear map.
You can assume that n_heads divide hidden_state_dim.

In [27]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, hidden_state_dim, n_heads):
        super().__init__()
        self.heads = nn.ModuleList([Attention(hidden_state_dim, hidden_state_dim // n_heads) for _ in range(n_heads)])
        self.proj = nn.Linear(hidden_state_dim, hidden_state_dim)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        return self.proj(out)
    

mha = MultiHeadAttention(16, 4)
mha(x).size()

torch.Size([2, 7, 16])