# Encoders mini-project:
## Implementation of a numerical sequence generator

### I. Introduction

As seen in the explanation.ipnyb notebook, enven though decoders as implemented in this repository were introduced as part of a bigger architecture - the *transformer* architecture - they can be used as a standalone architecture for sequences generation. 

In this notebook, we will implement a simple sequence generator and make different tests and observation to illustrate what was said in the explanations. 

### II. Implementation of the model

First, let's import the code from model.py. The containt of this file if precisely what was done in the explanations notebook:

In [265]:
from model import SelfAttention, TransformerBlock, StandaloneDecoderBlock, StandaloneDecoder
import torch
import torch.nn as nn
import torch.functional as F

### III. Configuration

In [266]:
BATCH_SIZE = 16
NUM_EPOCHS = 2
VOCAB_SIZE = 13
EMBED_SIZE = 256
NUM_LAYERS = 4
NUM_HEADS = 4
FORWARD_EXPANSION = 4
LEARNING_RATE = 5e-4
DROPOUT = 0.2
MAX_LENGTH=30
PAD_TOKEN_ID=11


### IV. Creation of the dataset

For this mini-project, we will make the dataset class ourself, using the data from https://gist.github.com/elifiner/cc90fdd387449158829515782936a9a4 

The file text is available in the same folder as this notebook.

In [None]:
from torch.utils.data import Dataset
import random as rd
import numpy as np

class ArithmeticSequenceDataset(Dataset):
    def __init__(self, size=1000, length=6, proportions=(50,50), max_length=30):
        super().__init__()

        self.size = size
        self.length = length
        self.proportions = proportions
        self.max_length = max_length

        self.max_value = self.make_sequences()

        # We could make different choices for the vocabulary, but we will choose to limit as
        # much as possible its size
        self.vocab = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                      '<SEP>',
                      '<PAD>',
                      '<EOS>']

        self.vocab2idx = {c: i for i, c in enumerate(self.vocab)}
        self.idx2vocab = {i: c for c, i in self.vocab2idx.items()}

        # Convert sequences to tokens
        self.tokenized_sequences = [self.tokenize_sequence(seq) for seq in self.sequences]

    def tokenize_sequence(self, sequence):
        """
        Input:
            sequence (list): example: [1, 23, 456]
          
        Returns:
            (list): example: ['1', '<SEP>', '2', '3', '<SEP>', '4', '5', '6', '<EOS>']
        """
        tokens = []
        
        for i, num in enumerate(sequence):
            for digit in str(num):
                tokens.append(digit)
            
            if i < len(sequence) - 1:
                tokens.append('<SEP>')
        
        tokens.append('<EOS>')
        
        return tokens
    
    def detokenize_sequence(self, tokens):
        """
        Input:
            sequence (list): example: ['1', '<SEP>', '2', '3', '<SEP>', '4', '5', '6', '<EOS>']
          
        Returns:
            (list): example: [1, 23, 456]
        """

        numbers = []
        current_number = ""
        
        for token in tokens:
            if token in ['<SEP>', '<EOS>', '<PAD>']:
                if current_number:
                    numbers.append(int(current_number))
                    current_number = ""
                if token == '<EOS>':
                    break
            elif token.isdigit():
                current_number += token
        
        if current_number:
            numbers.append(int(current_number))
            
        return numbers

    def generate_random_arithmetic_sequence(self):
        """
        Makes an array of size length containing the length first values
        for U_n = a + b*n with a and b chosen randomly

        Input:
            length: number of values generated

        Returns:
            (np.array)
        """

        a = rd.randint(0, 5)
        b = rd.randint(1, 5)

        _seq = [] 
        for n in range(self.length):

            _seq.append(a + n*b)

        return np.array(_seq)
    
    def generate_random_geometric_sequence(self):
        """
        Makes an array of size length containing the length first values
        for U_n = a*b^n with a and b chosen randomly

        Input:
            length: number of values generated

        Returns:
            (np.array)
        """

        a = rd.randint(1, 4)
        b = rd.randint(2, 4)

        _seq = [] 
        for n in range(1, self.length+1):

            _seq.append(a*(b**n))

        return np.array(_seq)
    
    def make_sequences(self):
        self.sequences = []

        num_arithmetic = self.size*self.proportions[0]//100
        num_geometric = self.size*self.proportions[1]//100

        max_value = 0

        # Generate the arithmetic sequences:
        for k in range(num_arithmetic):
            generated_seq = self.generate_random_arithmetic_sequence()
            self.sequences.append(generated_seq)

            if generated_seq[-1] > max_value:
                max_value = generated_seq[-1]

        # Generate the geometric sequences:
        for k in range(num_geometric):
            generated_seq = self.generate_random_geometric_sequence()
            self.sequences.append(generated_seq)

            if generated_seq[-1] > max_value:
                max_value = generated_seq[-1]

        self.sequences = np.array(self.sequences)

        return max_value
    
    def pad_sequence(self, tokens):
        """Pad sequence to max_length"""

        if len(tokens) > self.max_length:
            return tokens[:self.max_length]  # Truncate
        
        # Pad
        
        padded = tokens + ['<PAD>'] * (self.max_length - len(tokens))
        return padded
    

    def __getitem__(self, idx):
        sequence = self.sequences[idx]  # [2, 4, 8, 16, 32]
        
        # Context size to decide how much context is needed.
        # In our, case, we need the first 3 numbers + 3 seperators
        context_size = 3  # Minimum to get the pattern
        predict_size = len(sequence) - context_size
        
        # Split context and target
        context = sequence[:context_size]    # [2, 4, 8]
        target = sequence[context_size:]     # [16, 32]
        
        # Tokenize
        context_tokens = self.tokenize_sequence(context.tolist())[:-1] + ['<SEP>'] # ['2', '<SEP>', '4', '<SEP>', '8']
        target_tokens = self.tokenize_sequence(target.tolist()) # ['<SEP>', '8', '<SEP>', '1', '6', '<SEP>', '3', '2', '<EOS>']
        
        full_sequence = context_tokens + target_tokens
        
        # Pad to max_length
        padded_sequence = self.pad_sequence(full_sequence)

        context_len = len(context_tokens)
        
        # Create input and target tensors
        input_ids = torch.tensor([self.vocab2idx[token] for token in padded_sequence[:-1]], dtype=torch.long)
        target_ids = torch.tensor([self.vocab2idx[token] for token in padded_sequence[context_len:]], dtype=torch.long)
        
        # Create loss mask (only predict on target tokens, not context)
        loss_mask = torch.cat([
            torch.zeros(context_len, dtype=torch.float),  # Don't compute loss on context
            torch.ones(len(target_tokens), dtype=torch.float),  # Compute loss on predictions
            torch.zeros(self.max_length - context_len - len(target_tokens), dtype=torch.float)  # Don't compute loss on padding
        ])
        
        return input_ids, target_ids, loss_mask

    
            
    def __len__(self):
        return len(self.sequences)
    
test = ArithmeticSequenceDataset(size=10, max_length=30)
print(test.__getitem__(5))
print(len(test.__getitem__(5)[0]), len(test.__getitem__(5)[2]))

# Problème à résoudre sur le fait que nos target_ids ne font pas tous la même taille que c'est apparemment un problème pour notre dataloader
# Solution = padder après ? Mais du coup un peu difficile de savoir à l'avance combien de padding appliquer pour être bon sur l'ensemble
# du dataset ?

UnboundLocalError: local variable 'input_ids' referenced before assignment

We will also need to implement the associated dataloader

Our target_ids don't have the same dimension, which will be a problem to use the loaders in our training loop. For this reason, we will make a collate function whil will make sure that all of our samples have the same dimensions:

In [268]:
def collate_fn(batch):
    """
    Input: batch = [(input_ids_0, target_ids_0, mask_0), 
                    (input_ids_1, target_ids_1, mask_1), 
                    (input_ids_2, target_ids_2, mask_2)]
    
    Output: (batched_inputs, batched_targets, batched_masks)
    """

    input_ids, target_ids, loss_masks = zip(*batch)

    # Find the biggest length in batch
    max_input_len = max(len(seq) for seq in input_ids)
    max_target_len = max(len(seq) for seq in target_ids)
    max_mask_len = max(len(mask) for mask in loss_masks)

    max_len = max(max_input_len, max_target_len, max_mask_len)

    padded_inputs = []
    padded_targets = []
    padded_masks = []
    
    for inp, tgt, mask in zip(input_ids, target_ids, loss_masks):
        # Padder input_ids
        if len(inp) < max_len:
            pad_length = max_len - len(inp)
            padded_inp = torch.cat([inp, torch.full((pad_length,), PAD_TOKEN_ID, dtype=torch.long)])
        else:
            padded_inp = inp
        padded_inputs.append(padded_inp)
        
        # Padder target_ids  
        if len(tgt) < max_len:
            pad_length = max_len - len(tgt)
            padded_tgt = torch.cat([tgt, torch.full((pad_length,), PAD_TOKEN_ID, dtype=torch.long)])
        else:
            padded_tgt = tgt
        padded_targets.append(padded_tgt)
        
        # Padder loss_masks (avec des 0.0 pour ignorer le padding)
        if len(mask) < max_len:
            pad_length = max_len - len(mask)
            padded_mask = torch.cat([mask, torch.zeros(pad_length, dtype=torch.float)])
        else:
            padded_mask = mask
        padded_masks.append(padded_mask)
    
    # Stack tous les tenseurs paddés
    batched_inputs = torch.stack(padded_inputs)    # Shape: (batch_size, max_len)
    batched_targets = torch.stack(padded_targets)  # Shape: (batch_size, max_len)
    batched_masks = torch.stack(padded_masks)      # Shape: (batch_size, max_len)
    
    return batched_inputs, batched_targets, batched_masks

In [269]:
from torch.utils.data import DataLoader

train_dataset = ArithmeticSequenceDataset(
    size=2000,
    length=6,
    proportions=(50, 50),
    max_length=MAX_LENGTH
)

test_dataset = ArithmeticSequenceDataset(
    size=400,
    length=6,
    proportions=(50, 50),
    max_length=MAX_LENGTH
)


train_loader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True,
    collate_fn=collate_fn
)

test_loader = DataLoader(
    test_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True,
    collate_fn=collate_fn
)

### V. Training

In [270]:
import torch.optim as optim
from sklearn.metrics import accuracy_score, classification_report

if torch.backends.mps.is_available():
    device = torch.device("mps")  # Apple Silicon GPU
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"USING DEVICE: {device}")

model = StandaloneDecoder(
    trg_vocab_size=VOCAB_SIZE,
    embed_size=EMBED_SIZE,
    num_layers=NUM_LAYERS,
    num_heads=4,
    forward_expansion=FORWARD_EXPANSION,
    dropout=DROPOUT,
    device=device,
    max_length=MAX_LENGTH
)

optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
# criterion = nn.CrossEntropyLoss() à adapter pour de la génération

USING DEVICE: mps


In [274]:
from tqdm import tqdm

for epoch in range(NUM_EPOCHS):
    print(f"\n EPOCH {epoch+1}/{NUM_EPOCHS}")

    #########################
    ##### Train epoch: ######
    #########################
    model.train()

    for batch in tqdm(train_loader, desc="Training"):
        print(batch[0])
    


 EPOCH 1/2


Training: 100%|██████████| 125/125 [00:00<00:00, 751.79it/s]


30 21
len padded 30
30 26
len padded 30
30 23
len padded 30
30 12
len padded 30
30 22
len padded 30
30 12
len padded 30
30 15
len padded 30
30 15
len padded 30
30 23
len padded 30
30 21
len padded 30
30 16
len padded 30
30 23
len padded 30
30 13
len padded 30
30 16
len padded 30
30 22
len padded 30
30 18
len padded 30
tensor([[ 9, 10,  2,  7, 10,  8,  1, 10,  2,  4,  3, 10,  7,  2,  9, 10,  2,  1,
          8,  7, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11],
        [ 1,  6, 10,  6,  4, 10,  2,  5,  6, 10,  1,  0,  2,  4, 10,  4,  0,  9,
          6, 10,  1,  6,  3,  8,  4, 12, 11, 11, 11, 11],
        [ 8, 10,  3,  2, 10,  1,  2,  8, 10,  5,  1,  2, 10,  2,  0,  4,  8, 10,
          8,  1,  9,  2, 12, 11, 11, 11, 11, 11, 11, 11],
        [ 3, 10,  4, 10,  5, 10,  6, 10,  7, 10,  8, 12, 11, 11, 11, 11, 11, 11,
         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11],
        [ 4, 10,  1,  6, 10,  6,  4, 10,  2,  5,  6, 10,  1,  0,  2,  4, 10,  4,
          0,  9,  6, 12, 11, 11, 11, 11, 11

Training:   0%|          | 0/125 [00:00<?, ?it/s]

30 16
len padded 30
30 16
len padded 30
30 25
len padded 30
30 15
len padded 30
30 21
len padded 30
30 14
len padded 30
30 26
len padded 30
30 19
len padded 30
30 16
len padded 30
30 17
len padded 30
30 16
len padded 30
30 16
len padded 30
30 16
len padded 30
30 15
len padded 30
30 18
len padded 30
30 25
len padded 30
tensor([[ 5, 10,  8, 10,  1,  1, 10,  1,  4, 10,  1,  7, 10,  2,  0, 12, 11, 11,
         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11],
        [ 5, 10,  8, 10,  1,  1, 10,  1,  4, 10,  1,  7, 10,  2,  0, 12, 11, 11,
         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11],
        [ 1,  2, 10,  4,  8, 10,  1,  9,  2, 10,  7,  6,  8, 10,  3,  0,  7,  2,
         10,  1,  2,  2,  8,  8, 12, 11, 11, 11, 11, 11],
        [ 2, 10,  4, 10,  8, 10,  1,  6, 10,  3,  2, 10,  6,  4, 12, 11, 11, 11,
         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11],
        [ 6, 10,  1,  8, 10,  5,  4, 10,  1,  6,  2, 10,  4,  8,  6, 10,  1,  4,
          5,  8, 12, 11, 11, 11, 11, 11, 11

Training: 100%|██████████| 125/125 [00:00<00:00, 635.05it/s]

tensor([[ 6, 10,  1,  8, 10,  5,  4, 10,  1,  6,  2, 10,  4,  8,  6, 10,  1,  4,
          5,  8, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11],
        [ 2, 10,  4, 10,  8, 10,  1,  6, 10,  3,  2, 10,  6,  4, 12, 11, 11, 11,
         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11],
        [ 2, 10,  3, 10,  4, 10,  5, 10,  6, 10,  7, 12, 11, 11, 11, 11, 11, 11,
         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11],
        [ 0, 10,  3, 10,  6, 10,  9, 10,  1,  2, 10,  1,  5, 12, 11, 11, 11, 11,
         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11],
        [ 3, 10,  4, 10,  5, 10,  6, 10,  7, 10,  8, 12, 11, 11, 11, 11, 11, 11,
         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11],
        [ 2, 10,  4, 10,  8, 10,  1,  6, 10,  3,  2, 10,  6,  4, 12, 11, 11, 11,
         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11],
        [ 0, 10,  5, 10,  1,  0, 10,  1,  5, 10,  2,  0, 10,  2,  5, 12, 11, 11,
         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11],
        [ 1,  2, 10,  4,  8


