# Entrenamiento

## Carga de los datos

Primero de todo cargamos los datos a los que le hicimos la limpieza

In [1]:
from datasets import load_from_disk

path = "data/opus100_croped"
opus100_croped = load_from_disk(path)

Volvemos a hacer una inspección rápida de los datos

In [4]:
print(f"dataset keys: {opus100_croped.keys()}")

dataset keys: dict_keys(['test', 'train', 'validation'])


In [5]:
print(f"train length: {len(opus100_croped['train'])},\nvalidation length: {len(opus100_croped['validation'])},\ntest length: {len(opus100_croped['test'])}")

train length: 983138,
validation length: 1963,
test length: 1955


In [7]:
print(f"train keys: {opus100_croped['train'][0].keys()},\nvalidation keys: {opus100_croped['validation'][0].keys()},\ntest keys: {opus100_croped['test'][0].keys()}")

train keys: dict_keys(['translation']),
validation keys: dict_keys(['translation']),
test keys: dict_keys(['translation'])


In [8]:
print(f"train translation keys: {opus100_croped['train'][0]['translation'].keys()},\nvalidation translation keys: {opus100_croped['validation'][0]['translation'].keys()},\ntest translation keys: {opus100_croped['test'][0]['translation'].keys()}")

train translation keys: dict_keys(['en', 'es']),
validation translation keys: dict_keys(['en', 'es']),
test translation keys: dict_keys(['en', 'es'])


In [10]:
import numpy.random as random
idx = random.randint(0, len(opus100_croped["train"]))
opus100_croped["train"][idx]["translation"]

{'en': 'The preparation of the seventh periodic report should draw upon the non-governmental organizations operating in the country.',
 'es': 'Para la preparación del séptimo informe periódico debería recurrirse a las ONG que operan en el país.'}

## Dataset y dataloader

Ahora que hemos visto cómo es el dataset que nos hemos descargado, vamos a crear un dataset y un dataloader de python

Primero creamos la clase `Opus100Dataset`

In [113]:
import torch
from torch.utils.data import Dataset

class Opus100Dataset(Dataset):
    def __init__(self, dataset, source_language, target_language, tokenizer, start_token, end_token, padding_token, max_length):
        self.dataset = dataset
        self.source_language = source_language
        self.target_language = target_language
        self.tokenizer = tokenizer
        self.start_token = start_token
        self.end_token = end_token
        self.padding_token = padding_token
        self.max_length = max_length
    
    def __len__(self):
        return len(self.dataset)
    
    def encode(self, text):
        encoded = self.tokenizer(text)
        encoded = self.start_token + encoded + self.end_token
        encoded = encoded + self.padding_token * (self.max_length - len(encoded)) # Padding
        return torch.tensor(encoded)
    
    def decode(self, tensor, decoder):
        end_token_position = (tensor == self.end_token[0]).nonzero(as_tuple=True)[0]
        encoded_sentence = tensor[1:end_token_position].tolist()
        return decoder(encoded_sentence)
    
    def __getitem__(self, idx):
        source = self.dataset[idx]["translation"][self.source_language]
        source = self.encode(source)

        target = self.dataset[idx]["translation"][self.target_language]
        target = self.encode(target)
        return source, target

Ahora creamos un objeto para `train`, `validation` y `test`

In [114]:
import tiktoken

encoder = tiktoken.get_encoding("cl100k_base")

start_token = chr(1)
start_token = encoder.encode(start_token)

end_token = chr(2)
end_token = encoder.encode(end_token)

padding_token = chr(3)
padding_token = encoder.encode(padding_token)

max_secuence_length = 104 #128

train_dataset = Opus100Dataset(opus100_croped["train"], "en", "es", encoder.encode, start_token, end_token, padding_token, max_secuence_length)
validation_dataset = Opus100Dataset(opus100_croped["validation"], "en", "es", encoder.encode, start_token, end_token, padding_token, max_secuence_length)
test_dataset = Opus100Dataset(opus100_croped["test"], "en", "es", encoder.encode, start_token, end_token, padding_token, max_secuence_length)

Vamos a ver si las longitudes coinciden

In [115]:
print(f"len opus100 train {len(opus100_croped['train'])}, len train_dataset {len(train_dataset)}")
print(f"len opus100 validation {len(opus100_croped['validation'])}, len validation_dataset {len(validation_dataset)}")
print(f"len opus100 test {len(opus100_croped['test'])}, len test_dataset {len(test_dataset)}")

len opus100 train 983138, len train_dataset 983138
len opus100 validation 1963, len validation_dataset 1963
len opus100 test 1955, len test_dataset 1955


Veamos ahora si una muestra de entrenamiento coincide

In [125]:
idx = 259178 #random.randint(0, len(train_dataset))

opus100_idx_en = opus100_croped["train"][idx]["translation"]["en"]
opus100_idx_es = opus100_croped["train"][idx]["translation"]["es"]
print(f"opus100_idx_en: {opus100_idx_en}, opus100_idx_es: {opus100_idx_es}")

sample_train_dataset_idx_en, sample_train_dataset_idx_es = train_dataset[idx]
sample_train_dataset_idx_en_sentence = train_dataset.decode(sample_train_dataset_idx_en, encoder.decode)
sample_train_dataset_idx_es_sentence = train_dataset.decode(sample_train_dataset_idx_es, encoder.decode)
print(f"sample train_dataset en: {sample_train_dataset_idx_en_sentence}, sample train_dataset es: {sample_train_dataset_idx_es_sentence}")

opus100_idx_en: So the more we see, the less we know., opus100_idx_es: Así que cuanto más vemos, menos sabemos.
sample train_dataset en: So the more we see, the less we know., sample train_dataset es: Así que cuanto más vemos, menos sabemos.


Vamos a ver si tiene la longitud que debería tener

In [126]:
sample_train_dataset_idx_en.shape

torch.Size([104])

Vamos ahora a ver muestras de validación y test

In [129]:
idx = 1724 #random.randint(0, len(validation_dataset))

opus100_idx_en = opus100_croped["validation"][idx]["translation"]["en"]
opus100_idx_es = opus100_croped["validation"][idx]["translation"]["es"]
print(f"opus100_idx_en: {opus100_idx_en}, opus100_idx_es: {opus100_idx_es}")

sample_validation_dataset_idx_en, sample_validation_dataset_idx_es = validation_dataset[idx]
sample_validation_dataset_idx_en_sentence = validation_dataset.decode(sample_validation_dataset_idx_en, encoder.decode)
sample_validation_dataset_idx_es_sentence = validation_dataset.decode(sample_validation_dataset_idx_es, encoder.decode)
print(f"sample validation_dataset en: {sample_validation_dataset_idx_en_sentence}, sample validation_dataset es: {sample_validation_dataset_idx_es_sentence}")

opus100_idx_en: I want you alive!, opus100_idx_es: ¡Te quiero vivo!
sample validation_dataset en: I want you alive!, sample validation_dataset es: ¡Te quiero vivo!


In [131]:
idx = 1044 #random.randint(0, len(test_dataset))

opus100_idx_en = opus100_croped["test"][idx]["translation"]["en"]
opus100_idx_es = opus100_croped["test"][idx]["translation"]["es"]
print(f"opus100_idx_en: {opus100_idx_en}, opus100_idx_es: {opus100_idx_es}")

sample_test_dataset_idx_en, sample_test_dataset_idx_es = test_dataset[idx]
sample_test_dataset_idx_en_sentence = test_dataset.decode(sample_test_dataset_idx_en, encoder.decode)
sample_test_dataset_idx_es_sentence = test_dataset.decode(sample_test_dataset_idx_es, encoder.decode)
print(f"sample test_dataset en: {sample_test_dataset_idx_en_sentence}, sample test_dataset es: {sample_test_dataset_idx_es_sentence}")

opus100_idx_en: Most likely, this is Michael Farmer., opus100_idx_es: Lo más probable es que sea Michael Farmer.
sample test_dataset en: Most likely, this is Michael Farmer., sample test_dataset es: Lo más probable es que sea Michael Farmer.


Vemos que el dataset está bien creado, ahora creamos el `Dataloader`

In [132]:
from torch.utils.data import DataLoader

BS = 64

train_dataloader = DataLoader(train_dataset, batch_size=BS, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=BS, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BS, shuffle=False)

Vamos a ver una muestra

In [134]:
batch_en, batch_es = next(iter(train_dataloader))
print(f"batch_en shape: {batch_en.shape}, batch_es shape: {batch_es.shape}")

batch_en shape: torch.Size([64, 104]), batch_es shape: torch.Size([64, 104])


## Modelo

Volvemos a escribir todo el código del transformer y creamos un objeto de este

### Clases de bajo nivel

In [136]:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F

class Dropout(torch.nn.Module):
    def __init__(self, p=0.1):
        super().__init__()
        self.p = p

    def forward(self, x):
        if self.training:
            return torch.nn.functional.dropout(x, p=self.p)
        else:
            return x
        
class Embedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, x):
        return self.embedding(x)

class PositionalEncoding(nn.Module):
    def __init__(self, max_sequence_len, embedding_model_dim):
        super().__init__()
        self.embedding_dim = embedding_model_dim

        positional_encoding = torch.zeros(max_sequence_len, self.embedding_dim)
        for pos in range(max_sequence_len):
            for i in range(0, self.embedding_dim, 2):
                positional_encoding[pos, i]     = math.sin(pos / (10000 ** ((2 *     i) / self.embedding_dim)))
                positional_encoding[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i+1)) / self.embedding_dim)))
        positional_encoding = positional_encoding.unsqueeze(0)
        self.register_buffer('positional_encoding', positional_encoding)

    def forward(self, x):
        x = x * math.sqrt(self.embedding_dim)
        sequence_len = x.size(1)
        x = x + self.positional_encoding[:,:sequence_len]
        return x

class ScaledDotProductAttention(nn.Module):
    def __init__(self, dim_embedding):
        super().__init__()
        self.dim_embedding = dim_embedding
    
    def forward(self, key, query, value, mask=None):
        key_trasposed = key.transpose(-1,-2)
        product = torch.matmul(query, key_trasposed)
        scale = product / math.sqrt(self.dim_embedding)
        if mask is not None:
            scale = scale.masked_fill(mask == 0, float('-inf'))
        attention_matrix = torch.nn.functional.softmax(scale, dim=-1)
        output = torch.matmul(attention_matrix, value)
        
        return output

class MultiHeadAttention(nn.Module):
    def __init__(self, heads, dim_embedding):
        super().__init__()
        
        self.dim_embedding = dim_embedding
        self.dim_proyection = dim_embedding // heads
        self.heads = heads
        
        self.proyection_Q = nn.Linear(dim_embedding, dim_embedding)
        self.proyection_K = nn.Linear(dim_embedding, dim_embedding)
        self.proyection_V = nn.Linear(dim_embedding, dim_embedding)
        self.attention = nn.Linear(dim_embedding, dim_embedding)

        self.scaled_dot_product_attention = ScaledDotProductAttention(self.dim_proyection)
    
    def forward(self, Q, K, V):
        batch_size = Q.size(0)
        
        proyection_Q = self.proyection_Q(Q).view(batch_size, -1, self.heads, self.dim_proyection)
        proyection_K = self.proyection_K(K).view(batch_size, -1, self.heads, self.dim_proyection)
        proyection_V = self.proyection_V(V).view(batch_size, -1, self.heads, self.dim_proyection)
        
        proyection_Q = proyection_Q.transpose(1,2)
        proyection_K = proyection_K.transpose(1,2)
        proyection_V = proyection_V.transpose(1,2)

        scaled_dot_product_attention = self.scaled_dot_product_attention(proyection_Q, proyection_K, proyection_V)
        
        concat = scaled_dot_product_attention.transpose(1,2).contiguous().view(batch_size, -1, self.dim_embedding)
        
        output = self.attention(concat)
    
        return output

class AddAndNorm(nn.Module):
    def __init__(self, dim_embedding):
        super().__init__()
        self.normalization = nn.LayerNorm(dim_embedding)

    def forward(self, x, sublayer):
        return self.normalization(torch.add(x, sublayer))

class FeedForward(nn.Module):
    def __init__(self, dim_embedding, increment=4):
        super().__init__()
        self.feed_forward = nn.Sequential(
            nn.Linear(dim_embedding, dim_embedding*increment),
            nn.ReLU(),
            nn.Linear(dim_embedding*increment, dim_embedding)
        )
    
    def forward(self, x):
        x = self.feed_forward(x)
        return x

class Linear(nn.Module):
    def __init__(self, dim_embedding, vocab_size):
        super().__init__()
        self.linear = nn.Linear(dim_embedding, vocab_size)
        
    def forward(self, x):
        x = self.linear(x)
        return x

class Softmax(nn.Module):
    def __init__(self):
        super().__init__()
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        x = self.softmax(x)
        return x

### Clases de medio nivel

In [137]:
class EncoderLayer(nn.Module):
    def __init__(self, heads, dim_embedding, prob_dropout=0.1):
        super().__init__()
        self.multi_head_attention = MultiHeadAttention(heads, dim_embedding)
        self.dropout_1 = Dropout(prob_dropout)
        self.add_and_norm_1 = AddAndNorm(dim_embedding)
        self.feed_forward = FeedForward(dim_embedding)
        self.dropout_2 = Dropout(prob_dropout)
        self.add_and_norm_2 = AddAndNorm(dim_embedding)
    
    def forward(self, x):
        multi_head_attention = self.multi_head_attention(x, x, x)
        dropout1 = self.dropout_1(multi_head_attention)
        add_and_norm_1 = self.add_and_norm_1(x, dropout1)
        feed_forward = self.feed_forward(add_and_norm_1)
        dropout2 = self.dropout_2(feed_forward)
        add_and_norm_2 = self.add_and_norm_2(add_and_norm_1, dropout2)
        return add_and_norm_2

class DecoderLayer(nn.Module):
    def __init__(self, heads, dim_embedding, prob_dropout=0.1):
        super().__init__()
        self.masked_multi_head_attention = MultiHeadAttention(heads, dim_embedding)
        self.dropout_1 = Dropout(prob_dropout)
        self.add_and_norm_1 = AddAndNorm(dim_embedding)
        self.encoder_decoder_multi_head_attention = MultiHeadAttention(heads, dim_embedding)
        self.dropout_2 = Dropout(prob_dropout)
        self.add_and_norm_2 = AddAndNorm(dim_embedding)
        self.feed_forward = FeedForward(dim_embedding)
        self.dropout_3 = Dropout(prob_dropout)
        self.add_and_norm_3 = AddAndNorm(dim_embedding)
    
    def forward(self, x, encoder_output, mask=None):
        masked_multi_head_attention = self.masked_multi_head_attention(x, x, x, mask)
        dropout1 = self.dropout_1(masked_multi_head_attention)
        add_and_norm_1 = self.add_and_norm_1(dropout1, x)
        encoder_decoder_multi_head_attention = self.encoder_decoder_multi_head_attention(add_and_norm_1, encoder_output, encoder_output)
        dropout2 = self.dropout_2(encoder_decoder_multi_head_attention)
        add_and_norm_2 = self.add_and_norm_2(dropout2, add_and_norm_1)
        feed_forward = self.feed_forward(add_and_norm_2)
        dropout3 = self.dropout_3(feed_forward)
        add_and_norm_3 = self.add_and_norm_3(dropout3, add_and_norm_2)
        return add_and_norm_3

class Encoder(nn.Module):
    def __init__(self, heads, dim_embedding, Nx, prob_dropout=0.1):
        super().__init__()
        self.encoder_layers = nn.ModuleList([EncoderLayer(heads, dim_embedding, prob_dropout) for _ in range(Nx)])
    
    def forward(self, x):
        for encoder_layer in self.encoder_layers:
            x = encoder_layer(x)
        return x

class Decoder(nn.Module):
    def __init__(self, heads, dim_embedding, Nx, prob_dropout=0.1):
        super().__init__()
        self.layers = nn.ModuleList([DecoderLayer(heads, dim_embedding, prob_dropout) for _ in range(Nx)])
    
    def forward(self, x, encoder_output, mask=None):
        for decoder_layer in self.layers:
            x = decoder_layer(x, encoder_output, mask)
        return x

class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, dim_embedding, max_sequence_len, heads, Nx, prob_dropout=0.1):
        super().__init__()
        self.input_embedding = Embedding(vocab_size, dim_embedding)
        self.positional_encoding = PositionalEncoding(max_sequence_len, dim_embedding)
        self.encoder = Encoder(heads, dim_embedding, Nx, prob_dropout)
    
    def forward(self, x):
        input_embedding = self.input_embedding(x)
        positional_encoding = self.positional_encoding(input_embedding)
        encoder = self.encoder(positional_encoding)
        return encoder

class TransformerDecoder(nn.Module):
    def __init__(self, heads, dim_embedding, Nx, vocab_size, max_sequence_len, prob_dropout=0.1):
        super().__init__()
        self.embedding = Embedding(vocab_size, dim_embedding)
        self.positional_encoding = PositionalEncoding(max_sequence_len, dim_embedding)
        self.decoder = Decoder(heads, dim_embedding, Nx, prob_dropout)
        self.linear = Linear(dim_embedding, vocab_size)
        self.softmax = Softmax()
    
    def forward(self, x, encoder_output, mask=None):
        x = self.embedding(x)
        x = self.positional_encoding(x)
        x = self.decoder(x, encoder_output, mask)
        x = self.linear(x)
        x = self.softmax(x)
        return x

### Clase de alto nivel

In [138]:
class Transformer(nn.Module):
    def __init__(self, vocab_size, dim_embedding, max_sequence_len, heads, Nx, prob_dropout=0.1):
        super().__init__()
        self.encoder = TransformerEncoder(vocab_size, dim_embedding, max_sequence_len, heads, Nx, prob_dropout)
        self.decoder = TransformerDecoder(heads, dim_embedding, Nx, vocab_size, max_sequence_len, prob_dropout)
    
    def forward(self, source, target, mask=None):
        encoder_output = self.encoder(source)
        decoder_output = self.decoder(target, encoder_output, mask)
        return decoder_output

### Transformer

In [139]:
def create_mask(sequence_len):
    """
    Args:
        sequence_len: length of sequence
        
    Returns:
        mask matrix
    """
    mask = torch.tril(torch.ones((sequence_len, sequence_len)))
    return mask

In [143]:
vocab_size = encoder.n_vocab
dim_embedding = 512
max_secuence_length = 104 #128
heads = 8
Nx = 6
prob_dropout = 0.1

transformer = Transformer(vocab_size=vocab_size,
                          dim_embedding=dim_embedding,
                          max_sequence_len=max_secuence_length,
                          heads=heads,
                          Nx=Nx,
                          prob_dropout=prob_dropout)

mask = create_mask(max_secuence_length)

Vamos a probar el modelo con una muestra del dataset

In [144]:
idx = 259178 #random.randint(0, len(train_dataset))

sample_train_dataset_idx_en, sample_train_dataset_idx_es = train_dataset[idx]
sample_train_dataset_idx_en_sentence = train_dataset.decode(sample_train_dataset_idx_en, encoder.decode)
sample_train_dataset_idx_es_sentence = train_dataset.decode(sample_train_dataset_idx_es, encoder.decode)
print(f"sample sentence train_dataset en: {sample_train_dataset_idx_en_sentence}")
print(f"sample sentence train_dataset es: {sample_train_dataset_idx_es_sentence}")
print("")
print(f"sample train_dataset en: {sample_train_dataset_idx_en}")
print(f"sample train_dataset es: {sample_train_dataset_idx_es}")

sample sentence train_dataset en: So the more we see, the less we know.
sample sentence train_dataset es: Así que cuanto más vemos, menos sabemos.

sample train_dataset en: tensor([ 189, 4516,  279,  810,  584, 1518,   11,  279, 2753,  584, 1440,   13,
         190,  191,  191,  191,  191,  191,  191,  191,  191,  191,  191,  191,
         191,  191,  191,  191,  191,  191,  191,  191,  191,  191,  191,  191,
         191,  191,  191,  191,  191,  191,  191,  191,  191,  191,  191,  191,
         191,  191,  191,  191,  191,  191,  191,  191,  191,  191,  191,  191,
         191,  191,  191,  191,  191,  191,  191,  191,  191,  191,  191,  191,
         191,  191,  191,  191,  191,  191,  191,  191,  191,  191,  191,  191,
         191,  191,  191,  191,  191,  191,  191,  191,  191,  191,  191,  191,
         191,  191,  191,  191,  191,  191,  191,  191])
sample train_dataset es: tensor([  189,  2170,  2483,  1744, 87587, 11158,   348, 15295,    11, 32895,
        19972, 15295,    13

In [145]:
output = transformer(sample_train_dataset_idx_en.unsqueeze(0), sample_train_dataset_idx_es.unsqueeze(0), mask)
output.shape

TypeError: MultiHeadAttention.forward() takes 4 positional arguments but 5 were given