# Inferencia

Una vez hemos entrenado nuestro transformer podemos probar a ver qué tal lo hace

## Transformer

### Clases de bajo nivel

Vamos primero a implementar un transformer con todo el código que hemos usado antes, primero escribimos las funciones de bajo nivel

In [1]:
import torch
import torch.nn as nn
import torch.nn.init as init

class CustomLinear(nn.Module):
    def __init__(self, in_features, out_features):
        super(CustomLinear, self).__init__()
        self.linear = nn.Linear(in_features, out_features)
        init.kaiming_uniform_(self.linear.weight, nonlinearity='relu')
        if self.linear.bias is not None:
            init.zeros_(self.linear.bias)
    
    def forward(self, x):
        return self.linear(x)

class CustomEmbedding(nn.Module):
    def __init__(self, num_embeddings, embedding_dim):
        super(CustomEmbedding, self).__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        init.xavier_uniform_(self.embedding.weight)
    
    def forward(self, x):
        return self.embedding(x)

class Embedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        self.embedding = CustomEmbedding(vocab_size, embedding_dim)

    def forward(self, x):
        return self.embedding(x)

class PositionalEncoding(nn.Module):
    def __init__(self, max_sequence_len, embedding_model_dim):
        super().__init__()
        self.embedding_dim = embedding_model_dim
        positional_encoding = torch.zeros(max_sequence_len, self.embedding_dim)
        for pos in range(max_sequence_len):
            for i in range(0, self.embedding_dim, 2):
                positional_encoding[pos, i]     = torch.sin(torch.tensor(pos / (10000 ** ((2 * i) / self.embedding_dim))))
                positional_encoding[pos, i + 1] = torch.cos(torch.tensor(pos / (10000 ** ((2 * (i+1)) / self.embedding_dim))))
        positional_encoding = positional_encoding.unsqueeze(0)
        self.register_buffer('positional_encoding', positional_encoding)

    def forward(self, x):
        x = x * torch.sqrt(torch.tensor(self.embedding_dim))
        sequence_len = x.size(1)
        x = x + self.positional_encoding[:,:sequence_len]
        return x

class ScaledDotProductAttention(nn.Module):
    def __init__(self, dim_embedding):
        super().__init__()
        self.dim_embedding = dim_embedding
    
    def forward(self, query, key, value, mask=None):
        key_trasposed = key.transpose(-1,-2)
        product = torch.matmul(query, key_trasposed)
        scale = product / torch.sqrt(torch.tensor(self.dim_embedding))
        if mask is not None:
            scale = scale.masked_fill(mask == 0, float('-inf'))
        attention_matrix = torch.softmax(scale, dim=-1)
        output = torch.matmul(attention_matrix, value)
        return output

class MultiHeadAttention(nn.Module):
    def __init__(self, heads, dim_embedding):
        super().__init__()
        
        self.dim_embedding = dim_embedding
        self.dim_proyection = dim_embedding // heads
        self.heads = heads
        self.proyection_Q = CustomLinear(dim_embedding, dim_embedding)
        self.proyection_K = CustomLinear(dim_embedding, dim_embedding)
        self.proyection_V = CustomLinear(dim_embedding, dim_embedding)
        self.scaled_dot_product_attention = ScaledDotProductAttention(self.dim_proyection)
        self.attention = CustomLinear(dim_embedding, dim_embedding)
    
    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        proyection_Q = self.proyection_Q(Q).view(batch_size, -1, self.heads, self.dim_proyection)
        proyection_K = self.proyection_K(K).view(batch_size, -1, self.heads, self.dim_proyection)
        proyection_V = self.proyection_V(V).view(batch_size, -1, self.heads, self.dim_proyection)
        proyection_Q = proyection_Q.transpose(1,2)
        proyection_K = proyection_K.transpose(1,2)
        proyection_V = proyection_V.transpose(1,2)
        scaled_dot_product_attention = self.scaled_dot_product_attention(proyection_Q, proyection_K, proyection_V, mask=mask)
        concat = scaled_dot_product_attention.transpose(1,2).contiguous().view(batch_size, -1, self.dim_embedding)
        output = self.attention(concat)
        return output

class AddAndNorm(nn.Module):
    def __init__(self, dim_embedding):
        super().__init__()
        self.normalization = nn.LayerNorm(dim_embedding)

    def forward(self, x, sublayer):
        return self.normalization(torch.add(x, sublayer))

class FeedForward(nn.Module):
    def __init__(self, dim_embedding, increment=4):
        super().__init__()
        self.feed_forward = nn.Sequential(
            CustomLinear(dim_embedding, dim_embedding*increment),
            nn.ReLU(),
            CustomLinear(dim_embedding*increment, dim_embedding)
        )
    
    def forward(self, x):
        x = self.feed_forward(x)
        return x

class Linear(nn.Module):
    def __init__(self, dim_embedding, vocab_size):
        super().__init__()
        self.linear = CustomLinear(dim_embedding, vocab_size)
        
    def forward(self, x):
        x = self.linear(x)
        return x

class Softmax(nn.Module):
    def __init__(self):
        super().__init__()
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        x = self.softmax(x)
        return x

class Dropout(torch.nn.Module):
    def __init__(self, p=0.1):
        super().__init__()
        self.p = p

    def forward(self, x):
        if self.training:
            return torch.nn.functional.dropout(x, p=self.p)
        else:
            return x


### Clases de medio nivel

Ahora las clases de medio nivel

In [2]:
class EncoderLayer(nn.Module):
    def __init__(self, heads, dim_embedding, prob_dropout=0.1):
        super().__init__()
        self.multi_head_attention = MultiHeadAttention(heads, dim_embedding)
        self.dropout_1 = Dropout(prob_dropout)
        self.add_and_norm_1 = AddAndNorm(dim_embedding)
        self.feed_forward = FeedForward(dim_embedding)
        self.dropout_2 = Dropout(prob_dropout)
        self.add_and_norm_2 = AddAndNorm(dim_embedding)
    
    def forward(self, x):
        multi_head_attention = self.multi_head_attention(x, x, x)
        dropout1 = self.dropout_1(multi_head_attention)
        add_and_norm_1 = self.add_and_norm_1(x, dropout1)
        feed_forward = self.feed_forward(add_and_norm_1)
        dropout2 = self.dropout_2(feed_forward)
        add_and_norm_2 = self.add_and_norm_2(add_and_norm_1, dropout2)
        return add_and_norm_2

class Encoder(nn.Module):
    def __init__(self, heads, dim_embedding, Nx, prob_dropout=0.1):
        super().__init__()
        self.encoder_layers = nn.ModuleList([EncoderLayer(heads, dim_embedding, prob_dropout) for _ in range(Nx)])
    
    def forward(self, x):
        for encoder_layer in self.encoder_layers:
            x = encoder_layer(x)
        return x

class DecoderLayer(nn.Module):
    def __init__(self, heads, dim_embedding, prob_dropout=0.1):
        super().__init__()
        self.masked_multi_head_attention = MultiHeadAttention(heads, dim_embedding)
        self.dropout_1 = Dropout(prob_dropout)
        self.add_and_norm_1 = AddAndNorm(dim_embedding)
        self.encoder_decoder_multi_head_attention = MultiHeadAttention(heads, dim_embedding)
        self.dropout_2 = Dropout(prob_dropout)
        self.add_and_norm_2 = AddAndNorm(dim_embedding)
        self.feed_forward = FeedForward(dim_embedding)
        self.dropout_3 = Dropout(prob_dropout)
        self.add_and_norm_3 = AddAndNorm(dim_embedding)
    
    def forward(self, x, encoder_output, mask=None):
        Q = x
        K = x
        V = x
        masked_multi_head_attention = self.masked_multi_head_attention(Q, K, V, mask=mask)
        dropout1 = self.dropout_1(masked_multi_head_attention)
        add_and_norm_1 = self.add_and_norm_1(dropout1, x)

        Q = add_and_norm_1
        K = encoder_output
        V = encoder_output
        encoder_decoder_multi_head_attention = self.encoder_decoder_multi_head_attention(Q, K, V)
        dropout2 = self.dropout_2(encoder_decoder_multi_head_attention)
        add_and_norm_2 = self.add_and_norm_2(dropout2, add_and_norm_1)

        feed_forward = self.feed_forward(add_and_norm_2)
        dropout3 = self.dropout_3(feed_forward)
        add_and_norm_3 = self.add_and_norm_3(dropout3, add_and_norm_2)

        return add_and_norm_3

class Decoder(nn.Module):
    def __init__(self, heads, dim_embedding, Nx, prob_dropout=0.1):
        super().__init__()
        self.layers = nn.ModuleList([DecoderLayer(heads, dim_embedding, prob_dropout) for _ in range(Nx)])
    
    def forward(self, x, encoder_output, mask=None):
        for decoder_layer in self.layers:
            x = decoder_layer(x, encoder_output, mask)
        return x

class Linear_and_softmax(nn.Module):
    def __init__(self, dim_embedding, vocab_size):
        super().__init__()
        self.linear = CustomLinear(dim_embedding, vocab_size)
        # self.softmax = Softmax()
    
    def forward(self, x):
        x = self.linear(x)
        # x = self.softmax(x)
        return x


### Clases de alto nivel

Y por último la clase transformer

In [3]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, src_max_seq_len, tgt_max_seq_len, dim_embedding, Nx, heads, prob_dropout=0.1):
        super().__init__()
        self.encoder = Encoder(heads, dim_embedding, Nx, prob_dropout)
        self.decoder = Decoder(heads, dim_embedding, Nx, prob_dropout)
        self.sourceEmbedding = Embedding(src_vocab_size, dim_embedding)
        self.targetEmbedding = Embedding(tgt_vocab_size, dim_embedding)
        self.sourcePositional_encoding = PositionalEncoding(src_max_seq_len, dim_embedding)
        self.targetPositional_encoding = PositionalEncoding(tgt_max_seq_len, dim_embedding)
        self.linear = Linear_and_softmax(dim_embedding, tgt_vocab_size)
    
    def encode(self, source):
        embedding = self.sourceEmbedding(source)
        positional_encoding = self.sourcePositional_encoding(embedding)
        encoder_output = self.encoder(positional_encoding)
        return encoder_output
    
    def decode(self, encoder_output, target, target_mask):
        embedding = self.targetEmbedding(target)
        positional_encoding = self.targetPositional_encoding(embedding)
        decoder_output = self.decoder(positional_encoding, encoder_output, target_mask)
        return decoder_output
    
    def projection(self, decoder_output):
        linear_output = self.linear(decoder_output)
        # softmax_output = self.softmax(linear_output)
        return linear_output
    
    def forward(self, source, target, target_mask):
        # Encode
        embedding_encoder = self.sourceEmbedding(source)
        positional_encoding_encoder = self.sourcePositional_encoding(embedding_encoder)
        encoder_output = self.encoder(positional_encoding_encoder)

        # Decode
        embedding_decoder = self.targetEmbedding(target)
        positional_encoding_decoder = self.targetPositional_encoding(embedding_decoder)
        decoder_output = self.decoder(positional_encoding_decoder, encoder_output, target_mask)

        # Projection
        proj_output = self.linear(decoder_output)
        return proj_output


### Máscara

Creamos la máscara

In [4]:
def create_mask(sequence_len):
    mask = torch.tril(torch.ones((1, sequence_len, sequence_len)))
    return mask

max_secuence_length = 10 + 2
mask = create_mask(max_secuence_length)

### Device

Definimos el device

In [5]:
if torch.cuda.is_available():
    device = torch.device(f"cuda")
    print("Using GPU")
else:
    device = torch.device("cpu")
    print("Using CPU")

Using GPU


### Tokenizadores

Cargamos los tokenizadores

In [6]:
from tokenizers import Tokenizer

tokenizer_source_path = "tokenizers/tokenizer_en.json"
tokenizer_target_path = "tokenizers/tokenizer_es.json"

tokenizer_source = Tokenizer.from_file(tokenizer_source_path)
tokenizer_target = Tokenizer.from_file(tokenizer_target_path)

### Transformer

Y ahora creamos un objeto del transformer

In [7]:
import tiktoken

source_vocab_size = tokenizer_source.get_vocab_size()
target_vocab_size = tokenizer_target.get_vocab_size()
max_sequence_len = 22
src_max_seq_len = max_sequence_len
tgt_max_seq_len = max_sequence_len
dim_embedding = 512
Nx = 6
heads = 8
prob_dropout = 0.1
print(f"source vocab size: {source_vocab_size}, target vocab size: {target_vocab_size}, source max sequence len: {src_max_seq_len}, target max sequence len: {tgt_max_seq_len}, dim_embedding: {dim_embedding}, heads: {heads}, Nx: {Nx}, prob_dropout: {prob_dropout}")

transformer = Transformer(
    src_vocab_size = source_vocab_size,
    tgt_vocab_size = target_vocab_size,
    src_max_seq_len = src_max_seq_len,
    tgt_max_seq_len = tgt_max_seq_len,
    dim_embedding = dim_embedding,
    Nx = Nx,
    heads = heads,
    prob_dropout = prob_dropout,
)

transformer.to(device)
print(f"Modelo de {(sum(p.numel() for p in transformer.parameters())/1e6):.2f} millones de parámetros")


source vocab size: 30000, target vocab size: 30000, source max sequence len: 22, target max sequence len: 22, dim_embedding: 512, heads: 8, Nx: 6, prob_dropout: 0.1


Modelo de 90.25 millones de parámetros


## Funciones de encoder y decoder

Primero definimos los tokens especiales

In [8]:
unknow_token = '[UNK]'
unknow_token = tokenizer_source.token_to_id(unknow_token)

padding_token = '[PAD]'
padding_token = tokenizer_source.token_to_id(padding_token)

start_token = '[SOS]'
start_token = tokenizer_source.token_to_id(start_token)

end_token = '[EOS]'
end_token = tokenizer_source.token_to_id(end_token)
unknow_token, start_token, end_token

print(f"unknow_token: {unknow_token}, padding_token: {padding_token}, start_token: {start_token}, end_token: {end_token}")


unknow_token: 0, padding_token: 1, start_token: 2, end_token: 3


Ahora definimos las funciones para codificar y decodificar sentencias

In [9]:
def encode_sentence(sentence, tokenizer, start_token, end_token, pad_token, max_length, device):
    # start with SOS
    encode_sentence_list = [start_token]

    # encode sentence
    encode_sentence = tokenizer.encode(sentence)

    # Add to list
    encode_sentence_list.extend(encode_sentence.ids)

    # end with EOS
    encode_sentence_list = encode_sentence_list + [end_token]

    # if the sentence is less than max_length, we add padding tokens
    if len(encode_sentence_list) < max_length:
        encode_sentence_list = encode_sentence_list + [pad_token] * (max_length - len(encode_sentence_list))
    
    # if the sentence is greater than max_length, we truncate
    else:
        encode_sentence_list = encode_sentence_list[:max_length]
    
    # convert to tensor
    encode_sentence_tensor = torch.tensor([encode_sentence_list]).to(device)

    return encode_sentence, encode_sentence_list, encode_sentence_tensor

def decode_sentence(tokens, tokenizer, start_token, end_token):
    # Decode raw sentence
    decode_raw_sentence = tokenizer.decode(tokens)

    # Remove padding tokens
    tokens = [token for token in tokens if token != padding_token]

    # Remove SOS and EOS tokens
    tokens = [token for token in tokens if token != start_token and token != end_token]

    # Decode
    decode_sentence = tokenizer.decode(tokens)

    return decode_raw_sentence, decode_sentence


Probamos las funciones con una sentencia en inglés

In [10]:
sentence_en = "I have learned a lot from this course"
encode_sentence_en, encode_sentence_list_en, encode_sentence_tensor_en = encode_sentence(
    sentence_en,
    tokenizer_source,
    start_token,
    end_token,
    padding_token,
    src_max_seq_len,
    device
)

print(f"Encode english sentence: {encode_sentence_en.ids}")
print(f"Encode english sentence with SOS, EOS and padding: {encode_sentence_list_en}")
print(f"English sencence shape: {encode_sentence_tensor_en.shape}")

decode_raw_sentence_en, decode_sentence_en = decode_sentence(
    encode_sentence_list_en,
    tokenizer_source,
    start_token,
    end_token
)
print(f"\nDecode raw english encoded sentence: {decode_raw_sentence_en}")
print(f"Decode english encoded sentence: {decode_sentence_en}")

Encode english sentence: [11, 32, 1999, 12, 372, 45, 29, 320]
Encode english sentence with SOS, EOS and padding: [2, 11, 32, 1999, 12, 372, 45, 29, 320, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
English sencence shape: torch.Size([1, 22])

Decode raw english encoded sentence: I have learned a lot from this course
Decode english encoded sentence: I have learned a lot from this course


Y ahora con una sentencia en español

In [11]:
sentence_es = "He aprendido un montón con este curso"
encode_sentence_es, encode_sentence_list_es, encode_sentence_tensor_es = encode_sentence(
    sentence_es,
    tokenizer_target,
    start_token,
    end_token,
    padding_token,
    tgt_max_seq_len,
    device
)

print(f"Encode spanish sentence: {encode_sentence_es.ids}")
print(f"Encode spanish sentence with SOS, EOS and padding: {encode_sentence_list_es}")
print(f"Spanish sencence shape: {encode_sentence_tensor_es.shape}")

decode_raw_sentence_es, decode_sentence_es = decode_sentence(
    encode_sentence_list_es,
    tokenizer_target,
    start_token,
    end_token
)
print(f"\nDecode raw spanish encoded sentence: {decode_raw_sentence_es}")
print(f"Decode spanish encoded sentence: {decode_sentence_es}")

Encode spanish sentence: [417, 4915, 19, 1847, 26, 60, 1199]
Encode spanish sentence with SOS, EOS and padding: [2, 417, 4915, 19, 1847, 26, 60, 1199, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Spanish sencence shape: torch.Size([1, 22])

Decode raw spanish encoded sentence: He aprendido un montón con este curso
Decode spanish encoded sentence: He aprendido un montón con este curso


## Función de generación de tokens

In [12]:
def greedy_generate(source, start_token, end_token, model, device, max_len):
    # Output of transformer encoder
    encoder_output = model.encode(source)

    # Input to transformer decoder
    decoder_input = torch.empty(1,1).fill_(start_token).type_as(source).to(device)
    
    # Looping until the 'max_len' is reached or the End of Sentence token is generated
    while True:
        if decoder_input.size(1) == max_len:
            break
            
        # Building a mask for the decoder
        decoder_mask = create_mask(decoder_input.size(1)).to(device)
        
        # Calculating the output of the decoder
        decoder_output = model.decode(encoder_output, decoder_input, decoder_mask)
        
        # Applying the projection layer to get the probabilities for the next token
        prob = model.projection(decoder_output[:, -1])
        
        # Selecting token with the highest probability
        _, next_word = torch.max(prob, dim=1)
        decoder_input = torch.cat([decoder_input, torch.empty(1,1). type_as(source).fill_(next_word.item()).to(device)], dim=1)
        
        # If the next token is an End of Sentence token, we finish the loop
        if next_word == end_token:
            break
            
    return decoder_input.squeeze(0) # Sequence of tokens generated by the decoder


## Inferencia con el modelo sin entrenar

Creamos la sentencia que queremos traducir y la tokenizamos

In [13]:
sentence_en = "I have learned a lot from this course"
encode_sentence_en, encode_sentence_list_en, encode_sentence_tensor_en = encode_sentence(
    sentence_en,
    tokenizer_source,
    start_token,
    end_token,
    padding_token,
    src_max_seq_len,
    device
)

print(f"English sentence: {sentence_en}")
print(f"Encode english sentence: {encode_sentence_list_es}")

English sentence: I have learned a lot from this course
Encode english sentence: [2, 417, 4915, 19, 1847, 26, 60, 1199, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Pasamos la secuencia por el modelo

In [14]:
encode_translated = greedy_generate(
    encode_sentence_tensor_en,
    start_token,
    end_token,
    transformer,
    device,
    max_secuence_length
).detach().cpu().numpy().tolist()


print(f"Encode translated sentence: {encode_translated}")

Encode translated sentence: [2, 12510, 22301, 12510, 26848, 12510, 24650, 22301, 12510, 24650, 22301, 26848]


Decodificamos la salida del modelo

In [15]:
decode_raw_sentence_es, decode_sentence_es = decode_sentence(
    encode_translated,
    tokenizer_target,
    start_token,
    end_token
)

print(f"Translated sentence: {decode_raw_sentence_es}")

Translated sentence: banca Criterios banca desplazada banca paces Criterios banca paces Criterios desplazada


Como vemos, con el modelo sin entrenar obtenemos algo que no tiene sentido

## Inferencia con modelo entrenado

Cargamos los pesos

In [16]:
weights = "model/transformer_137_0.pth"
transformer = torch.load(weights, map_location='cpu')

if isinstance(transformer, nn.DataParallel):
    print("DataParallel")
    transformer = transformer.module

transformer.to(device)
print("")




Creamos la sentencia que queremos traducir y la tokenizamos

In [17]:
sentence_en = "I have learned a lot from this course"
encode_sentence_en, encode_sentence_list_en, encode_sentence_tensor_en = encode_sentence(
    sentence_en,
    tokenizer_source,
    start_token,
    end_token,
    padding_token,
    src_max_seq_len,
    device
)

print(f"English sentence: {sentence_en}")
print(f"Encode english sentence: {encode_sentence_list_es}")

English sentence: I have learned a lot from this course
Encode english sentence: [2, 417, 4915, 19, 1847, 26, 60, 1199, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


Pasamos la secuencia por el modelo

In [18]:
encode_translated = greedy_generate(
    encode_sentence_tensor_en,
    start_token,
    end_token,
    transformer,
    device,
    tgt_max_seq_len
).detach().cpu().numpy().tolist()


print(f"Encode translated sentence: {encode_translated}")

Encode translated sentence: [2, 15801, 142, 126, 316, 4, 3]


Decodificamos la salida del modelo

In [19]:
decode_raw_sentence_es, decode_sentence_es = decode_sentence(
    encode_translated,
    tokenizer_target,
    start_token,
    end_token
)

print(f"Translated sentence: {decode_raw_sentence_es}")

Translated sentence: Aprendí mucho desde entonces .


Ahora tiene mucho más sentido

## Función de traducción

Creamos una función a la que le metemos una sentencia y nos la devuelve traducida

In [20]:
def translator(
        sentence_source,
        tokenizer_source,
        tokenizer_target,
        start_token,
        end_token,
        padding_token,
        src_max_seq_len,
        tgt_max_seq_len,
        model,
        device,
):
    _, _, encode_sentence_tensor_source = encode_sentence(
        sentence_source,
        tokenizer_source,
        start_token,
        end_token,
        padding_token,
        src_max_seq_len,
        device
    )

    encode_translated = greedy_generate(
        encode_sentence_tensor_source,
        start_token,
        end_token,
        model,
        device,
        tgt_max_seq_len
    ).detach().cpu().numpy().tolist()

    decode_raw_sentence_es, _ = decode_sentence(
        encode_translated,
        tokenizer_target,
        start_token,
        end_token
    )

    return decode_raw_sentence_es


Cojo una secuencia cualquiera, cojo un comentario de la función `greedy_decode` del notebook de entrenamiento

In [21]:
sentence = "Building a mask for the decoder input"
translator(sentence,
           tokenizer_source,
           tokenizer_target,
           start_token,
           end_token,
           padding_token,
           src_max_seq_len,
           tgt_max_seq_len,
           transformer,
           device,
)

'de máscara para construir una máscara'

Aunque no lo ha hecho perfecto, traduce algo con sentido

Aquí hay dos cosas que decir

La primera es que no hemos entrenado el mejor traductor del mundo, hemos cogido un dataset pequeño, del cual nos hemos quedado con una pequeña parte de ese dataset para poder entrenarlo en una GPU, incluso habrá gente que no lo pueda entrenar en su propia GPU. Por lo que sin buenos datos no se obtienen buenos modelos. Pero como he dicho en todo el momento en este curso, el objetivo no ha sido entrenar los mejores modelos, sino que aprendáis los fundamentos del deep learning

Por otro lado, estamos generando la traducción con una generación de tokens llamada `greedy search` la cual no es la mejor, ya que produce salidas muy repetitivas. Veamos un ejemplo

In [22]:
sentence = "Calculating the output of the decoder"
translator(sentence,
           tokenizer_source,
           tokenizer_target,
           start_token,
           end_token,
           padding_token,
           src_max_seq_len,
           tgt_max_seq_len,
           transformer,
           device,
)

'de la salida de la salida de la salida'

Como podemos ver aquí la traducción es muy repetitiva. Cuando usamos LLMs como chatGPT, Llama, etc. lo que suelen hacer para generar tokens son técnicas como temperature, top-k sampling, top-p sampling, etc. que son técnicas que permiten generar tokens de manera más diversa. Pero eso ya se escapa del contenido de este curso, así que te dejo unos enlaces a esas técnicas por si quieres profundizar en ellas

 * [Temperature](https://maximofn.com/hugging-face-transformers/#Sampling-temperature)
 * [Top-k sampling](https://maximofn.com/hugging-face-transformers/#Sampling-top-k)
 * [Top-p sampling](https://maximofn.com/hugging-face-transformers/#Sampling-top-p-(nucleus-sampling))