# Entrenamiento internet

In [1]:
import os

# Dataset
SOURCE_LANGUAGE = "en"
TARGET_LANGUAGE = "it"
SUBSET = False
PERCENT_SUBSET = 0.1

# Train
EPOCH0 = 0
STEP0 = 0
LR = 1e-5
EPOCHS = 100000
GPUS = 1
GPU_NUMBER = 0
if GPUS > 1:
    BS = 120
else:
    if SUBSET:
        BS = 128
    else:
        BS = 60
print(f"BS: {BS}")
LR_SCHEDULER = False

# Model
MODEL_PATH = f"model"
if os.path.exists(MODEL_PATH):
    files = os.listdir(MODEL_PATH)
    for file in files:
        if "transformer" in file:
            name = file.split(".")[0]
            STEP0 = int(name.split("_")[-1])
            EPOCH0 = int(name.split("_")[-2])
DIM_EMBEDDING = 512
NUM_HEADS = 8
NUM_LAYERS = 6
DROPOUT = 0.1
LABEL_SMOOTHING = 0.1

# Tokenizers
TOKENIZERS_PATH = f"tokenizers"
if not os.path.exists(TOKENIZERS_PATH):
    os.makedirs(TOKENIZERS_PATH)
UNKNOWN_TOKEN = "[UNK]"
PADDING_TOKEN = "[PAD]"
START_OF_SEQUENCE = "[SOS]"
END_OF_SEQUENCE = "[EOS]"

BS: 60


## Device ✔

In [2]:
import torch

if torch.cuda.device_count() > 1 and GPUS > 1:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using {torch.cuda.device_count()} GPUs")
else:
    if torch.cuda.is_available():
        device = torch.device(f"cuda:{GPU_NUMBER}")
        print(f"Using GPU {GPU_NUMBER}")
    else:
        device = torch.device("cpu")
        print("Using CPU")

Using GPU 0


## Carga de los datos ✔

In [3]:
from datasets import load_dataset

if SUBSET:
    dataset_raw = load_dataset('opus_books', f'{SOURCE_LANGUAGE}-{TARGET_LANGUAGE}', split='train')
    len_dataset = len(dataset_raw)
    len_subset = int(len_dataset * PERCENT_SUBSET)
    dataset_raw = load_dataset('opus_books', f'{SOURCE_LANGUAGE}-{TARGET_LANGUAGE}', split=f'train[:{len_subset}]')
else:
    dataset_raw = load_dataset('opus_books', f'{SOURCE_LANGUAGE}-{TARGET_LANGUAGE}', split='train')

len(dataset_raw)

Found cached dataset opus_books (/home/wallabot/.cache/huggingface/datasets/opus_books/en-it/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf)


32332

## Entrenamiento de los tokenizers ✔

In [4]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

In [5]:
def get_all_sentences(dataset):
    all_sentences = []
    for i in range(len(dataset)):
        all_sentences.append(dataset[i]['translation'][SOURCE_LANGUAGE])
        all_sentences.append(dataset[i]['translation'][TARGET_LANGUAGE])
    return all_sentences

In [6]:
tokenizer_source_path = f"{TOKENIZERS_PATH}/tokenizer_{SOURCE_LANGUAGE}.json"

if not os.path.exists(tokenizer_source_path) or STEP0 == 0 or EPOCH0 == 0:
    print(f"Training source tokenizer")
    tokenizer_source = Tokenizer(WordLevel(unk_token=UNKNOWN_TOKEN))
    tokenizer_source.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(special_tokens=[UNKNOWN_TOKEN, PADDING_TOKEN, START_OF_SEQUENCE, END_OF_SEQUENCE])
    all_sentences = get_all_sentences(dataset_raw)
    tokenizer_source.train_from_iterator(all_sentences, trainer)
    tokenizer_source.save(tokenizer_source_path)
else:
    tokenizer_source = Tokenizer.from_file(tokenizer_source_path)

Training source tokenizer


In [7]:
tokenizer_target_path = f"{TOKENIZERS_PATH}/tokenizer_{TARGET_LANGUAGE}.json"

if not os.path.exists(tokenizer_target_path) or STEP0 == 0 or EPOCH0 == 0:
    print(f"Training target tokenizer")
    tokenizer_target = Tokenizer(WordLevel(unk_token=UNKNOWN_TOKEN))
    tokenizer_target.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(special_tokens=[UNKNOWN_TOKEN, PADDING_TOKEN, START_OF_SEQUENCE, END_OF_SEQUENCE])
    all_sentences = get_all_sentences(dataset_raw)
    tokenizer_target.train_from_iterator(all_sentences, trainer)
    tokenizer_target.save(tokenizer_target_path)
else:
    tokenizer_target = Tokenizer.from_file(tokenizer_target_path)

Training target tokenizer


## Obtención de la lóngitud máxima de las secuencias ✔

In [8]:
max_source_sequence_length = 0
max_target_sequence_length = 0

for i in range(len(dataset_raw)):
    source_sequence_length = len(tokenizer_source.encode(dataset_raw[i]['translation'][SOURCE_LANGUAGE]).ids)
    target_sequence_length = len(tokenizer_target.encode(dataset_raw[i]['translation'][TARGET_LANGUAGE]).ids)
    if source_sequence_length > max_source_sequence_length:
        max_source_sequence_length = source_sequence_length
    if target_sequence_length > max_target_sequence_length:
        max_target_sequence_length = target_sequence_length

max_sequence_len = max(max_source_sequence_length, max_target_sequence_length)
max_sequence_len += 2   # Add 2 for the start and end of sequence tokens

print(f"Max source sequence length: {max_source_sequence_length}")
print(f"Max target sequence length: {max_target_sequence_length}")
print(f"Max sequence length: {max_sequence_len}")

Max source sequence length: 309
Max target sequence length: 274
Max sequence length: 311


## Datasets ✔

### Mask ✔

In [9]:
def create_mask(size):
    mask = torch.triu(torch.ones(1, size, size), diagonal = 1).type(torch.int)
    return mask == 0

### Dataset class ✔

In [11]:
import torch
from torch.utils.data import Dataset

class BilingualDataset(Dataset):
    def __init__(self, dataset, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, max_seq_len) -> None:
        super().__init__()
        
        self.max_seq_len = max_seq_len
        self.dataset = dataset
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        
        # Defining special tokens by using the target language tokenizer
        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id(START_OF_SEQUENCE)], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id(END_OF_SEQUENCE)], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id(PADDING_TOKEN)], dtype=torch.int64)
        self.unk_token = torch.tensor([tokenizer_tgt.token_to_id(UNKNOWN_TOKEN)], dtype=torch.int64)
        
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, index):
        # Getting the source and target texts from the dataset
        src_target_pair = self.dataset[index]['translation']
        src_text = src_target_pair[self.src_lang]
        tgt_text = src_target_pair[self.tgt_lang]
        
        # Tokenizing source and target texts 
        encoder_input_tokens = self.tokenizer_src.encode(src_text).ids
        decoder_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids
        
        # Computing how many padding tokens need to be added to the tokenized texts 
        encoder_num_padding_tokens = self.max_seq_len - len(encoder_input_tokens) - 2 # Subtracting the two '[EOS]' and '[SOS]' special tokens
        decoder_num_padding_tokens = self.max_seq_len - len(decoder_input_tokens) - 1 # Subtracting the '[SOS]' special token
        
        # If the texts exceed the 'seq_len' allowed, it will raise an error. This means that one of the sentences in the pair is too long to be processed
        # given the current sequence length limit (this will be defined in the config dictionary below)
        if encoder_num_padding_tokens < 0 or decoder_num_padding_tokens < 0:
            raise ValueError('Sentence is too long')
         
        # Building the encoder input tensor by combining several elements
        encoder_input = torch.cat(
            [
                self.sos_token, # inserting the '[SOS]' token
                torch.tensor(encoder_input_tokens, dtype = torch.int64), # Inserting the tokenized source text
                self.eos_token, # Inserting the '[EOS]' token
                torch.tensor([self.pad_token] * encoder_num_padding_tokens, dtype = torch.int64) # Addind padding tokens
            ]
        )
        
        # Building the decoder input tensor by combining several elements
        decoder_input = torch.cat(
            [
                self.sos_token, # inserting the '[SOS]' token 
                torch.tensor(decoder_input_tokens, dtype = torch.int64), # Inserting the tokenized target text
                torch.tensor([self.pad_token] * decoder_num_padding_tokens, dtype = torch.int64) # Addind padding tokens
            ]
        )
        
        # Creating a label tensor, the expected output for training the model
        label = torch.cat(
            [
                torch.tensor(decoder_input_tokens, dtype = torch.int64), # Inserting the tokenized target text
                self.eos_token, # Inserting the '[EOS]' token 
                torch.tensor([self.pad_token] * decoder_num_padding_tokens, dtype = torch.int64) # Adding padding tokens
            ]
        )
        
        # Ensuring that the length of each tensor above is equal to the defined 'seq_len'
        assert encoder_input.size(0) == self.max_seq_len
        assert decoder_input.size(0) == self.max_seq_len
        assert label.size(0) == self.max_seq_len

        return {
            'encoder_input': encoder_input,
            'decoder_input': decoder_input, 
            'decoder_mask': (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int() & create_mask(decoder_input.size(0)),
            'label': label,
            'src_text': src_text,
            'tgt_text': tgt_text
        }


### Split dataset ✔

In [12]:
from torch.utils.data import random_split

percent_train = 0.99
len_train = int(len(dataset_raw) * percent_train)
len_val = len(dataset_raw) - len_train
train_dataset_raw, validation_dataset_raw = random_split(dataset_raw, [len_train, len_val])

print(f"Len train: {len(train_dataset_raw)}, len validation: {len(validation_dataset_raw)}")

Len train: 32008, len validation: 324


In [13]:
train_dataset = BilingualDataset(train_dataset_raw, tokenizer_source, tokenizer_target, SOURCE_LANGUAGE, TARGET_LANGUAGE, max_sequence_len)
validation_dataset = BilingualDataset(validation_dataset_raw, tokenizer_source, tokenizer_target, SOURCE_LANGUAGE, TARGET_LANGUAGE, max_sequence_len)

## Dataloaders ✔

In [14]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=BS, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=BS, shuffle=True)


## Modelo ✔

### Clases de bajo nivel

In [15]:
import torch
import torch.nn as nn
import torch.nn.init as init

class CustomLinear(nn.Module):
    def __init__(self, in_features, out_features):
        super(CustomLinear, self).__init__()
        self.linear = nn.Linear(in_features, out_features)
        init.kaiming_uniform_(self.linear.weight, nonlinearity='relu')
        if self.linear.bias is not None:
            init.zeros_(self.linear.bias)
    
    def forward(self, x):
        return self.linear(x)

class CustomEmbedding(nn.Module):
    def __init__(self, num_embeddings, embedding_dim):
        super(CustomEmbedding, self).__init__()
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        init.xavier_uniform_(self.embedding.weight)
    
    def forward(self, x):
        return self.embedding(x)

class Embedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim

        self.embedding = CustomEmbedding(vocab_size, embedding_dim)

    def forward(self, x):
        return self.embedding(x)

class PositionalEncoding(nn.Module):
    def __init__(self, max_sequence_len, embedding_model_dim):
        super().__init__()
        self.embedding_dim = embedding_model_dim
        positional_encoding = torch.zeros(max_sequence_len, self.embedding_dim)
        for pos in range(max_sequence_len):
            for i in range(0, self.embedding_dim, 2):
                positional_encoding[pos, i]     = torch.sin(torch.tensor(pos / (10000 ** ((2 * i) / self.embedding_dim))))
                positional_encoding[pos, i + 1] = torch.cos(torch.tensor(pos / (10000 ** ((2 * (i+1)) / self.embedding_dim))))
        positional_encoding = positional_encoding.unsqueeze(0)
        self.register_buffer('positional_encoding', positional_encoding)

    def forward(self, x):
        x = x * torch.sqrt(torch.tensor(self.embedding_dim))
        sequence_len = x.size(1)
        x = x + self.positional_encoding[:,:sequence_len]
        return x

class ScaledDotProductAttention(nn.Module):
    def __init__(self, dim_embedding):
        super().__init__()
        self.dim_embedding = dim_embedding
    
    def forward(self, query, key, value, mask=None):
        key_trasposed = key.transpose(-1,-2)
        product = torch.matmul(query, key_trasposed)
        scale = product / torch.sqrt(torch.tensor(self.dim_embedding))
        if mask is not None:
            scale = scale.masked_fill(mask == 0, float('-inf'))
        attention_matrix = torch.softmax(scale, dim=-1)
        output = torch.matmul(attention_matrix, value)
        return output

class MultiHeadAttention(nn.Module):
    def __init__(self, heads, dim_embedding):
        super().__init__()
        
        self.dim_embedding = dim_embedding
        self.dim_proyection = dim_embedding // heads
        self.heads = heads
        self.proyection_Q = CustomLinear(dim_embedding, dim_embedding)
        self.proyection_K = CustomLinear(dim_embedding, dim_embedding)
        self.proyection_V = CustomLinear(dim_embedding, dim_embedding)
        self.attention = CustomLinear(dim_embedding, dim_embedding)
        self.scaled_dot_product_attention = ScaledDotProductAttention(self.dim_proyection)
    
    def forward(self, Q, K, V, mask=None):
        batch_size = Q.size(0)
        proyection_Q = self.proyection_Q(Q).view(batch_size, -1, self.heads, self.dim_proyection)
        proyection_K = self.proyection_K(K).view(batch_size, -1, self.heads, self.dim_proyection)
        proyection_V = self.proyection_V(V).view(batch_size, -1, self.heads, self.dim_proyection)
        proyection_Q = proyection_Q.transpose(1,2)
        proyection_K = proyection_K.transpose(1,2)
        proyection_V = proyection_V.transpose(1,2)
        scaled_dot_product_attention = self.scaled_dot_product_attention(proyection_Q, proyection_K, proyection_V, mask=mask)
        concat = scaled_dot_product_attention.transpose(1,2).contiguous().view(batch_size, -1, self.dim_embedding)
        output = self.attention(concat)
        return output

class AddAndNorm(nn.Module):
    def __init__(self, dim_embedding):
        super().__init__()
        self.normalization = nn.LayerNorm(dim_embedding)

    def forward(self, x, sublayer):
        return self.normalization(torch.add(x, sublayer))

class FeedForward(nn.Module):
    def __init__(self, dim_embedding, increment=4):
        super().__init__()
        self.feed_forward = nn.Sequential(
            CustomLinear(dim_embedding, dim_embedding*increment),
            nn.ReLU(),
            CustomLinear(dim_embedding*increment, dim_embedding)
        )
    
    def forward(self, x):
        x = self.feed_forward(x)
        return x

class Linear(nn.Module):
    def __init__(self, dim_embedding, vocab_size):
        super().__init__()
        self.linear = CustomLinear(dim_embedding, vocab_size)
        
    def forward(self, x):
        x = self.linear(x)
        return x

class Softmax(nn.Module):
    def __init__(self):
        super().__init__()
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        x = self.softmax(x)
        return x

class Dropout(torch.nn.Module):
    def __init__(self, p=0.1):
        super().__init__()
        self.p = p

    def forward(self, x):
        if self.training:
            return torch.nn.functional.dropout(x, p=self.p)
        else:
            return x


### Clases de medio nivel

In [16]:
class EncoderLayer(nn.Module):
    def __init__(self, heads, dim_embedding, prob_dropout=0.1):
        super().__init__()
        self.multi_head_attention = MultiHeadAttention(heads, dim_embedding)
        self.dropout_1 = Dropout(prob_dropout)
        self.add_and_norm_1 = AddAndNorm(dim_embedding)
        self.feed_forward = FeedForward(dim_embedding)
        self.dropout_2 = Dropout(prob_dropout)
        self.add_and_norm_2 = AddAndNorm(dim_embedding)
    
    def forward(self, x):
        multi_head_attention = self.multi_head_attention(x, x, x)
        dropout1 = self.dropout_1(multi_head_attention)
        add_and_norm_1 = self.add_and_norm_1(x, dropout1)
        feed_forward = self.feed_forward(add_and_norm_1)
        dropout2 = self.dropout_2(feed_forward)
        add_and_norm_2 = self.add_and_norm_2(add_and_norm_1, dropout2)
        return add_and_norm_2

class Encoder(nn.Module):
    def __init__(self, heads, dim_embedding, Nx, prob_dropout=0.1):
        super().__init__()
        self.encoder_layers = nn.ModuleList([EncoderLayer(heads, dim_embedding, prob_dropout) for _ in range(Nx)])
    
    def forward(self, x):
        for encoder_layer in self.encoder_layers:
            x = encoder_layer(x)
        return x

class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, dim_embedding, max_sequence_len, heads, Nx, prob_dropout=0.1):
        super().__init__()
        self.input_embedding = Embedding(vocab_size, dim_embedding)
        self.positional_encoding = PositionalEncoding(max_sequence_len, dim_embedding)
        self.encoder = Encoder(heads, dim_embedding, Nx, prob_dropout)
    
    def forward(self, x):
        input_embedding = self.input_embedding(x)
        positional_encoding = self.positional_encoding(input_embedding)
        encoder = self.encoder(positional_encoding)
        return encoder

class DecoderLayer(nn.Module):
    def __init__(self, heads, dim_embedding, prob_dropout=0.1):
        super().__init__()
        self.masked_multi_head_attention = MultiHeadAttention(heads, dim_embedding)
        self.dropout_1 = Dropout(prob_dropout)
        self.add_and_norm_1 = AddAndNorm(dim_embedding)
        self.encoder_decoder_multi_head_attention = MultiHeadAttention(heads, dim_embedding)
        self.dropout_2 = Dropout(prob_dropout)
        self.add_and_norm_2 = AddAndNorm(dim_embedding)
        self.feed_forward = FeedForward(dim_embedding)
        self.dropout_3 = Dropout(prob_dropout)
        self.add_and_norm_3 = AddAndNorm(dim_embedding)
    
    def forward(self, x, encoder_output, mask=None):
        Q = x
        K = x
        V = x
        masked_multi_head_attention = self.masked_multi_head_attention(Q, K, V, mask=mask)
        dropout1 = self.dropout_1(masked_multi_head_attention)
        add_and_norm_1 = self.add_and_norm_1(dropout1, x)

        Q = add_and_norm_1
        K = encoder_output
        V = encoder_output
        encoder_decoder_multi_head_attention = self.encoder_decoder_multi_head_attention(Q, K, V)
        dropout2 = self.dropout_2(encoder_decoder_multi_head_attention)
        add_and_norm_2 = self.add_and_norm_2(dropout2, add_and_norm_1)

        feed_forward = self.feed_forward(add_and_norm_2)
        dropout3 = self.dropout_3(feed_forward)
        add_and_norm_3 = self.add_and_norm_3(dropout3, add_and_norm_2)

        return add_and_norm_3

class Decoder(nn.Module):
    def __init__(self, heads, dim_embedding, Nx, prob_dropout=0.1):
        super().__init__()
        self.layers = nn.ModuleList([DecoderLayer(heads, dim_embedding, prob_dropout) for _ in range(Nx)])
    
    def forward(self, x, encoder_output, mask=None):
        for decoder_layer in self.layers:
            x = decoder_layer(x, encoder_output, mask)
        return x

class TransformerDecoder(nn.Module):
    def __init__(self, heads, dim_embedding, Nx, vocab_size, max_sequence_len, prob_dropout=0.1):
        super().__init__()
        self.embedding = Embedding(vocab_size, dim_embedding)
        self.positional_encoding = PositionalEncoding(max_sequence_len, dim_embedding)
        self.decoder = Decoder(heads, dim_embedding, Nx, prob_dropout)
        self.linear = Linear(dim_embedding, vocab_size)
        # self.softmax = Softmax()
    
    def forward(self, x, encoder_output, mask=None):
        x = self.embedding(x)
        x = self.positional_encoding(x)
        x = self.decoder(x, encoder_output, mask)
        x = self.linear(x)
        # x = self.softmax(x)
        return x

class Linear_and_softmax(nn.Module):
    def __init__(self, dim_embedding, vocab_size):
        super().__init__()
        self.linear = CustomLinear(dim_embedding, vocab_size)
        # self.softmax = Softmax()
    
    def forward(self, x):
        x = self.linear(x)
        # x = self.softmax(x)
        return x


### Clase de alto nivel

In [17]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, src_max_seq_len, tgt_max_seq_len, dim_embedding, Nx, heads, prob_dropout=0.1):
        super().__init__()
        self.transformerEncoder = TransformerEncoder(src_vocab_size, dim_embedding, src_max_seq_len, heads, Nx, prob_dropout)
        self.transformerDecoder = TransformerDecoder(heads, dim_embedding, Nx, tgt_vocab_size, tgt_max_seq_len, prob_dropout)
        self.encoder = Encoder(heads, dim_embedding, Nx, prob_dropout)
        self.decoder = Decoder(heads, dim_embedding, Nx, prob_dropout)
        self.sourceEmbedding = Embedding(src_vocab_size, dim_embedding)
        self.targetEmbedding = Embedding(tgt_vocab_size, dim_embedding)
        self.sourcePositional_encoding = PositionalEncoding(src_max_seq_len, dim_embedding)
        self.targetPositional_encoding = PositionalEncoding(tgt_max_seq_len, dim_embedding)
        self.linear = Linear_and_softmax(dim_embedding, tgt_vocab_size)
    
    def encode(self, source):
        embedding = self.sourceEmbedding(source)
        positional_encoding = self.sourcePositional_encoding(embedding)
        encoder_output = self.encoder(positional_encoding)
        return encoder_output
    
    def decode(self, encoder_output, target, target_mask):
        embedding = self.targetEmbedding(target)
        positional_encoding = self.targetPositional_encoding(embedding)
        decoder_output = self.decoder(positional_encoding, encoder_output, target_mask)
        return decoder_output
    
    def projection(self, decoder_output):
        linear_output = self.linear(decoder_output)
        # softmax_output = self.softmax(linear_output)
        return linear_output
    
    def forward(self, source, target, mask=None):
        encoder_output = self.transformerEncoder(source)
        decoder_output = self.transformerDecoder(target, encoder_output, mask)
        return decoder_output


### Transformer

In [18]:
source_vocab_size = tokenizer_source.get_vocab_size()
target_vocab_size = tokenizer_target.get_vocab_size()
src_max_seq_len = max_sequence_len
tgt_max_seq_len = max_sequence_len
dim_embedding = DIM_EMBEDDING
Nx = NUM_LAYERS
heads = NUM_HEADS
prob_dropout = DROPOUT
print(f"source vocab size: {source_vocab_size}, target vocab size: {target_vocab_size}, source max sequence len: {src_max_seq_len}, target max sequence len: {tgt_max_seq_len}, dim_embedding: {dim_embedding}, heads: {heads}, Nx: {Nx}, prob_dropout: {prob_dropout}")

model = Transformer(
    src_vocab_size = source_vocab_size,
    tgt_vocab_size = target_vocab_size,
    src_max_seq_len = src_max_seq_len,
    tgt_max_seq_len = tgt_max_seq_len,
    dim_embedding = dim_embedding,
    Nx = Nx,
    heads = heads,
    prob_dropout = prob_dropout,
)

model.to(device)
print("")

source vocab size: 30000, target vocab size: 30000, source max sequence len: 311, target max sequence len: 311, dim_embedding: 512, heads: 8, Nx: 6, prob_dropout: 0.1





## Entrenamiento

### Optimizador ✔

In [19]:
optimizer = torch.optim.Adam(model.parameters(), lr=LR, betas=(0.9, 0.98), eps=1e-9)

### Función de pérdida ✔

In [20]:
loss_fn = nn.CrossEntropyLoss(
    ignore_index = tokenizer_source.token_to_id(PADDING_TOKEN), 
    label_smoothing = LABEL_SMOOTHING).to(device)


### LR ✔

In [21]:
class Step():
    def __init__(self):
        self.step = 0
    
    def set_step(self, st):
        self.step = st
    
    def get_step(self):
        return int(self.step)

class LearningRate():
    def __init__(self):
        self.lr = 0
    
    def set_lr(self, l_r_):
        self.lr = l_r_
    
    def get_lr(self):
        return self.lr

actual_step = Step()
actual_lr = LearningRate()

In [22]:
import numpy as np

def calculate_lr(step_num, dim_embeding_model=512, warmup_steps=4000):
    step_num += 1e-7 # Avoid division by zero
    step_num += STEP0
    actual_step.set_step(step_num)
    step_num_exp = -0.4
    warmup_steps_exp = -2.6
    dim_embeding_model_exp = -0.1
    lr = np.power(dim_embeding_model, dim_embeding_model_exp) * np.minimum(np.power(step_num, step_num_exp), step_num * np.power(warmup_steps, warmup_steps_exp))
    actual_lr.set_lr(lr)
    return lr

lr_lambda = lambda step: calculate_lr(step, dim_embeding_model=dim_embedding)
if LR_SCHEDULER:
    lr_scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

### Validation loop ✔

In [23]:
def greedy_decode(model, source, tokenizer_tgt, max_len, device, bs=1):
    # Retrieving the indices from the start and end of sequences of the target tokens
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')    # Start of Sentence token index (2)
    # eos_idx = tokenizer_tgt.token_to_id('[EOS]')    # End of Sentence token index (3)

    # Computing the output of the encoder for the source sequence
    encoder_output = model.encode(source)
    
    # Initializing the decoder input with the Start of Sentence token
    decoder_input = torch.empty(bs,1).fill_(sos_idx).type_as(source).to(device)
    
    # Looping until the 'max_len', maximum length, is reached
    while True:
        if decoder_input.size(1) == max_len:
            break
            
        # Building a mask for the decoder input
        decoder_mask = create_mask(decoder_input.size(1)).to(device)
        
        # Calculating the output of the decoder
        out = model.decode(encoder_output, decoder_input, decoder_mask)
        
        # Applying the projection layer to get the probabilities for the next token
        prob = model.projection(out[:, -1])

        # Selecting token with the highest probability
        _, next_word = torch.max(prob, dim=1)
        # decoder_input = torch.cat([decoder_input, torch.empty(1,1). type_as(source).fill_(next_word.item()).to(device)], dim=1)
        decoder_input = torch.cat([decoder_input, next_word.unsqueeze(1)], dim=1)
    
    if len(decoder_input.shape) == 1:
        decoder_input = decoder_input.unsqueeze(0)
    elif len(decoder_input.shape) == 3:
        decoder_input = decoder_input.squeeze(0)

    return decoder_input # Sequence of tokens generated by the decoder


In [24]:
from tqdm import tqdm

import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score

def validation_loop(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, num_examples=2):
    model.eval() # Setting model to evaluation mode

    # Calculating the number of batches in the validation dataset
    dataset_size = len(validation_ds.dataset)  # Tamaño total del conjunto de datos
    batch_size = validation_ds.batch_size      # Tamaño del batch
    drop_last = validation_ds.drop_last        # Configuración de drop_last
    num_batches = len(validation_ds)           # Número total de batches

    # Calculating the total number of samples in the validation dataset
    total_samples = batch_size * (num_batches - 1) + min(batch_size, dataset_size % batch_size)

    # If drop_last is False and the dataset size is not divisible by the batch size, we need to add one more batch
    if drop_last and dataset_size % batch_size != 0:
        total_samples -= dataset_size % batch_size

    # Initializing progress bar
    progress_bar = tqdm(range(total_samples), desc = 'Processing validation examples') # Initializing progress bar

    # Initializing lists to store scores
    bleu_scores = []
    meteor_scores = []
    
    # Creating evaluation loop
    with torch.no_grad(): # Ensuring that no gradients are computed during this process
        for batch in validation_ds:
            # Loading input data and masks onto the GPU
            encoder_input = batch['encoder_input'].to(device)
            
            # Applying the 'greedy_decode' function to get the model's output for the source text of the input batch
            num_samples = len(batch['src_text'])
            model_out_bs = greedy_decode(model, encoder_input, tokenizer_tgt, max_len, device, bs=num_samples)

            # Get metrics for every example in the batch
            for i in range(num_samples):
                source_text = batch['src_text'][i]
                target_text = batch['tgt_text'][i]
                model_out_i = model_out_bs[i]
                model_out_text = tokenizer_tgt.decode(model_out_i.detach().cpu().numpy())

                # Calculating metrics
                references = [target_text.split()]
                hypothesis = model_out_text.split()
                bleu_score = sentence_bleu(references, hypothesis)
                meteor_score_value = meteor_score(references, hypothesis)
            
                # Appending scores to lists
                bleu_scores.append(bleu_score)
                meteor_scores.append(meteor_score_value)

                # Calculating mean scores            
                mean_bleu_score = sum(bleu_scores)/len(bleu_scores) # Calculating mean BLEU score
                mean_meteor_score = sum(meteor_scores)/len(meteor_scores) # Calculating mean METEOR score

                # Updating progress bar and printing bleu and meteor scores
                progress_bar.update(1)
                progress_bar.set_postfix({'BLEU': f'{mean_bleu_score:.9f}', 'METEOR': f'{mean_meteor_score:.9f}'})

    # Printing results
    console_width = 80 # Fixed witdh for printed messages
    print('-'*console_width)
    print(f'SOURCE: {source_text}')
    print(f'TARGET: {target_text}')
    print(f'PREDICTED: {model_out_text}')
    print('-'*console_width)


In [25]:
from tqdm import tqdm

for epoch in range(0, EPOCHS):
        
    # Initializing an iterator over the training dataloader
    # We also use tqdm to display a progress bar
    print()
    batch_iterator = tqdm(train_dataloader, desc = f'Processing epoch {epoch:02d}')
    
    # For each batch...
    for batch in batch_iterator:
        model.train() # Train the model
        
        # Loading input data and masks onto the GPU
        encoder_input = batch['encoder_input'].to(device)
        decoder_input = batch['decoder_input'].to(device)
        decoder_mask = batch['decoder_mask'].to(device)
        
        # Running tensors through the Transformer
        encoder_output = model.encode(encoder_input)
        decoder_output = model.decode(encoder_output, decoder_input, decoder_mask)
        proj_output = model.projection(decoder_output)
        
        # Loading the target labels onto the GPU
        label = batch['label'].to(device)
        
        # Computing loss between model's output and true labels
        loss = loss_fn(proj_output.view(-1, tokenizer_target.get_vocab_size()), label.view(-1))
        
        # Updating progress bar, print loss and lr
        batch_iterator.set_postfix({'loss': f'{loss.item():.6f}', 'lr': f'{actual_lr.get_lr():.9f}'})
        
        # Update LR
        
        # Performing backpropagation
        loss.backward()
        
        # Updating parameters based on the gradients
        optimizer.step()
        
        # Clearing the gradients to prepare for the next batch
        optimizer.zero_grad()

        # Update step and LR
        if LR_SCHEDULER:
            lr_scheduler.step()
        
    # We run the 'run_validation' function at the end of each epoch
    # to evaluate model performance
    validation_loop(model, validation_dataloader, tokenizer_source, tokenizer_target, max_sequence_len, device, lambda msg: batch_iterator.write(msg))





Processing epoch 00: 100%|██████████| 534/534 [05:19<00:00,  1.67it/s, loss=7.395373, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:27<00:00,  3.71it/s, BLEU=0.000000000, METEOR=0.000000000] 


--------------------------------------------------------------------------------
SOURCE: 'Really?' said Vronsky, frowning.
TARGET: — Davvero? — disse Vronskij, aggrottando le sopracciglia.
PREDICTED: 
--------------------------------------------------------------------------------



Processing epoch 01: 100%|██████████| 534/534 [05:19<00:00,  1.67it/s, loss=7.143915, lr=0.000000000]
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.76it/s, BLEU=0.000000000, METEOR=0.018620699] 


--------------------------------------------------------------------------------
SOURCE: You consider my arms filled and my embraces appropriated?"
TARGET: Voi credete che le mie braccia non sieno più vuote e che i miei baci spettino a un'altra.
PREDICTED: — , , , , , , , , , . .
--------------------------------------------------------------------------------



Processing epoch 02: 100%|██████████| 534/534 [05:20<00:00,  1.67it/s, loss=6.975619, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.75it/s, BLEU=0.000000000, METEOR=0.021832330] 


--------------------------------------------------------------------------------
SOURCE: I did not altogether like to give in, though I did not relish the plunge.
TARGET: Io non volevo rinunziare interamente a un tuffo, benchè non mi sorridesse.
PREDICTED: — E , , , , , , , , , , , , , , , , , , , , , , , , , .
--------------------------------------------------------------------------------



Processing epoch 03: 100%|██████████| 534/534 [05:19<00:00,  1.67it/s, loss=6.824063, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.74it/s, BLEU=0.000000000, METEOR=0.024572653] 


--------------------------------------------------------------------------------
SOURCE: You will think me superstitious,--some superstition I have in my blood, and always had: nevertheless, this is true--true at least it is that I heard what I now relate.
TARGET: Sentite qualcosa di più strano e mi crederete superstizioso. "È certo che ho avuto sempre un po' di superstizione nel sangue; ma assicuratevi che quello che sto per dirvi è vero.
PREDICTED: — Non , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . , , , , , , , , , , , , , . , , , , , , , , , , . , , , , , , , . , , , , , . , , , , , . , , , , , . , , , , . , , , . , , , , . , , , . . , , , , . . , , . , , , . . . , , , , , . . . , , . . , , , . . . , , , , . . . . , , , , , , . . . , , , . . . . . , , , , , . . . . . . . . . . , , , , , . . . . . . . . . . . . . . . .
---------------------------------------------------

Processing epoch 04: 100%|██████████| 534/534 [05:19<00:00,  1.67it/s, loss=6.802623, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.74it/s, BLEU=0.000000000, METEOR=0.029419585] 


--------------------------------------------------------------------------------
SOURCE: 'I don't see that it is a joke, that...' began Levin, but Koznyshev interrupted him.
TARGET: — Io non vedo come questo sia uno scherzo, che... — voleva cominciare Levin, ma Sergej Ivanovic lo interruppe.
PREDICTED: — Ma , , , , , , , , , , , , , , , , , , , , , , , , . , , . . . . . . . . . . .
--------------------------------------------------------------------------------



Processing epoch 05: 100%|██████████| 534/534 [05:19<00:00,  1.67it/s, loss=6.772199, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.74it/s, BLEU=0.000000000, METEOR=0.027948133] 


--------------------------------------------------------------------------------
SOURCE: The two sisters nursed all the six children successfully through the illness, but Kitty's health did not improve, and in Lent the Shcherbatskys went abroad.
TARGET: Tutte e due le sorelle portarono felicemente a guarigione i sei piccoli, ma la salute di Kitty non migliorò, e durante la quaresima gli Šcerbackij partirono per l’estero.
PREDICTED: E , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , 

Processing epoch 06: 100%|██████████| 534/534 [05:19<00:00,  1.67it/s, loss=6.752402, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.73it/s, BLEU=0.000000000, METEOR=0.027620220] 


--------------------------------------------------------------------------------
SOURCE: Karenin sat down on a chair and with a look full of suffering and weariness watched the nurse as she paced the room.
TARGET: Aleksej Aleksandrovic si era seduto su di una sedia e col viso abbattuto e sofferente, guardava la njanja che andava avanti e indietro.
PREDICTED: Aleksej Aleksandrovic e e e e e e e e e e e e e . e . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
--------------------------------------------------------------------------------



Processing epoch 07: 100%|██████████| 534/534 [05:19<00:00,  1.67it/s, loss=6.768605, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.73it/s, BLEU=0.000000000, METEOR=0.028556046] 


--------------------------------------------------------------------------------
SOURCE: At ten o'clock Vronsky returned.
TARGET: Alle dieci venne Vronskij.
PREDICTED: La ’ ’ ’ ’ ’ ’ si si si si si si . . . . .
--------------------------------------------------------------------------------



Processing epoch 08: 100%|██████████| 534/534 [05:19<00:00,  1.67it/s, loss=6.923685, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:27<00:00,  3.72it/s, BLEU=0.000000000, METEOR=0.028195759] 


--------------------------------------------------------------------------------
SOURCE: I had closed my shutter, laid a mat to the door to prevent the snow from blowing in under it, trimmed my fire, and after sitting nearly an hour on the hearth listening to the muffled fury of the tempest, I lit a candle, took down "Marmion," and beginning--
TARGET: Avevo chiuso le imposte e steso una stuoia dalla parte interna della porta, perché la neve non passasse di sotto, e dopo essere stata un'ora accanto al fuoco ascoltando il rumore della tempesta, avevo preso Marmion e m'ero messa a leggere la strofa seguente:
PREDICTED: La mi mi , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , ,

Processing epoch 09: 100%|██████████| 534/534 [05:20<00:00,  1.67it/s, loss=6.777938, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:27<00:00,  3.72it/s, BLEU=0.000000000, METEOR=0.025119673] 


--------------------------------------------------------------------------------
SOURCE: “Well, then,” said I, “you may let them escape; and Providence seems to have awakened them on purpose to save themselves. Now,” says I, “if the rest escape you, it is your fault.”
TARGET: Eccitato da queste parole prese su un dei moschetti che gli aveva dati, e postasi una pistola nella cintura e armati con gli altri due moschetti i suoi due compagni, s’avviò insieme con essi che lo precedevano d’alcuni passi.
PREDICTED: — E , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , 

Processing epoch 10: 100%|██████████| 534/534 [05:20<00:00,  1.67it/s, loss=6.578362, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:27<00:00,  3.72it/s, BLEU=0.000000000, METEOR=0.028544474] 


--------------------------------------------------------------------------------
SOURCE: You will think me superstitious,--some superstition I have in my blood, and always had: nevertheless, this is true--true at least it is that I heard what I now relate.
TARGET: Sentite qualcosa di più strano e mi crederete superstizioso. "È certo che ho avuto sempre un po' di superstizione nel sangue; ma assicuratevi che quello che sto per dirvi è vero.
PREDICTED: " Ma che che che mi mi mi mi mi mi mi mi mi mi mi mi mi mi mi che mi che che che che che che che che che che che che che che che che che che che che che che che che che che che , che , che , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , 

Processing epoch 11: 100%|██████████| 534/534 [05:20<00:00,  1.67it/s, loss=6.381001, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:27<00:00,  3.72it/s, BLEU=0.000000000, METEOR=0.034797936] 


--------------------------------------------------------------------------------
SOURCE: 'Between the potatoes. We too rent a little land.
TARGET: Anche noi teniamo un pezzetto di terra.
PREDICTED: — , , . . . . . . . . . . . .
--------------------------------------------------------------------------------



Processing epoch 12: 100%|██████████| 534/534 [05:19<00:00,  1.67it/s, loss=6.513088, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:27<00:00,  3.72it/s, BLEU=0.000921218, METEOR=0.037130153] 


--------------------------------------------------------------------------------
SOURCE: So it happens with fortune, who shows her power where valour has not prepared to resist her, and thither she turns her forces where she knows that barriers and defences have not been raised to constrain her.
TARGET: Similmente interviene della fortuna: la quale dimonstra la sua potenzia dove non è ordinata virtù a resisterle, e quivi volta li sua impeti, dove la sa che non sono fatti li argini e li ripari a tenerla.
PREDICTED: E la , , , , , , , , , , , , , , , , , la , la , la . , , , . , , . , la . , . , . . . , la . . . . . . . . . , la . . . . . . . . . . . . . . . . . . . . . . . . . . , , la . . . . . . . . . . . . . , . , la . . . . . , , . . . , la . , , , , , , . , la la , , . , la la la la la la la la la la la la la la la la la la la , la , la la la la la la la la la . , la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la la
-----------------------

Processing epoch 13: 100%|██████████| 534/534 [05:20<00:00,  1.67it/s, loss=6.364981, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:27<00:00,  3.72it/s, BLEU=0.000000000, METEOR=0.036197087] 


--------------------------------------------------------------------------------
SOURCE: Directly Levin approached the bath he was shown an experiment which succeeded perfectly.
TARGET: Non appena Levin si fu avvicinato alla vasca da bagno, gli fu offerto subito un esperimento, e l’esperimento riuscì in pieno.
PREDICTED: Levin Levin , Levin , Levin , Levin , ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ ’ Levin . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
--------------------------------------------------------------------------------



Processing epoch 14: 100%|██████████| 534/534 [05:20<00:00,  1.67it/s, loss=6.459440, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.73it/s, BLEU=0.000921218, METEOR=0.040326397] 


--------------------------------------------------------------------------------
SOURCE: Levin was surprised that they disputed about it so long, especially as, when he asked Koznyshev whether he thought that money had been misappropriated, he received the reply:
TARGET: Levin era sorpreso che si discutesse così a lungo di questo, soprattutto perché quando aveva chiesto a Sergej Ivanovic se egli supponeva che le somme fossero state malversate, Sergej Ivanovic aveva risposto:
PREDICTED: E , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , che che che che che che che che che che che che che che che che che che che che che che che che che che che , , , che che che che , , che che che che che , , , , che che che che che , , , , , che che che che che che che che che che che , , , , , , ,

Processing epoch 15: 100%|██████████| 534/534 [05:20<00:00,  1.67it/s, loss=6.122970, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.73it/s, BLEU=0.001067039, METEOR=0.050732385] 


--------------------------------------------------------------------------------
SOURCE: But really what has he done...
TARGET: Ma cosa mai ha fatto?
PREDICTED: Ma è , è è ? ? !
--------------------------------------------------------------------------------



Processing epoch 16: 100%|██████████| 534/534 [05:20<00:00,  1.67it/s, loss=6.341681, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.73it/s, BLEU=0.001268931, METEOR=0.052227726] 


--------------------------------------------------------------------------------
SOURCE: As they were having a chat before dinner, Oblonsky had said to Bartnyansky:
TARGET: Prima di pranzo, messisi a parlare, Stepan Arkad’ic aveva detto a Bartnjanskij:
PREDICTED: Stepan ’ ic , Stepan ’ ic , Stepan ’ ic , Stepan ’ ic , Stepan ’ ic , un ic . , si ic , si ic , si un ic , si un ic , si un ic . , si un ic , si un ic , si si si si si si si si si si si si un ic , si un ic , un un un un ic . , , , , , , , , , , , , , , , , , , , , , , . , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . , , , , , , , , , , , , , , , , , , , , , , , . , , , , , , , , , , , , , , , , , , , , , , , . , , , , , , , , , , , , , , , , , , , , , , , , , , . , , , , , , un ic , , , , , , , , , , , , , , , , , , , , , , , , , , , . , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , , . ,
-------------------------------------------------------------

Processing epoch 17: 100%|██████████| 534/534 [05:19<00:00,  1.67it/s, loss=6.165667, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.73it/s, BLEU=0.001268931, METEOR=0.056350456] 


--------------------------------------------------------------------------------
SOURCE: I was near, and listened to both you and her.
TARGET: "Ero vicino e ho sentito la vostra conversazione.
PREDICTED: e e e e e e e e e e e . e e e e e . e e e e e e e e e e e e e e e e e e e e . e e e e e e e e e e e e e e e e e e e . e e e e e e e e e e e e e e e e e . e e e e e e e e e e . . e e e e e e e e e e e e e e e e e e e e e e e e . . . . . . . . . . . . . .
--------------------------------------------------------------------------------



Processing epoch 18: 100%|██████████| 534/534 [05:19<00:00,  1.67it/s, loss=6.211956, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.73it/s, BLEU=0.001067039, METEOR=0.053918978] 


--------------------------------------------------------------------------------
SOURCE: Then these folks would go and be pirates until the marriage was over.
TARGET: Meglio sparire, in attesa della celebrazione del matrimonio.
PREDICTED: E il ’ era più più , più più più più più più più più . che più più più più . . . . . . più più più più più più più più più più più più più più più più più più più . . . . . . . . più più più più più più più più più più più più più più più più più più più . . . . . . . . . . . . . . . . . . . . . . più più più più più più più più più più più più più più più più più più più più più più più più più più più più più più più più più più più più più più più . . . . . . . . . . . . . . . . . . . . più più più più più più più più più più più più più più più più più più più più più più più più più più più più . . più più più più più più più più più più più più più più più più più più più più . . .
----------------------------------------------------------------

Processing epoch 19: 100%|██████████| 534/534 [05:20<00:00,  1.67it/s, loss=6.264954, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.73it/s, BLEU=0.001067039, METEOR=0.054122787] 


--------------------------------------------------------------------------------
SOURCE: "No, but I thought you would never come. I could not bear to wait in the house for you, especially with this rain and wind."
TARGET: — No, ma mi pareva che non sareste tornato più e non potevo aspettarvi tranquillamente a casa, sopratutto con quest'acqua e con questo vento.
PREDICTED: — Non ho ho ho ho ho ho ho ho ho ho me , ma non me , non me , non me , non me , non me . , non me . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
--------------------------------------------------------------------------------



Processing epoch 20: 100%|██████████| 534/534 [05:20<00:00,  1.67it/s, loss=5.954166, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.73it/s, BLEU=0.000921218, METEOR=0.056208441] 


--------------------------------------------------------------------------------
SOURCE: It's the Judge's song out of PINAFORE - no, I don't mean PINAFORE - I mean - you know what I mean - the other thing, you know.
TARGET: È la canzone del Giudice del «Pinafore»... No, non volevo dire il «Pinafore»... volevo dire... già sapete ciò che volevo dire... quell’altro, sapete.
PREDICTED: Io non ho ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci ci . . .
--------------------------------------------------------------------------------



Processing epoch 21: 100%|██████████| 534/534 [05:19<00:00,  1.67it/s, loss=5.959284, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.73it/s, BLEU=0.000921218, METEOR=0.063135577] 


--------------------------------------------------------------------------------
SOURCE: She came and shook hand with me when she heard that I was her governess; and as I led her in to breakfast, I addressed some phrases to her in her own tongue: she replied briefly at first, but after we were seated at the table, and she had examined me some ten minutes with her large hazel eyes, she suddenly commenced chattering fluently.
TARGET: Nel condurla a colazione le rivolsi alcune parole nella sua lingua, alle quali rispose brevemente, ma dopo, a tavola, mi fissò con i suoi occhietti castani e incominciò a ciarlare.
PREDICTED: Ella mi occhi , mi occhi , e mi occhi a occhi , e mi occhi a occhi , e mi occhi a occhi a occhi , e mi occhi , e mi occhi a occhi a occhi a occhi , e la , e mi occhi a occhi , e mi occhi , e la . a occhi a occhi a occhi . a occhi a occhi a occhi , mi occhi . , mi occhi , mi occhi , mi occhi , mi occhi . , mi . , mi , mi . , mi , mi me . , mi me a . , mi me a . , mi . . 

Processing epoch 22: 100%|██████████| 534/534 [05:20<00:00,  1.67it/s, loss=5.839463, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.73it/s, BLEU=0.000724147, METEOR=0.065975296] 


--------------------------------------------------------------------------------
SOURCE: And to do this we must lower the level of cultivation and give the peasants an interest in its success.
TARGET: Ma per fare ciò, occorre abbassare il livello dell’azienda e interessare i lavoratori alla prosperità di questa.
PREDICTED: E che il che il il che il . il ’ . . . . . . . . . . . . . . . . . . . . .
--------------------------------------------------------------------------------



Processing epoch 23: 100%|██████████| 534/534 [05:20<00:00,  1.66it/s, loss=5.937834, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:26<00:00,  3.73it/s, BLEU=0.001067039, METEOR=0.066425782] 


--------------------------------------------------------------------------------
SOURCE: This pale crescent was "the likeness of a kingly crown;" what it diademed was "the shape which shape had none."
TARGET: Quella pallida aureola era l'emblema di una corona reale e circondava una testa senza corpo.
PREDICTED: La giorno era era era era il , ma il ' era era , ma era . che era era era . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
--------------------------------------------------------------------------------



Processing epoch 24: 100%|██████████| 534/534 [05:20<00:00,  1.67it/s, loss=6.060523, lr=0.000000000]
Processing validation examples: 100%|██████████| 324/324 [01:27<00:00,  3.72it/s, BLEU=0.000654340, METEOR=0.065190568] 


--------------------------------------------------------------------------------
SOURCE: Yet I saw abundance of fowls, but knew not their kinds; neither when I killed them could I tell what was fit for food, and what not.
TARGET: Notai bensì una grande abbondanza di volatili senza conoscerne le specie, e senza poter nemmeno sapere, quando ne ebbi uccisi alcuni, quali fossero buoni per cibarsene e quali no.
PREDICTED: Non non che non non che non che non che non che che che . non che non . che non che non . che non . che non non . che non . che non . che non . che non . che non . . che non . . che non . che non . . . che non . . . che non . . . che non . che non non non non . . . . . . . . . . . . . . che non . . non . . . . . . . . . . . . . . . . non non non non non non non non non non . . . . . . . . . . . . . . . . . . non . . . . non . . . non . . . . . . non . . . non . . . . . . . . . . . . . .
--------------------------------------------------------------------------------



Processing epoch 25:  56%|█████▌    | 297/534 [02:58<02:22,  1.66it/s, loss=5.846130, lr=0.000000000]


KeyboardInterrupt: 