[Transformer From Scratch With PyTorch](https://www.kaggle.com/code/lusfernandotorres/transformer-from-scratch-with-pytorch?scriptVersionId=157489239)

In [1]:
# Importing libraries

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

# Math
import math

# HuggingFace libraries 
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

# Pathlib 
from pathlib import Path

# typing
from typing import Any

# Library for progress bars in loops
from tqdm import tqdm

# Importing library of warnings
import warnings

MI_TRANSFORMER = True

In [2]:
MI_TRANSFORMER = False

# Creating Input Embeddings
class InputEmbeddings(nn.Module):
    
    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model # Dimension of vectors (512)
        self.vocab_size = vocab_size # Size of the vocabulary
        self.embedding = nn.Embedding(vocab_size, d_model) # PyTorch layer that converts integer indices to dense embeddings
        
    def forward(self, x):
        if MI_TRANSFORMER:
            return self.embedding(x)
        else:
            return self.embedding(x) * math.sqrt(self.d_model) # Normalizing the variance of the embeddings

MI_TRANSFORMER = True

In [3]:
MI_TRANSFORMER = False

if MI_TRANSFORMER:
    class PositionalEncoding(nn.Module):
        def __init__(self, max_sequence_len, embedding_model_dim, dropout=0.1):
            super().__init__()
            self.embedding_dim = embedding_model_dim
            positional_encoding = torch.zeros(max_sequence_len, self.embedding_dim)
            for pos in range(max_sequence_len):
                for i in range(0, self.embedding_dim, 2):
                    positional_encoding[pos, i]     = torch.sin(torch.tensor(pos / (10000 ** ((2 * i) / self.embedding_dim))))
                    positional_encoding[pos, i + 1] = torch.cos(torch.tensor(pos / (10000 ** ((2 * (i+1)) / self.embedding_dim))))
            positional_encoding = positional_encoding.unsqueeze(0)
            self.register_buffer('positional_encoding', positional_encoding)

        def forward(self, x):
            x = x * torch.sqrt(torch.tensor(self.embedding_dim))
            sequence_len = x.size(1)
            x = x + self.positional_encoding[:,:sequence_len]
            return x
else:
    # Creating the Positional Encoding
    class PositionalEncoding(nn.Module):
        
        def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
            super().__init__()
            self.d_model = d_model # Dimensionality of the model
            self.seq_len = seq_len # Maximum sequence length
            self.dropout = nn.Dropout(dropout) # Dropout layer to prevent overfitting
            
            # Creating a positional encoding matrix of shape (seq_len, d_model) filled with zeros
            pe = torch.zeros(seq_len, d_model) 
            
            # Creating a tensor representing positions (0 to seq_len - 1)
            position = torch.arange(0, seq_len, dtype = torch.float).unsqueeze(1) # Transforming 'position' into a 2D tensor['seq_len, 1']
            
            # Creating the division term for the positional encoding formula
            div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
            
            # Apply sine to even indices in pe
            pe[:, 0::2] = torch.sin(position * div_term)
            # Apply cosine to odd indices in pe
            pe[:, 1::2] = torch.cos(position * div_term)
            
            # Adding an extra dimension at the beginning of pe matrix for batch handling
            pe = pe.unsqueeze(0)
            
            # Registering 'pe' as buffer. Buffer is a tensor not considered as a model parameter
            self.register_buffer('pe', pe) 
            
        def forward(self,x):
            # Addind positional encoding to the input tensor X
            x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False)
            return self.dropout(x) # Dropout for regularization

MI_TRANSFORMER = True

In [4]:
# Creating Layer Normalization
class LayerNormalization(nn.Module):
    
    def __init__(self, eps: float = 10**-6) -> None: # We define epsilon as 0.000001 to avoid division by zero
        super().__init__()
        self.eps = eps
        
        # We define alpha as a trainable parameter and initialize it with ones
        self.alpha = nn.Parameter(torch.ones(1)) # One-dimensional tensor that will be used to scale the input data
        
        # We define bias as a trainable parameter and initialize it with zeros
        self.bias = nn.Parameter(torch.zeros(1)) # One-dimensional tenso that will be added to the input data
        
    def forward(self, x):
        mean = x.mean(dim = -1, keepdim = True) # Computing the mean of the input data. Keeping the number of dimensions unchanged
        std = x.std(dim = -1, keepdim = True) # Computing the standard deviation of the input data. Keeping the number of dimensions unchanged
        
        # Returning the normalized input
        return self.alpha * (x-mean) / (std + self.eps) + self.bias

In [5]:
# Creating Feed Forward Layers
class FeedForwardBlock(nn.Module):
    
    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        # First linear transformation
        self.linear_1 = nn.Linear(d_model, d_ff) # W1 & b1
        self.dropout = nn.Dropout(dropout) # Dropout to prevent overfitting
        # Second linear transformation
        self.linear_2 = nn.Linear(d_ff, d_model) # W2 & b2
        
    def forward(self, x):
        # (Batch, seq_len, d_model) --> (batch, seq_len, d_ff) -->(batch, seq_len, d_model)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

In [6]:
# Creating the Multi-Head Attention block
class MultiHeadAttentionBlock(nn.Module):
    
    def __init__(self, d_model: int, h: int, dropout: float) -> None: # h = number of heads
        super().__init__()
        self.d_model = d_model
        self.h = h
        
        # We ensure that the dimensions of the model is divisible by the number of heads
        assert d_model % h == 0, 'd_model is not divisible by h'
        
        # d_k is the dimension of each attention head's key, query, and value vectors
        self.d_k = d_model // h # d_k formula, like in the original "Attention Is All You Need" paper
        
        # Defining the weight matrices
        self.w_q = nn.Linear(d_model, d_model) # W_q
        self.w_k = nn.Linear(d_model, d_model) # W_k
        self.w_v = nn.Linear(d_model, d_model) # W_v
        self.w_o = nn.Linear(d_model, d_model) # W_o
        
        self.dropout = nn.Dropout(dropout) # Dropout layer to avoid overfitting
        
    
    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):# mask => When we want certain words to NOT interact with others, we "hide" them
        
        d_k = query.shape[-1] # The last dimension of query, key, and value
        
        # We calculate the Attention(Q,K,V) as in the formula in the image above 
        attention_scores = (query @ key.transpose(-2,-1)) / math.sqrt(d_k) # @ = Matrix multiplication sign in PyTorch
        
        # Before applying the softmax, we apply the mask to hide some interactions between words
        if mask is not None: # If a mask IS defined...
            attention_scores.masked_fill_(mask == 0, -1e9) # Replace each value where mask is equal to 0 by -1e9
        attention_scores = attention_scores.softmax(dim = -1) # Applying softmax
        if dropout is not None: # If a dropout IS defined...
            attention_scores = dropout(attention_scores) # We apply dropout to prevent overfitting
            
        return (attention_scores @ value), attention_scores # Multiply the output matrix by the V matrix, as in the formula
        
    def forward(self, q, k, v, mask): 
        
        query = self.w_q(q) # Q' matrix
        key = self.w_k(k) # K' matrix
        value = self.w_v(v) # V' matrix
        
        
        # Splitting results into smaller matrices for the different heads
        # Splitting embeddings (third dimension) into h parts
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1,2) # Transpose => bring the head to the second dimension
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1,2) # Transpose => bring the head to the second dimension
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1,2) # Transpose => bring the head to the second dimension
        
        # Obtaining the output and the attention scores
        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)
        
        # Obtaining the H matrix
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)
        
        return self.w_o(x) # Multiply the H matrix by the weight matrix W_o, resulting in the MH-A matrix

In [7]:
# Building Residual Connection
class ResidualConnection(nn.Module):
    def __init__(self, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout) # We use a dropout layer to prevent overfitting
        self.norm = LayerNormalization() # We use a normalization layer 
    
    def forward(self, x, sublayer):
        # We normalize the input and add it to the original input 'x'. This creates the residual connection process.
        return x + self.dropout(sublayer(self.norm(x))) 

In [8]:
# Building Encoder Block
class EncoderBlock(nn.Module):
    
    # This block takes in the MultiHeadAttentionBlock and FeedForwardBlock, as well as the dropout rate for the residual connections
    def __init__(self, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        # Storing the self-attention block and feed-forward block
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)]) # 2 Residual Connections with dropout
        
    def forward(self, x, src_mask):
        # Applying the first residual connection with the self-attention block
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask)) # Three 'x's corresponding to query, key, and value inputs plus source mask
        
        # Applying the second residual connection with the feed-forward block 
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x # Output tensor after applying self-attention and feed-forward layers with residual connections.

In [9]:
# Building Encoder 
# An Encoder can have several Encoder Blocks
class Encoder(nn.Module):
    
    # The Encoder takes in instances of 'EncoderBlock'
    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers # Storing the EncoderBlocks
        self.norm = LayerNormalization() # Layer for the normalization of the output of the encoder layers
        
    def forward(self, x, mask):
        # Iterating over each EncoderBlock stored in self.layers
        for layer in self.layers:
            x = layer(x, mask) # Applying each EncoderBlock to the input tensor 'x'
        return self.norm(x) # Normalizing output

In [10]:
# Building Decoder Block
class DecoderBlock(nn.Module):
    
    # The DecoderBlock takes in two MultiHeadAttentionBlock. One is self-attention, while the other is cross-attention.
    # It also takes in the feed-forward block and the dropout rate
    def __init__(self,  self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)]) # List of three Residual Connections with dropout rate
        
    def forward(self, x, encoder_output, src_mask, tgt_mask):
        
        # Self-Attention block with query, key, and value plus the target language mask
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, tgt_mask))
        
        # The Cross-Attention block using two 'encoder_ouput's for key and value plus the source language mask. It also takes in 'x' for Decoder queries
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        
        # Feed-forward block with residual connections
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x

In [11]:
# Building Decoder
# A Decoder can have several Decoder Blocks
class Decoder(nn.Module):
    
    # The Decoder takes in instances of 'DecoderBlock'
    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        
        # Storing the 'DecoderBlock's
        self.layers = layers
        self.norm = LayerNormalization() # Layer to normalize the output
        
    def forward(self, x, encoder_output, src_mask, tgt_mask):
        
        # Iterating over each DecoderBlock stored in self.layers
        for layer in self.layers:
            # Applies each DecoderBlock to the input 'x' plus the encoder output and source and target masks
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x) # Returns normalized output

In [12]:
# Buiding Linear Layer
class ProjectionLayer(nn.Module):
    def __init__(self, d_model: int, vocab_size: int) -> None: # Model dimension and the size of the output vocabulary
        super().__init__()
        self.proj = nn.Linear(d_model, vocab_size) # Linear layer for projecting the feature space of 'd_model' to the output space of 'vocab_size'
    def forward(self, x):
        return torch.log_softmax(self.proj(x), dim = -1) # Applying the log Softmax function to the output

In [13]:
# Creating the Transformer Architecture
class Transformer(nn.Module):
    
    # This takes in the encoder and decoder, as well the embeddings for the source and target language.
    # It also takes in the Positional Encoding for the source and target language, as well as the projection layer
    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings, src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, projection_layer: ProjectionLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection_layer = projection_layer
        
    # Encoder     
    def encode(self, src, src_mask):
        src = self.src_embed(src) # Applying source embeddings to the input source language
        src = self.src_pos(src) # Applying source positional encoding to the source embeddings
        return self.encoder(src, src_mask) # Returning the source embeddings plus a source mask to prevent attention to certain elements
    
    # Decoder
    def decode(self, encoder_output, src_mask, tgt, tgt_mask):
        tgt = self.tgt_embed(tgt) # Applying target embeddings to the input target language (tgt)
        tgt = self.tgt_pos(tgt) # Applying target positional encoding to the target embeddings
        
        # Returning the target embeddings, the output of the encoder, and both source and target masks
        # The target mask ensures that the model won't 'see' future elements of the sequence
        return self.decoder(tgt, encoder_output, src_mask, tgt_mask)
    
    # Applying Projection Layer with the Softmax function to the Decoder output
    def project(self, x):
        return self.projection_layer(x)

In [14]:
# Building & Initializing Transformer

# Definin function and its parameter, including model dimension, number of encoder and decoder stacks, heads, etc.
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_seq_len: int, tgt_seq_len: int, d_model: int = 512, N: int = 6, h: int = 8, dropout: float = 0.1, d_ff: int = 2048) -> Transformer:
    
    # Creating Embedding layers
    src_embed = InputEmbeddings(d_model, src_vocab_size) # Source language (Source Vocabulary to 512-dimensional vectors)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size) # Target language (Target Vocabulary to 512-dimensional vectors)
    
    # Creating Positional Encoding layers
    src_pos = PositionalEncoding(d_model, src_seq_len, dropout) # Positional encoding for the source language embeddings
    tgt_pos = PositionalEncoding(d_model, tgt_seq_len, dropout) # Positional encoding for the target language embeddings
    
    # Creating EncoderBlocks
    encoder_blocks = [] # Initial list of empty EncoderBlocks
    for _ in range(N): # Iterating 'N' times to create 'N' EncoderBlocks (N = 6)
        encoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout) # Self-Attention
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout) # FeedForward
        
        # Combine layers into an EncoderBlock
        encoder_block = EncoderBlock(encoder_self_attention_block, feed_forward_block, dropout)
        encoder_blocks.append(encoder_block) # Appending EncoderBlock to the list of EncoderBlocks
        
    # Creating DecoderBlocks
    decoder_blocks = [] # Initial list of empty DecoderBlocks
    for _ in range(N): # Iterating 'N' times to create 'N' DecoderBlocks (N = 6)
        decoder_self_attention_block = MultiHeadAttentionBlock(d_model, h, dropout) # Self-Attention
        decoder_cross_attention_block = MultiHeadAttentionBlock(d_model, h, dropout) # Cross-Attention
        feed_forward_block = FeedForwardBlock(d_model, d_ff, dropout) # FeedForward
        
        # Combining layers into a DecoderBlock
        decoder_block = DecoderBlock(decoder_self_attention_block, decoder_cross_attention_block, feed_forward_block, dropout)
        decoder_blocks.append(decoder_block) # Appending DecoderBlock to the list of DecoderBlocks
        
    # Creating the Encoder and Decoder by using the EncoderBlocks and DecoderBlocks lists
    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))
    
    # Creating projection layer
    projection_layer = ProjectionLayer(d_model, tgt_vocab_size) # Map the output of Decoder to the Target Vocabulary Space
    
    # Creating the transformer by combining everything above
    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos, projection_layer)
    
    # Initialize the parameters
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
            
    return transformer # Assembled and initialized Transformer. Ready to be trained and validated!

In [15]:
# Defining Tokenizer
def build_tokenizer(config, ds, lang):
    
    # Crating a file path for the tokenizer 
    tokenizer_path = Path(config['tokenizer_file'].format(lang))
    
    # Checking if Tokenizer already exists
    if not Path.exists(tokenizer_path): 
        
        # If it doesn't exist, we create a new one
        tokenizer = Tokenizer(WordLevel(unk_token = '[UNK]')) # Initializing a new world-level tokenizer
        tokenizer.pre_tokenizer = Whitespace() # We will split the text into tokens based on whitespace
        
        # Creating a trainer for the new tokenizer
        trainer = WordLevelTrainer(special_tokens = ["[UNK]", "[PAD]", 
                                                     "[SOS]", "[EOS]"], min_frequency = 2) # Defining Word Level strategy and special tokens
        
        # Training new tokenizer on sentences from the dataset and language specified 
        tokenizer.train_from_iterator(get_all_sentences(ds, lang), trainer = trainer)
        tokenizer.save(str(tokenizer_path)) # Saving trained tokenizer to the file path specified at the beginning of the function
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path)) # If the tokenizer already exist, we load it
    return tokenizer # Returns the loaded tokenizer or the trained tokenizer

In [16]:
# Iterating through dataset to extract the original sentence and its translation 
def get_all_sentences(ds, lang):
    for pair in ds:
        yield pair['translation'][lang]

In [17]:
def get_ds(config):
    
    # Loading the train portion of the OpusBooks dataset.
    # The Language pairs will be defined in the 'config' dictionary we will build later
    ds_raw = load_dataset('opus_books', f'{config["lang_src"]}-{config["lang_tgt"]}', split = 'train') 
    
    # Building or loading tokenizer for both the source and target languages 
    tokenizer_src = build_tokenizer(config, ds_raw, config['lang_src'])
    tokenizer_tgt = build_tokenizer(config, ds_raw, config['lang_tgt'])
    
    # Splitting the dataset for training and validation 
    train_ds_size = int(0.9 * len(ds_raw)) # 90% for training
    val_ds_size = len(ds_raw) - train_ds_size # 10% for validation
    train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size]) # Randomly splitting the dataset
                                    
    # Processing data with the BilingualDataset class, which we will define below
    train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
    val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
                                    
    # Iterating over the entire dataset and printing the maximum length found in the sentences of both the source and target languages
    max_len_src = 0
    max_len_tgt = 0
    for pair in ds_raw:
        src_ids = tokenizer_src.encode(pair['translation'][config['lang_src']]).ids
        tgt_ids = tokenizer_src.encode(pair['translation'][config['lang_tgt']]).ids
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))
        
    print(f'Max length of source sentence: {max_len_src}')
    print(f'Max length of target sentence: {max_len_tgt}')
    
    # Creating dataloaders for the training and validadion sets
    # Dataloaders are used to iterate over the dataset in batches during training and validation
    train_dataloader = DataLoader(train_ds, batch_size = config['batch_size'], shuffle = True) # Batch size will be defined in the config dictionary
    val_dataloader = DataLoader(val_ds, batch_size = 1, shuffle = True)
    
    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt # Returning the DataLoader objects and tokenizers

In [18]:
def casual_mask(size):
        # Creating a square matrix of dimensions 'size x size' filled with ones
        mask = torch.triu(torch.ones(1, size, size), diagonal = 1).type(torch.int)
        return mask == 0

In [19]:
class BilingualDataset(Dataset):
    
    # This takes in the dataset contaning sentence pairs, the tokenizers for target and source languages, and the strings of source and target languages
    # 'seq_len' defines the sequence length for both languages
    def __init__(self, ds, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len) -> None:
        super().__init__()
        
        self.seq_len = seq_len
        self.ds = ds
        self.tokenizer_src = tokenizer_src
        self.tokenizer_tgt = tokenizer_tgt
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        
        # Defining special tokens by using the target language tokenizer
        self.sos_token = torch.tensor([tokenizer_tgt.token_to_id("[SOS]")], dtype=torch.int64)
        self.eos_token = torch.tensor([tokenizer_tgt.token_to_id("[EOS]")], dtype=torch.int64)
        self.pad_token = torch.tensor([tokenizer_tgt.token_to_id("[PAD]")], dtype=torch.int64)

        
    # Total number of instances in the dataset (some pairs are larger than others)
    def __len__(self):
        return len(self.ds)
    
    # Using the index to retrive source and target texts
    def __getitem__(self, index: Any) -> Any:
        src_target_pair = self.ds[index]
        src_text = src_target_pair['translation'][self.src_lang]
        tgt_text = src_target_pair['translation'][self.tgt_lang]
        
        # Tokenizing source and target texts 
        enc_input_tokens = self.tokenizer_src.encode(src_text).ids
        dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids
        
        # Computing how many padding tokens need to be added to the tokenized texts 
        # Source tokens
        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2 # Subtracting the two '[EOS]' and '[SOS]' special tokens
        # Target tokens
        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1 # Subtracting the '[SOS]' special token
        
        # If the texts exceed the 'seq_len' allowed, it will raise an error. This means that one of the sentences in the pair is too long to be processed
        # given the current sequence length limit (this will be defined in the config dictionary below)
        if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
            raise ValueError('Sentence is too long')
         
        # Building the encoder input tensor by combining several elements
        encoder_input = torch.cat(
            [
            self.sos_token, # inserting the '[SOS]' token
            torch.tensor(enc_input_tokens, dtype = torch.int64), # Inserting the tokenized source text
            self.eos_token, # Inserting the '[EOS]' token
            torch.tensor([self.pad_token] * enc_num_padding_tokens, dtype = torch.int64) # Addind padding tokens
            ]
        )
        
        # Building the decoder input tensor by combining several elements
        decoder_input = torch.cat(
            [
                self.sos_token, # inserting the '[SOS]' token 
                torch.tensor(dec_input_tokens, dtype = torch.int64), # Inserting the tokenized target text
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype = torch.int64) # Addind padding tokens
            ]
        
        )
        
        # Creating a label tensor, the expected output for training the model
        label = torch.cat(
            [
                torch.tensor(dec_input_tokens, dtype = torch.int64), # Inserting the tokenized target text
                self.eos_token, # Inserting the '[EOS]' token 
                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype = torch.int64) # Adding padding tokens
                
            ]
        )
        
        # Ensuring that the length of each tensor above is equal to the defined 'seq_len'
        assert encoder_input.size(0) == self.seq_len
        assert decoder_input.size(0) == self.seq_len
        assert label.size(0) == self.seq_len
        
        return {
            'encoder_input': encoder_input,
            'decoder_input': decoder_input, 
            'encoder_mask': (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),
            'decoder_mask': (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int() & casual_mask(decoder_input.size(0)), 
            'label': label,
            'src_text': src_text,
            'tgt_text': tgt_text
        }

In [20]:
# Define function to obtain the most probable next token
def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
    # Retrieving the indices from the start and end of sequences of the target tokens
    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
    eos_idx = tokenizer_tgt.token_to_id('[EOS]')
    
    # Computing the output of the encoder for the source sequence
    encoder_output = model.encode(source, source_mask)
    # Initializing the decoder input with the Start of Sentence token
    decoder_input = torch.empty(1,1).fill_(sos_idx).type_as(source).to(device)
    
    # Looping until the 'max_len', maximum length, is reached
    while True:
        if decoder_input.size(1) == max_len:
            break
            
        # Building a mask for the decoder input
        decoder_mask = casual_mask(decoder_input.size(1)).type_as(source_mask).to(device)
        
        # Calculating the output of the decoder
        out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)
        
        # Applying the projection layer to get the probabilities for the next token
        prob = model.project(out[:, -1])
        
        # Selecting token with the highest probability
        _, next_word = torch.max(prob, dim=1)
        decoder_input = torch.cat([decoder_input, torch.empty(1,1). type_as(source).fill_(next_word.item()).to(device)], dim=1)
        
        # If the next token is an End of Sentence token, we finish the loop
        if next_word == eos_idx:
            break
            
    return decoder_input.squeeze(0) # Sequence of tokens generated by the decoder

In [21]:
# Defining function to evaluate the model on the validation dataset
# num_examples = 2, two examples per run
def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_state, writer, num_examples=2):
    model.eval() # Setting model to evaluation mode
    count = 0 # Initializing counter to keep track of how many examples have been processed
    
    console_width = 80 # Fixed witdh for printed messages
    
    # Creating evaluation loop
    with torch.no_grad(): # Ensuring that no gradients are computed during this process
        for batch in validation_ds:
            count += 1
            encoder_input = batch['encoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)
            
            # Ensuring that the batch_size of the validation set is 1
            assert encoder_input.size(0) ==  1, 'Batch size must be 1 for validation.'
            
            # Applying the 'greedy_decode' function to get the model's output for the source text of the input batch
            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)
            
            # Retrieving source and target texts from the batch
            source_text = batch['src_text'][0]
            target_text = batch['tgt_text'][0] # True translation 
            model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy()) # Decoded, human-readable model output
            
            # Printing results
            print_msg('-'*console_width)
            print_msg(f'SOURCE: {source_text}')
            print_msg(f'TARGET: {target_text}')
            print_msg(f'PREDICTED: {model_out_text}')
            
            # After two examples, we break the loop
            if count == num_examples:
                break

In [22]:
# We pass as parameters the config dictionary, the length of the vocabylary of the source language and the target language
def get_model(config, vocab_src_len, vocab_tgt_len):
    
    # Loading model using the 'build_transformer' function.
    # We will use the lengths of the source language and target language vocabularies, the 'seq_len', and the dimensionality of the embeddings
    model = build_transformer(vocab_src_len, vocab_tgt_len, config['seq_len'], config['seq_len'], config['d_model'])
    return model

In [23]:
# Define settings for building and training the transformer model
def get_config():
    return{
        'batch_size': 16,
        'num_epochs': 20,
        'lr': 10**-4,
        'seq_len': 350,
        'd_model': 512, # Dimensions of the embeddings in the Transformer. 512 like in the "Attention Is All You Need" paper.
        'lang_src': 'en',
        'lang_tgt': 'it',
        'model_folder': 'weights',
        'model_basename': 'tmodel_',
        'preload': None,
        'tokenizer_file': 'tokenizer_{0}.json',
        'experiment_name': 'runs/tmodel'
    }
    

# Function to construct the path for saving and retrieving model weights
def get_weights_file_path(config, epoch: str):
    model_folder = config['model_folder'] # Extracting model folder from the config
    model_basename = config['model_basename'] # Extracting the base name for model files
    model_filename = f"{model_basename}{epoch}.pt" # Building filename
    return str(Path('.')/ model_folder/ model_filename) # Combining current directory, the model folder, and the model filename

In [24]:
def train_model(config):
    # Setting up device to run on GPU to train faster
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device {device}")
    
    # Creating model directory to store weights
    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)
    
    # Retrieving dataloaders and tokenizers for source and target languages using the 'get_ds' function
    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
    
    # Initializing model on the GPU using the 'get_model' function
    model = get_model(config,tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
    
    # Tensorboard
    writer = SummaryWriter(config['experiment_name'])
    
    # Setting up the Adam optimizer with the specified learning rate from the '
    # config' dictionary plus an epsilon value
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps = 1e-9)
    
    # Initializing epoch and global step variables
    initial_epoch = 0
    global_step = 0
    
    # Checking if there is a pre-trained model to load
    # If true, loads it
    if config['preload']:
        model_filename = get_weights_file_path(config, config['preload'])
        print(f'Preloading model {model_filename}')
        state = torch.load(model_filename) # Loading model
        
        # Sets epoch to the saved in the state plus one, to resume from where it stopped
        initial_epoch = state['epoch'] + 1
        # Loading the optimizer state from the saved model
        optimizer.load_state_dict(state['optimizer_state_dict'])
        # Loading the global step state from the saved model
        global_step = state['global_step']
        
    # Initializing CrossEntropyLoss function for training
    # We ignore padding tokens when computing loss, as they are not relevant for the learning process
    # We also apply label_smoothing to prevent overfitting
    loss_fn = nn.CrossEntropyLoss(ignore_index = tokenizer_src.token_to_id('[PAD]'), label_smoothing = 0.1).to(device)
    
    # Initializing training loop 
    
    # Iterating over each epoch from the 'initial_epoch' variable up to
    # the number of epochs informed in the config
    for epoch in range(initial_epoch, config['num_epochs']):
        
        # Initializing an iterator over the training dataloader
        # We also use tqdm to display a progress bar
        batch_iterator = tqdm(train_dataloader, desc = f'Processing epoch {epoch:02d}')
        
        # For each batch...
        for batch in batch_iterator:
            model.train() # Train the model
            
            # Loading input data and masks onto the GPU
            encoder_input = batch['encoder_input'].to(device)
            decoder_input = batch['decoder_input'].to(device)
            encoder_mask = batch['encoder_mask'].to(device)
            decoder_mask = batch['decoder_mask'].to(device)
            
            # Running tensors through the Transformer
            encoder_output = model.encode(encoder_input, encoder_mask)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
            proj_output = model.project(decoder_output)
            
            # Loading the target labels onto the GPU
            label = batch['label'].to(device)
            
            # Computing loss between model's output and true labels
            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
            
            # Updating progress bar
            batch_iterator.set_postfix({f"loss": f"{loss.item():6.3f}"})
            
            writer.add_scalar('train loss', loss.item(), global_step)
            writer.flush()
            
            # Performing backpropagation
            loss.backward()
            
            # Updating parameters based on the gradients
            optimizer.step()
            
            # Clearing the gradients to prepare for the next batch
            optimizer.zero_grad()
            
            global_step += 1 # Updating global step count
            
        # We run the 'run_validation' function at the end of each epoch
        # to evaluate model performance
        run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: batch_iterator.write(msg), global_step, writer)
         
        # Saving model
        model_filename = get_weights_file_path(config, f'{epoch:02d}')
        # Writting current model state to the 'model_filename'
        torch.save({
            'epoch': epoch, # Current epoch
            'model_state_dict': model.state_dict(),# Current model state
            'optimizer_state_dict': optimizer.state_dict(), # Current optimizer state
            'global_step': global_step # Current global step 
        }, model_filename)

In [25]:
config = get_config() # Retrieving config settings

In [26]:
train_model(config) # Training model with the config arguments

Using device cuda


Found cached dataset opus_books (/home/wallabot/.cache/huggingface/datasets/opus_books/en-it/1.0.0/e8f950a4f32dc39b7f9088908216cd2d7e21ac35f893d04d39eb594746af2daf)


Max length of source sentence: 309
Max length of target sentence: 274


Processing epoch 00: 100%|██████████| 1819/1819 [18:43<00:00,  1.62it/s, loss=6.428]


--------------------------------------------------------------------------------
SOURCE: But for that...'
TARGET: Ma per questo...
PREDICTED: — Sì .
--------------------------------------------------------------------------------
SOURCE: She was standing among that group, very erect as usual, and was talking to the master of the house with her head slightly turned toward him, when Kitty approached.
TARGET: Stava in piedi, tenendosi, come sempre, straordinariamente diritta e quando Kitty si avvicinò al gruppo, parlava col padrone di casa volgendo lieve il capo verso di lui.
PREDICTED: — Ma , , , — disse , — disse , — disse , — disse , — disse , e e e e non .


Processing epoch 01: 100%|██████████| 1819/1819 [18:09<00:00,  1.67it/s, loss=5.532]


--------------------------------------------------------------------------------
SOURCE: But, as I was saying: sitting in that window-seat, do you think of nothing but your future school?
TARGET: Ma quando state alla finestra, pensate forse alla vostra futura scuola?
PREDICTED: E , non era la sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua sua .
--------------------------------------------------------------------------------
SOURCE: '- come to the following: adultery of husband or wife and the detection of the guilty party by mutual consent, or involuntary detection without such consent.
TARGET: — Si riducono ai seguenti: adulterio di uno dei coniugi e prova della colpevolezza di una delle parti risultante da reciproco accordo, oppure, fuori di un tale accor

Processing epoch 02: 100%|██████████| 1819/1819 [18:52<00:00,  1.61it/s, loss=6.195]


--------------------------------------------------------------------------------
SOURCE: Whatever I do with its cage, I cannot get at it--the savage, beautiful creature!
TARGET: Anche se m'impadronissi della gabbia, non potrei trattenere il bell'uccello selvaggio.
PREDICTED: Non non ho detto , ma non mi , e non mi .
--------------------------------------------------------------------------------
SOURCE: Thus it happens that whenever those who are hostile have the opportunity to attack they do it like partisans, whilst the others defend lukewarmly, in such wise that the prince is endangered along with them.
TARGET: Donde nasce che qualunque volta quelli che sono nimici hanno occasione di assaltare, lo fanno partigianamente, e quelli altri defendano tepidamente; in modo che insieme con loro si periclita.
PREDICTED: Non sono , non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non non 

Processing epoch 03: 100%|██████████| 1819/1819 [17:50<00:00,  1.70it/s, loss=5.916]


--------------------------------------------------------------------------------
SOURCE: 'Yes, I am under the painful necessity of applying for a divorce,' he continued.
TARGET: — Sì, sono stato messo nella penosa necessità di esigere il divorzio — egli disse.
PREDICTED: — Non è , ma non è mai nulla , ma non è mai nulla .
--------------------------------------------------------------------------------
SOURCE: 'She particularly wished me to go and see her,' continued Anna. 'I shall be glad to see the old lady again, and will go to-morrow.
TARGET: — Mi ha pregato tanto di andare da lei — continuò Anna — e io sono contenta di vedere quella vecchietta, e domani ci andrò.
PREDICTED: — Non è , ma non è mai mai mai , ma non è mai nulla , ma non è mai un ’ altra , e non è mai nulla .


Processing epoch 04: 100%|██████████| 1819/1819 [17:04<00:00,  1.77it/s, loss=5.916]


--------------------------------------------------------------------------------
SOURCE: Do you accept my solution of the mystery?"
TARGET: Accettate la mia spiegazione?
PREDICTED: E non vi , ma non vi ?
--------------------------------------------------------------------------------
SOURCE: And interlacing his fingers, palms downwards, he stretched them and the joints cracked.
TARGET: E incrociate le dita le une nelle altre, con le palme all’ingiù, Aleksej Aleksandrovic le stiracchiò e le dita scricchiolarono nelle giunture.
PREDICTED: E , dopo la sua cosa , e , e , si , e si .


Processing epoch 05: 100%|██████████| 1819/1819 [17:04<00:00,  1.78it/s, loss=6.033]


--------------------------------------------------------------------------------
SOURCE: 'But one can't talk to Kitty about it!
TARGET: — Con Kitty non se ne può parlare!
PREDICTED: — Ma non è nulla ? — chiese .
--------------------------------------------------------------------------------
SOURCE: 'No.'
TARGET: — No.
PREDICTED: — Sì , signore .


Processing epoch 06: 100%|██████████| 1819/1819 [17:04<00:00,  1.78it/s, loss=5.352]


--------------------------------------------------------------------------------
SOURCE: But when I had done this, I was unable to stir it up again, or to get under it, much less to move it forward towards the water; so I was forced to give it over; and yet, though I gave over the hopes of the boat, my desire to venture over for the main increased, rather than decreased, as the means for it seemed impossible.
TARGET: Ma raggiunta questa meta, mi trovai nuovamente inabile a moverla, a mettermici sotto, tanto più poi a spingerla in acqua, onde finalmente fui costretto ad abbandonar la mia impresa. Pure, anche perdute tutte le speranze ch’io avea riposte nella scialuppa, cresceva in me, anzichè diminuire, il desiderio d’avventurarmi verso la terra comparsami innanzi, e crescea con tanta maggior forza quanto più impossibile ne apparivano i mezzi.
PREDICTED: Ma io mi a me , e a me , e a a , e a , e a a , e a , e , a , che non poteva essere più più più , e che non poteva , e non mi , e non m

Processing epoch 07: 100%|██████████| 1819/1819 [17:04<00:00,  1.77it/s, loss=5.374]


--------------------------------------------------------------------------------
SOURCE: Venus had risen above the branch and the car of the Great Bear as well as its shafts showed clearly against the dark blue sky, but he still waited.
TARGET: — Non è ora? — chiese Stepan Arkad’ic.
PREDICTED: Il signor Rochester era un po ’ di riposo , e , dopo aver fatto un ’ altra volta , che si , e si .
--------------------------------------------------------------------------------
SOURCE: Koznyshev when he got home and went again over all his reasons, came to the conclusion that at first he had judged wrongly.
TARGET: Ritornando a casa ed esaminando tutti gli argomenti, Sergej Ivanovic scoprì che non aveva ragionato in modo giusto.
PREDICTED: Levin , che aveva detto , era stato più più , e non poteva dire che non poteva fare .


Processing epoch 08: 100%|██████████| 1819/1819 [17:04<00:00,  1.77it/s, loss=5.310]


--------------------------------------------------------------------------------
SOURCE: At the station of a big town the Volunteers were again greeted with songs and cheers; again women and men turned up with collecting-boxes, the provincial ladies presented nosegays and accompanied the Volunteers to the refreshment-bar; but all this was far feebler and weaker than in Moscow.
TARGET: A una grande stazione di una città, di nuovo canti e grida accolsero i volontari, apparvero di nuovo raccoglitrici e raccoglitori di offerte con le cassette, e le signore del capoluogo del governatorato offrirono fasci di fiori ai volontari e li seguirono al ristorante; ma tutto questo era in tono molto più debole e in proporzioni minori che non a Mosca.
PREDICTED: Dopo un ’ altra volta , che era stato stato stato stato , e , in un momento , in cui , in quel momento , si era stato un ’ altra volta , e , come , per un attimo , si era un ’ altra volta , e che non si era ancora più più più più più più più pi

Processing epoch 09: 100%|██████████| 1819/1819 [17:04<00:00,  1.78it/s, loss=5.501]


--------------------------------------------------------------------------------
SOURCE: The boulevard and children.
TARGET: Il viale e i bambini.
PREDICTED: Il signor Rochester era molto .
--------------------------------------------------------------------------------
SOURCE: So Alice began telling them her adventures from the time when she first saw the White Rabbit. She was a little nervous about it just at first, the two creatures got so close to her, one on each side, and opened their eyes and mouths so very wide, but she gained courage as she went on.
TARGET: Così Alice cominciò a raccontare i suoi casi, dal momento dell'incontro col Coniglio bianco; ma tosto si cominciò a sentire un po' a disagio, chè le due bestie le si stringevano da un lato e l'altro, spalancando gli occhi e le bocche; ma la bambina poco dopo riprese coraggio.
PREDICTED: Il signor Rochester si alzò e si mise a guardare il suo volto , e poi , dopo aver guardato la porta , si alzò a guardare il cappello , e po

Processing epoch 10: 100%|██████████| 1819/1819 [17:04<00:00,  1.78it/s, loss=4.592]


--------------------------------------------------------------------------------
SOURCE: Anna, who had forgotten her agitation while she was working, stood at a table in the sitting-room packing her hand-bag when Annushka drew her attention to the noise of approaching carriage wheels.
TARGET: Anna, che per il lavoro dei preparativi aveva abbandonato l’interna agitazione, approntava la sua sacca da viaggio stando in piedi accanto alla tavola nello studio, quando Annuška ne attirò l’attenzione verso un rumore di vettura che s’avvicinava.
PREDICTED: Anna , che aveva visto il viso , si alzò , e , il cappello , si alzò , si alzò , e , , si mise a guardare il cappello , si alzò a guardare il cappello .
--------------------------------------------------------------------------------
SOURCE: No: stillness returned: each murmur and movement ceased gradually, and in about an hour Thornfield Hall was again as hushed as a desert. It seemed that sleep and night had resumed their empire.
TARGET: La 

Processing epoch 11: 100%|██████████| 1819/1819 [17:04<00:00,  1.78it/s, loss=5.435]


--------------------------------------------------------------------------------
SOURCE: Turning now to the opposite characters of Commodus, Severus, Antoninus Caracalla, and Maximinus, you will find them all cruel and rapacious-men who, to satisfy their soldiers, did not hesitate to commit every kind of iniquity against the people; and all, except Severus, came to a bad end; but in Severus there was so much valour that, keeping the soldiers friendly, although the people were oppressed by him, he reigned successfully; for his valour made him so much admired in the sight of the soldiers and people that the latter were kept in a way astonished and awed and the former respectful and satisfied.
TARGET: Discorrendo ora, per opposito, le qualità di Commodo, di Severo, Antonino Caracalla e Massimino, li troverrete crudelissimi e rapacissimi; li quali, per satisfare a' soldati, non perdonorono ad alcuna qualità di iniuria che ne' populi si potessi commettere; e tutti, eccetto Severo, ebbono tr

Processing epoch 12: 100%|██████████| 1819/1819 [17:04<00:00,  1.78it/s, loss=5.126]


--------------------------------------------------------------------------------
SOURCE: Ryabinin looked at it and smiled, shaking his head.
TARGET: Rjabinin, guardando verso questa, scosse la testa con un sorriso.
PREDICTED: La porta e la baciò con la mano .
--------------------------------------------------------------------------------
SOURCE: At this point, Mrs. Poppets knocked at the door to know if we were ready for supper.
TARGET: A questo punto picchiò all’uscio la signora Poppets per sapere se non volessimo andare a cena.
PREDICTED: E , per quanto ci , si , si a guardare .


Processing epoch 13: 100%|██████████| 1819/1819 [17:04<00:00,  1.78it/s, loss=4.268]


--------------------------------------------------------------------------------
SOURCE: Quickly!'
TARGET: Presto.
PREDICTED: !
--------------------------------------------------------------------------------
SOURCE: Probably at Oblonsky's indication, he looked round to where Koznyshev and the Princess were standing and silently raised his hat.
TARGET: Probabilmente, per indicazione di Oblonskij, egli si voltò a guardare dalla parte dove stavano la principessa e Sergej Ivanovic, e sollevò il cappello in silenzio.
PREDICTED: E Stepan Arkad ’ ic , che aveva visto il tè , si alzò e si alzò e si mise a guardare il capo .


Processing epoch 14: 100%|██████████| 1819/1819 [17:04<00:00,  1.78it/s, loss=4.808]


--------------------------------------------------------------------------------
SOURCE: THOUGH VRONSKY'S WHOLE INNER LIFE WAS ABSORBED by his passion, his external life ran unalterably and inevitably along its former customary rails of social and regimental connections and interests.
TARGET: Sebbene la vita intima di Vronskij fosse tutta piena della sua passione, la sua vita esteriore si svolgeva immutata e immutabile sull’abituale carreggiata di prima, tra i rapporti e gli interessi del gran mondo e del reggimento.
PREDICTED: La sua relazione era stata stata stata stata stata stata stata stata eccitata , e la sua attività di famiglia , la , la e la di tutti i suoi affari .
--------------------------------------------------------------------------------
SOURCE: Well, what's to be done!
TARGET: Ma che fare!
PREDICTED: E che cosa è accaduto !


Processing epoch 15: 100%|██████████| 1819/1819 [17:04<00:00,  1.78it/s, loss=4.664]


--------------------------------------------------------------------------------
SOURCE: "They have no mother?"
TARGET: — Non hanno madre?
PREDICTED: — È vero ?
--------------------------------------------------------------------------------
SOURCE: An austere patriot's passion for his fatherland!
TARGET: Passione austera di un patriotta per la patria!
PREDICTED: Il signor Rochester , che ha dato la sua opinione .


Processing epoch 16: 100%|██████████| 1819/1819 [17:02<00:00,  1.78it/s, loss=4.662]


--------------------------------------------------------------------------------
SOURCE: I am singled out and separated, But I am singled out, too, from as it were, from all the world, all the ship’s crew, to be spared to be miserable. from death; and He that miraculously saved me from death can deliver me from this condition.
TARGET: Io solo forse tra tutti gli uomini sono stato scelto a menare una vita di una miseria senza pari. Ma così solo io sono stato scelto tra tutti quelli del naviglio a scampare dalla morte; e colui che dalla morte mi ha salvato in modo di miracolo mi può liberare dallo stato in cui sono.
PREDICTED: Io sono felice e , e non mi più di quanto fosse stato , come se fossi stato , se non mi fossi reso un ’ altra volta , e che non avrei avuto altro , e che non avrei potuto .
--------------------------------------------------------------------------------
SOURCE: 'What? From Nilsson?' asked Betsy, quite horrified, though she could not have distinguished Nilsson's voi

Processing epoch 17: 100%|██████████| 1819/1819 [17:02<00:00,  1.78it/s, loss=4.121]


--------------------------------------------------------------------------------
SOURCE: "These were vile discoveries; but except for the treachery of concealment, I should have made them no subject of reproach to my wife, even when I found her nature wholly alien to mine, her tastes obnoxious to me, her cast of mind common, low, narrow, and singularly incapable of being led to anything higher, expanded to anything larger--
TARGET: — Furono scoperte odiose quelle che feci; ero dolente che mi avessero tradito celandomi la verità; ma senza la parte che vi aveva presa mia moglie, non avrei mai pensato a rimproverarle la sventura della sua famiglia, neppure quando mi accorsi che la sua indole era completamente diversa dalla mia, e che i suoi gusti non mi piacevano. Aveva uno spirito volgare, basso, limitato e incapace di capire le cose nobili ed elevate.
PREDICTED: — Allora , — disse , — ma non ho mai veduto il mio carattere , ma non potevo appartenere a me , ma non potevo sopportare il mi

Processing epoch 18: 100%|██████████| 1819/1819 [17:01<00:00,  1.78it/s, loss=4.261]


--------------------------------------------------------------------------------
SOURCE: We passed the bridge, and soon after that I asked if she saw the lock.
TARGET: Passammo il ponte, e subito dopo le chiesi se vedesse la chiusa.
PREDICTED: la strada e feci un salto davanti a me .
--------------------------------------------------------------------------------
SOURCE: Adele flew to the window. I followed, taking care to stand on one side, so that, screened by the curtain, I could see without being seen.
TARGET: Andai io pure alla finestra nascondendomi dietro le tende per vedere senza esser veduta.
PREDICTED: Bessie si sedeva davanti a me , e io fui alzata , perché udii colazione , mi alzai e mi figuro che ero entrato .


Processing epoch 19: 100%|██████████| 1819/1819 [17:01<00:00,  1.78it/s, loss=4.439]


--------------------------------------------------------------------------------
SOURCE: "Formerly," I answered, "because you did not love me; now, I reply, because you almost hate me.
TARGET: — Altra volta — dissi, — era perché non mi amavate; ora è perché mi aborrite quasi.
PREDICTED: — No , — risposi , — non vi ho mai pensato di voi , ma non voglio sopportare la mia volontà .
--------------------------------------------------------------------------------
SOURCE: The water revived his father more than all the rum or spirits I had given him, for he was fainting with thirst.
TARGET: Quando mi fu vicino mi consegnò frettolosamente queste, e recò l’acqua a suo padre; il quale poichè ne ebbe bevuto un poco, ne fu ristorato più che dal rum, che io gli avea donato; che il povero selvaggio moriva della sete.
PREDICTED: La tempesta era grande , ma io , siccome la mia navicella , mi rispose che fosse un grosso mistero .
