In [1]:
import time
import spacy
import datasets
import evaluate
import numpy as np
import torch
from torch import nn, Tensor
from tqdm import tqdm
import torch.nn.functional as F
from collections import Counter
from spacy.language import Language
from typing import List, Dict, Union, Tuple, Optional
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


Build and train a neural translation model based on an encoder-decoder LSTM architecture with attention.
The model is trained on the `Multi30k` German to Englsih translations dataset.

### Data processing

In [2]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    # for Apple chips
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(device)

mps


In [3]:
dataset = datasets.load_dataset("bentrevett/multi30k")

In [4]:
train_data, valid_data, test_data = (
    dataset["train"],
    dataset["validation"],
    dataset["test"],
)

In [5]:
train_data[0]

{'en': 'Two young, White males are outside near many bushes.',
 'de': 'Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.'}

In [6]:
# !python -m spacy download en_core_web_sm
# !python -m spacy download de_core_news_sm

In [7]:
en_nlp = spacy.load("en_core_web_sm")
de_nlp = spacy.load("de_core_news_sm")

In [9]:
string = "We're gonna go swimming"

[token.text for token in en_nlp.tokenizer(string)]

['We', "'re", 'gon', 'na', 'go', 'swimming']

In [10]:
def tokenize_example(example: Dict[str, str],
                     en_nlp: Language,
                     de_nlp: Language,
                     max_length: int,
                     lower: bool,
                     eos_token: str):
    en_tokens = [token.text for token in en_nlp.tokenizer(example["en"])][:max_length]
    de_tokens = [token.text for token in de_nlp.tokenizer(example["de"])][:max_length]

    if lower:
        en_tokens = [token.lower() for token in en_tokens]
        de_tokens = [token.lower() for token in de_tokens]

    en_tokens = en_tokens + [eos_token]
    de_tokens = de_tokens + [eos_token]

    return {"en_tokens": en_tokens, "de_tokens": de_tokens}

max_length = 1000
lower = True
sos_token = "<BOS>"
eos_token = "<EOS>"

fn_kwargs = {
    "en_nlp": en_nlp,
    "de_nlp": de_nlp,
    "max_length": max_length,
    "lower": lower,
    "eos_token": eos_token,
}

train_data = train_data.map(tokenize_example, fn_kwargs=fn_kwargs)
valid_data = valid_data.map(tokenize_example, fn_kwargs=fn_kwargs)
test_data = test_data.map(tokenize_example, fn_kwargs=fn_kwargs)

Map: 100%|██████████| 29000/29000 [00:01<00:00, 16602.83 examples/s]
Map: 100%|██████████| 1014/1014 [00:00<00:00, 16363.58 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 16885.96 examples/s]


We build our own `Language` object that will adapt to a text corpus and store vocabulary.

In [11]:
class Language:
    def __init__(self, name: str = None, min_count: int = 2):
        self.name = name
        self._min_count = min_count
        
        self.index_to_word = {0: "<BOS>", 1: "<PAD>", 2: "<EOS>", 3: "<UNK>"}
        self.word_counts = Counter(self.index_to_word.values())
        
        self._last_index = list(self.index_to_word.keys())[-1]  # 3
        self._max_seq_length = 0

    def __repr__(self):
        return f"Language({self.name})"

    @property
    def max_seq_length(self) -> int:
        return self._max_seq_length

    @property
    def word_to_index(self) -> Dict[str, int]:
        return {word: index for index, word in self.index_to_word.items()}
    
    @property
    def words(self) -> List[str]:
        return list(self.index_to_word.values())
    
    @property
    def num_words(self) -> int:
        return len(self.words)
    
    def get_word(self, idx: int) -> str:
        return self.index_to_word.get(idx, "<UNK>")
    
    def get_idx(self, word: str) -> int:
        return self.word_to_index.get(word, self.word_to_index["<UNK>"])
    
    def string(self, idxs: Union[List[int], Tensor]) -> str:
        if isinstance(idxs, Tensor):
            idxs = idxs.cpu().detach().numpy().squeeze()
        
        return " ".join([self.get_word(idx) for idx in idxs])
    
    def indices(self, sentence: Union[str, List[str]]) -> List[int]:
        if isinstance(sentence, str):
            sentence = sentence.split()
        
        return [self.get_idx(word) for word in sentence]
    
    def add_word(self, word: str) -> None:
        if word not in self.words:
            self.word_counts.update([word])

            if self.word_counts[word] > self._min_count:
                # only update frequent words
                self.index_to_word[self._last_index + 1] = word
                self._last_index += 1
    
    def add_sentence(self, sentence: List[str]) -> None:
        seq_length = len(sentence)
        if seq_length > self._max_seq_length:
            self._max_seq_length = seq_length
        
        for word in sentence:
            self.add_word(word)

In [12]:
train_data.num_rows

29000

In [13]:
min_word_counts = 2
tgt_lang = Language("English", min_count=min_word_counts)
src_lang = Language("German", min_count=min_word_counts)

NUM_EXAMPLES = train_data.num_rows

# build vocabs
for example in train_data.take(NUM_EXAMPLES):
    src_lang.add_sentence(example["de_tokens"])
    tgt_lang.add_sentence(example["en_tokens"])

print(f"Max seq length src: {src_lang.max_seq_length}")
print(f"Max seq length tgt: {tgt_lang.max_seq_length}")

Max seq length src: 45
Max seq length tgt: 42


In [14]:
print(f"Vocab size src: {src_lang.num_words}")
print(f"Vocab size tgt: {tgt_lang.num_words}")

Vocab size src: 5374
Vocab size tgt: 4556


In [15]:
# def numericalize_example(example, tgt_lang: Language, src_lang: Language):
#     en_ids = tgt_lang.indices(example["en_tokens"])
#     de_ids = src_lang.indices(example["de_tokens"])
#     return {"en_ids": en_ids, "de_ids": de_ids}

# fn_kwargs = {"tgt_lang": tgt_lang, "src_lang": src_lang}

# train_data = train_data.take(NUM_EXAMPLES).map(numericalize_example, fn_kwargs=fn_kwargs)
# valid_data = valid_data.map(numericalize_example, fn_kwargs=fn_kwargs)
# test_data = test_data.map(numericalize_example, fn_kwargs=fn_kwargs)

# train_data.save_to_disk("train_data_de_en.dataset")
# valid_data.save_to_disk("valid_data_de_en.dataset")
# test_data.save_to_disk("test_data_de_en.dataset")

# either load from disk or uncomment code above and run
train_data = datasets.load_from_disk(f"datasets/train_data_de_en.dataset")
valid_data = datasets.load_from_disk(f"datasets/valid_data_de_en.dataset")
test_data = datasets.load_from_disk(f"datasets/test_data_de_en.dataset")

In [16]:
print(src_lang.string(train_data[2]["de_ids"]))
print(tgt_lang.string(train_data[2]["en_ids"]))

ein kleines mädchen klettert in ein <UNK> aus holz . <EOS>
a little girl climbing into a wooden playhouse . <EOS>


In [17]:
data_type = "torch"
format_columns = ["en_ids", "de_ids"]

train_data = train_data.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

valid_data = valid_data.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

test_data = test_data.with_format(
    type=data_type, columns=format_columns, output_all_columns=True
)

In [18]:
def get_collate_fn(pad_index):
    def collate_fn(batch):
        batch_en_ids = [example["en_ids"] for example in batch]
        batch_de_ids = [example["de_ids"] for example in batch]
        batch_en_ids = nn.utils.rnn.pad_sequence(batch_en_ids, padding_value=pad_index)
        batch_de_ids = nn.utils.rnn.pad_sequence(batch_de_ids, padding_value=pad_index)
        batch = {
            "en_ids": batch_en_ids,
            "de_ids": batch_de_ids,
        }
        return batch

    return collate_fn

def get_data_loader(dataset, batch_size, pad_index, shuffle=False, pin_memory=True):
    collate_fn = get_collate_fn(pad_index)
    data_loader = torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=shuffle,
        pin_memory=pin_memory
    )
    return data_loader

batch_size = 128

train_data_loader = get_data_loader(train_data,
                                    batch_size,
                                    tgt_lang.word_to_index["<PAD>"],
                                    shuffle=True)

valid_data_loader = get_data_loader(valid_data, batch_size, tgt_lang.word_to_index["<PAD>"])
test_data_loader = get_data_loader(test_data, batch_size, tgt_lang.word_to_index["<PAD>"])

### Model

In [19]:
class Encoder(nn.Module):
    def __init__(self, embed_dim: int,
                 hidden_dim: int,
                 src_vocab_size: int,
                 num_layers: int = 1,
                 dropout: float = 0.5):
        super().__init__()
        self._num_layers = num_layers
        self._hidden_dim = hidden_dim

        self.embedding = nn.Embedding(src_vocab_size, embed_dim)
        self.rnn = nn.LSTM(embed_dim, hidden_dim, bidirectional=False, num_layers=num_layers)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, input: torch.Tensor):
        """
        Inputs
            input of size (batch_size, src_seq_length)
        Outputs
            output of size (batch_size, src_seq_length, hidden_dim * 1)
            hidden of size (batch_size, 1, (hidden_dim * 1))
            cell of size (batch_size, 1, (hidden_dim * 1))
        """
        # x.shape (batch_size, seq_length, embed_dim)
        x = self.embedding(input)
        x = self.dropout(x)
        
        # LSTM expects batch as second dimension
        # x.shape (seq_length, batch_size, embed_dim)
        x = x.transpose(1, 0)

        # output.shape (seq_length, batch_size, hidden_dim * 1)
        # hidden.shape (num_layers, batch_size, hidden_dim) [2 * num_layers if bi-directional LSTM]
        # cell.shape (num_layers, batch_size, hidden_dim) [2 * num_layers if bi-directional LSTM]
        output, (hidden, cell) = self.rnn(x)
        
        # output.shape (batch_size, seq_length, hidden_dim * 1)
        output = output.transpose(1, 0)

        # hidden.shape (num_layers, batch_size, hidden_dim * 1)
        # hidden = hidden.reshape(self._num_layers, batch_size, self._hidden_dim * 1)

        return output, hidden, cell

In [20]:
class Attention(nn.Module):
    def __init__(self, enc_hidden_dim: int, dec_hidden_dim: int):
        super().__init__()

        assert enc_hidden_dim == dec_hidden_dim, "Encoder and decoder must have the same hidden dim"

        self.fc_1 = nn.Linear(in_features=enc_hidden_dim + dec_hidden_dim, out_features=dec_hidden_dim)
        self.fc_2 = nn.Linear(in_features=dec_hidden_dim, out_features=1, bias=False)
    
    def forward(self, hidden: Tensor, encoder_outputs: Tensor):
        """
        Inputs
            hidden: (num_layers, batch_size, dec_hidden_dim) [query]
            encoder_ouputs: (batch_size, seq_length, enc_hidden_dim) [keys, values]
        Outputs
            weighted attention (batch_size, 1, dec_hidden_dim)
        """
        seq_lenght = encoder_outputs.size(1)

        # hidden.shape (batch_size, num_layers, dec_hidden_dim)
        hidden = hidden.transpose(1, 0)
        # hidden.shape (batch_size, 1, dec_hidden_dim)
        hidden = hidden.sum(axis=1).unsqueeze(1)
        # hidden.shape (batch_size, seq_lenght, dec_hidden_dim)
        hidden = hidden.repeat(1, seq_lenght, 1)

        # input.shape (batch_size, seq_lenght, dec_hidden_dim + enc_hidden_dim)
        input = torch.cat([hidden, encoder_outputs], axis=2)

        # energy.shape (batch_size, seq_length, dec_hidden_dim)
        energy = self.fc_1(input).relu()

        # attention.shape (batch_size, seq_length, 1)
        attention = self.fc_2(energy)
        # attention.shape (batch_size, 1, seq_length)
        attention = attention.transpose(2, 1).softmax(axis=2)

        # weighted.shape (batch_size, 1, enc_hidden_dim)
        weighted = torch.bmm(attention, encoder_outputs)

        return weighted

In [21]:
class Decoder(nn.Module):
    def __init__(self,
                 embed_dim: int,
                 hidden_dim: int,
                 tgt_vocab_size: int,
                 max_seq_legth: int = 50,
                 num_layers: int = 1,
                 dropout: float = 0.5):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.max_seq_legth = max_seq_legth

        self.embedding = nn.Embedding(tgt_vocab_size, embed_dim)
        self.rnn = nn.LSTM(input_size=embed_dim + hidden_dim, hidden_size=hidden_dim, num_layers=num_layers)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.attention = Attention(enc_hidden_dim=hidden_dim, dec_hidden_dim=hidden_dim)
        # self.attention = ScaledDotProductAttention(hidden_dim=hidden_dim, seq_length=50)
    
    def forward(self,
                input: Tensor,
                encoder_outputs: Tensor,
                encoder_hidden: Tensor,
                encoder_cell: Tensor,
                tgt: Tensor,
                teacher_forcing: float = 0.5) -> Tensor:
        """
        Inputs
            input (batch_size, 1)
            encoder_outputs (batch_size, seq_length, hidden_dim * 1)
            encoder_hidden state (num_layers, batch_size, hidden_dim)
            encoder_cell state (num_layers, batch_size, hidden_dim)
            tgt of shape (batch_size, tgt_seq_length)
            teacher_forcing: teacher forcing ratio
        Outputs
            log-probabilities of shape (batch_size, tgt_seq_length, tgt_vocab_size)
        """
        decoder_outputs = []  # type: List[torch.Tensor]

        hidden = encoder_hidden
        cell = encoder_cell

        # during training we generate same number of words as target length
        # during inference (tgt=None) we will generate max_seq_legth words
        target_length = tgt.size(1) if tgt is not None else self.max_seq_legth

        for t in range(target_length):
            # decoder_output.shape (1, batch_size, tgt_vocab_size)
            # hidden.shape (num_layers, batch_size, hidden_dim)
            # cell.shape (num_layers, batch_size, hidden_dim)
            decoder_output, hidden, cell = self._one_forward_step(input, encoder_outputs, hidden, cell)
            
            # will be shape (tgt_seq_length, 1, batch_size, tgt_vocab_size)
            decoder_outputs.append(decoder_output)

            # teacher forcing
            if (tgt is not None) and (np.random.uniform() < teacher_forcing):
                # decoder_input.shape (batch_size, 1)
                input = tgt[:, t].unsqueeze(1)
            else:
                # use decoder's own predictions
                # pred.shape (batch_size, 1, tgt_vocab_size)
                pred = decoder_output.log_softmax(dim=-1).transpose(1, 0)
                _, input = pred.max(dim=-1)
                # _, pred_top2 = pred.topk(2, axis=-1)
                
             
        
        # decoder_outputs.shape (tgt_seq_length, 1, batch_size, hidden_dim)
        decoder_outputs = torch.stack(decoder_outputs, dim=0)
        # decoder_outputs.shape (tgt_seq_length, batch_size, hidden_dim)
        decoder_outputs = decoder_outputs.squeeze(1)

        # decoder_outputs.shape (batch_size, tgt_seq_length, hidden_dim)
        decoder_outputs = decoder_outputs.transpose(1, 0)

        # decoder_outputs.shape (batch_size, tgt_seq_length, tgt_vocab_size)
        # decoder_outputs = self.fc(decoder_outputs)

        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)

        return decoder_outputs

    def _one_forward_step(self,
                          input: Tensor,
                          encoder_outputs: Tensor,
                          hidden: Tensor,
                          cell: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
        # input.shape (batch_size, 1, embed_dim)
        input = self.dropout(self.embedding(input))
        
        # attn_vector.shape (batch_size, 1, hidden_dim)
        attn_vector = self.attention(hidden, encoder_outputs)

        # decoder_rnn_input.shape (batch_size, 1, embed_dim + hidden_dim)
        decoder_rnn_input = torch.cat([input, attn_vector], dim=-1)
        # decoder_rnn_input.shape (1, batch_size, embed_dim + hidden_dim)
        decoder_rnn_input = decoder_rnn_input.transpose(1, 0)

        decoder_rnn_input = F.relu(decoder_rnn_input)

        # decoder_output.shape (1, batch_size, hidden_dim)
        # hidden.shape (num_layers, batch_size, hidden_dim)
        # cell.shape (num_layers, batch_size, hidden_dim)
        decoder_output, (hidden, cell) = self.rnn.forward(decoder_rnn_input, (hidden, cell))
        
        # decoder_output.shape (1, batch_size, tgt_vocat_size)
        decoder_output = self.fc(decoder_output)

        return decoder_output, hidden, cell

In [22]:
class TranslatorModel(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, start_token: Tensor, src: Tensor, tgt: Tensor):
        """
        Inputs
            start_token (batch_size, 1)
            src (batch_size, src_seq_length)
            tgt (batch_size, tgt_seq_length)
        Outputs
            log-probabilities of shape (batch_size, tgt_seq_length, tgt_vocab_size)
        """
        encoder_outputs, encoder_hidden, encoder_cell = self.encoder(src)
        decoder_outputs = self.decoder(start_token, encoder_outputs, encoder_hidden, encoder_cell, tgt)
        return decoder_outputs

### Training

In [23]:
def init_weights(m: nn.Module):
    for name, param in m.named_parameters():
        if "bias" in name:
            nn.init.zeros_(param.data)
        else:
            nn.init.uniform_(param.data, -0.1, 0.1)

def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

loss_function = nn.NLLLoss(ignore_index=tgt_lang.get_idx("<PAD>"))

dropout_ratio = 0.5
hidden_dim = 512
embed_dim = 256
num_layers = 1

encoder = Encoder(embed_dim=embed_dim,
                  hidden_dim=hidden_dim,
                  src_vocab_size=src_lang.num_words,
                  num_layers=num_layers,
                  dropout=dropout_ratio).to(device)

decoder = Decoder(embed_dim=embed_dim,
                  hidden_dim=hidden_dim,
                  tgt_vocab_size=tgt_lang.num_words,
                  max_seq_legth=tgt_lang.max_seq_length + 1,
                  num_layers=num_layers,
                  dropout=dropout_ratio).to(device)

translator = TranslatorModel(encoder, decoder).to(device)

translator.apply(init_weights)
print(f"Model num parameters: {count_parameters(translator):,}")

optimizer = torch.optim.Adam(translator.parameters())

Model num parameters: 9,607,116


In [24]:
def run_batch(model: nn.Module,
              loss_function: nn.NLLLoss,
              batch: Dict[str, Tensor],
              device: torch.device) -> float:
    # src.shape (batch_size, seq_length)
    src = batch["de_ids"].to(device).transpose(1, 0)  # type: Tensor
    tgt = batch["en_ids"].to(device).transpose(1, 0)  # type: Tensor
    
    start_token = torch.zeros((src.size(0), 1), dtype=torch.long).fill_(tgt_lang.word_to_index["<BOS>"]).to(device)

    log_probs = model(start_token, src, tgt)  # type: Tensor
    log_probs = log_probs.reshape(-1, log_probs.size(-1))

    loss = loss_function(log_probs, tgt.reshape(-1).long())

    return loss

def train_one_epoch(model: nn.Module,
                    optimizer: torch.optim.Optimizer,
                    loss_function: nn.NLLLoss,
                    data_loader: torch.utils.data.DataLoader,
                    device: torch.device) -> float:
    model.train()
    
    losses = []
    for batch in tqdm(data_loader):
        optimizer.zero_grad()

        # src = batch["de_ids"].to(device).transpose(1, 0)  # type: torch.Tensor
        # tgt = batch["en_ids"].to(device).transpose(1, 0)  # type: torch.Tensor
        # start_token = torch.zeros((src.size(0), 1), dtype=torch.long).fill_(tgt_lang.word_to_index["<BOS>"]).to(device)

        # outputs = model(start_token, src, tgt)  # type: torch.Tensor

        # probabilities = outputs.reshape(-1, outputs.size(-1))

        # loss = loss_function(probabilities, tgt.reshape(-1).long())
        loss = run_batch(model=model, loss_function=loss_function, batch=batch, device=device)
        losses.append(loss.item())

        loss.backward()

        nn.utils.clip_grad_norm_(translator.parameters(), 1.0)

        optimizer.step()
    
    return sum(losses) / len(losses)

def translate_from_tensor(model: nn.Module, input: Tensor, tgt_lang: Language, tgt: Optional[Tensor]):
    start_token = torch.zeros((1, 1), dtype=torch.long).fill_(tgt_lang.word_to_index["<BOS>"]).to(device)

    # outputs.shape (batch_size, tgt_seq_length, tgt_vocab_size)
    log_probs = model(start_token, input, tgt=tgt)  # type: Tensor

    # pred_top2.shape (batch_size, tgt_seq_length, 2)
    _, pred_top2 = log_probs.topk(2, dim=-1)
    
    # pred_top2.shape (tgt_seq_length, 2)
    pred_top2 = pred_top2.squeeze(0)  # because batch_size=1 here

    # unpack first 2 top predictions
    first_pred, second_pred = pred_top2[:, 0].unsqueeze(1), pred_top2[:, 1].unsqueeze(1)
    
    # in case first top prediction is UNK use second top prediction
    unk_idx = tgt_lang.word_to_index["<UNK>"]
    indices = torch.where(first_pred == unk_idx, second_pred, first_pred)
    
    indices = indices.squeeze().tolist()

    sentence = tgt_lang.string(indices)

    return sentence

def print_sentences(data: datasets.Dataset, idx: int, model: nn.Module, src_lang: Language, tgt_lang: Language, device: torch.device):
    data_eval_src = data[idx]["de_ids"].to(device)
    data_eval_tgt = data[idx]["en_ids"].to(device)
    sentence_src = src_lang.string(data_eval_src)
    sentence_tgt = tgt_lang.string(data_eval_tgt)
    sentence_evaluated = translate_from_tensor(model, data_eval_src.unsqueeze(0), tgt_lang, data_eval_tgt.unsqueeze(0))

    print(f"SOURCE: {sentence_src}")
    print(f"TARGET: {sentence_tgt}")
    print(f"MODEL: {sentence_evaluated}")

def translate_from_string(sentence: str, model: nn.Module, src_lang: Language, tgt_lang: Language, device: torch.device) -> str:
    sentence_idxs = src_lang.indices(sentence)
    sentence_idxs = np.array(sentence_idxs)[np.newaxis, :]
    sentence_idxs = torch.from_numpy(sentence_idxs).to(device)
    translation_raw = translate_from_tensor(model, sentence_idxs, tgt_lang, tgt=None)
    translation_trimmed = translation_raw.split(" <EOS>")[0]
    
    return translation_trimmed

def evaluate_model(model: nn.Module,
                   data_loader: torch.utils.data.DataLoader,
                   loss_function: nn.NLLLoss,
                   device: torch.device):
    model.eval()
    
    losses = []
    with torch.no_grad():
        for _, batch in enumerate(data_loader):
            loss = run_batch(model=model, loss_function=loss_function, batch=batch, device=device)
            losses.append(loss.item())
    
    return sum(losses) / len(losses)

In [25]:
# print untrained model translations
print_sentences(data=train_data, idx=42, model=translator, src_lang=src_lang, tgt_lang=tgt_lang, device=device)

SOURCE: ein mann geht an einem silbernen fahrzeug vorbei . <EOS>
TARGET: a man walks by a silver vehicle . <EOS>
MODEL: stand observing scoops people five kettle grins silk fix


Proceed with training or jump right to `Evaluate` section to load pre-trained model weights.
Training was performed on an Nvidia RTX4090 GPU and took around 6 minutes for 20 epochs.

In [None]:
NUM_EPOCHS = 20

translator.train()

model_name = f"translator_rnn_{num_layers}_layers"
best_val_loss = float("inf")
train_losses, valid_losses = [], []
for epoch in range(NUM_EPOCHS):
    time_start = time.time()
    epoch_loss = train_one_epoch(translator, optimizer,
                                 loss_function, train_data_loader, device)
    
    time_passed_seconds = time.time() - time_start
    
    train_losses.append(epoch_loss)
    
    valid_loss = evaluate_model(model=translator, data_loader=valid_data_loader, loss_function=loss_function, device=device)
    valid_losses.append(valid_loss)
    
    if valid_loss < best_val_loss:
        # save best validaiton loss model
        best_val_loss = valid_loss
        print("Saving model state...")
        torch.save(translator.state_dict(), f"{model_name}_bestval.pt")
    
    # save model
    torch.save(translator.state_dict(), f"{model_name}.pt")

    print(f"Epoch: {epoch + 1}, elapsed: {time_passed_seconds:.0f} sec, train loss: {epoch_loss:.4f}, validation loss: {valid_loss:.4f}")

    if (epoch + 1) % 10 == 0:
        random_eval_idx = int(np.random.choice(list(range(NUM_EXAMPLES))))
        print_sentences(data=train_data, idx=random_eval_idx, model=translator, src_lang=src_lang, tgt_lang=tgt_lang, device=device)
    
    print("-" * 100)

In [None]:
plt.plot(train_losses, color="blue", label="Train")
plt.plot(valid_losses, color="red", label="Validation")
plt.legend()
plt.grid()
plt.title("Average loss per epoch")

### Evaluate

In [26]:
translator.load_state_dict(torch.load(f"models/translator_rnn_{num_layers}_layers_bestval.pt", map_location=device))
translator.eval()

  translator.load_state_dict(torch.load(f"models/translator_rnn_{num_layers}_layers_bestval.pt", map_location=device))


TranslatorModel(
  (encoder): Encoder(
    (embedding): Embedding(5374, 256)
    (rnn): LSTM(256, 512)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(4556, 256)
    (rnn): LSTM(768, 512)
    (fc): Linear(in_features=512, out_features=4556, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (attention): Attention(
      (fc_1): Linear(in_features=1024, out_features=512, bias=True)
      (fc_2): Linear(in_features=512, out_features=1, bias=False)
    )
  )
)

In [35]:
random_eval_idx = int(np.random.choice(list(range(NUM_EXAMPLES))))
print_sentences(data=train_data, idx=random_eval_idx, model=translator, src_lang=src_lang, tgt_lang=tgt_lang, device=device)

SOURCE: ein kind vor seinem eigenen spiegelbild , das sich zur kamera dreht und lächelt . <EOS>
TARGET: a child in front of his own reflection turning towards the camera and smiling . <EOS>
MODEL: a child in front of his reflection reflection , the the camera . smiling . <EOS>


In [36]:
# idxs = [42, 422, 10, 7, 999]
idxs = [np.random.randint(low=0, high=NUM_EXAMPLES) for _ in range(5)]

for idx in idxs:
    print_sentences(data=train_data, idx=idx, model=translator, src_lang=src_lang, tgt_lang=tgt_lang, device=device)
    print("-" * 100)

SOURCE: ein mann sitzt auf einer <UNK> und arbeitet an einer maschine . <EOS>
TARGET: man sitting atop oil rig on machine working . <EOS>
MODEL: a sitting on a stoop of a working on a
----------------------------------------------------------------------------------------------------
SOURCE: drei frauen stehen bei einem strand im wasser . <EOS>
TARGET: three women standing in the water beside a beach . <EOS>
MODEL: three women standing in the water on a beach . <EOS>
----------------------------------------------------------------------------------------------------
SOURCE: das rennauto und der fahrer von <UNK> fahren eine rennstrecke entlang . <EOS>
TARGET: the <UNK> racing car and driver driving down a racetrack . <EOS>
MODEL: the rally car and and the driver is a racetrack . <EOS>
----------------------------------------------------------------------------------------------------
SOURCE: eine junge blonde frau geht durch eine straße mit einer schwarzen handtasche . <EOS>
TARGET: a y

In [37]:
bleu = evaluate.load("bleu")

def get_tokenizer_fn(nlp: Language, lower: bool):
    def tokenizer_fn(s):
        tokens = [token.text for token in nlp.tokenizer(s)]
        if lower:
            tokens = [token.lower() for token in tokens]
        return tokens

    return tokenizer_fn

tokenizer_fn = get_tokenizer_fn(en_nlp, lower)

In [38]:
# compute BLEU metric on test data
predictions, references = [], []
for idx in tqdm(range(test_data.num_rows)):
    data_eval_src = test_data[idx]["de_ids"].to(device)
    sentence_evaluated = translate_from_tensor(translator,
                                    data_eval_src.unsqueeze(0),
                                    tgt_lang,
                                    tgt=None).split("<EOS>")[0]


    predictions.append(sentence_evaluated)
    references.append(test_data[idx]["en"])

bleu.compute(predictions=predictions, references=references, tokenizer=tokenizer_fn)

100%|██████████| 1000/1000 [00:30<00:00, 32.62it/s]


{'bleu': 0.3086418885670576,
 'precisions': [0.6574118397957556,
  0.39413906710594765,
  0.25090184165559143,
  0.16498846234529055],
 'brevity_penalty': 0.9590555442023709,
 'length_ratio': 0.9598713432378618,
 'translation_length': 12534,
 'reference_length': 13058}