## Neural Machine Traslation using Encoder-Decoder Architecture

The aim of this notebook is to implement a Neural Machine Traslation (NMT) using basic [encoder-decoder](https://proceedings.neurips.cc/paper/2014/file/a14ac55a4f27472c5d894ec1c3c743d2-Paper.pdf) approach without using attention mechanism




In [1]:
%%capture
# Donload de dataset
# !mkdir MNT-Dataset
# !wget -P MNT-Dataset/ https://www.manythings.org/anki/spa-eng.zip
# !unzip MNT-Dataset/spa-eng.zip -d MNT-Dataset/

In [2]:
# import libaries
import torch
import spacy

import numpy as np
import pandas as pd
from torch import nn
import multiprocessing as mp

from typing import List
from tqdm.notebook import tqdm
from torch.nn.utils.rnn import pad_sequence
from torchmetrics.functional import bleu_score
from sklearn.model_selection import train_test_split



In [3]:
# load dataset
dataset = pd.read_table("MNT-Dataset/spa.txt", header=None, names=["english", "spanish", "ref"]).drop(labels=["ref"], axis=1)
print(dataset.shape)
dataset.english = dataset.english.str.lower()
dataset.spanish = dataset.spanish.str.lower()
dataset.head()

(139636, 2)


Unnamed: 0,english,spanish
0,go.,ve.
1,go.,vete.
2,go.,vaya.
3,go.,váyase.
4,hi.,hola.


In [4]:
# define a tokenizer using spacy
class Tokenizer:

    def __init__(self, language: str = None) -> None:
        """
        A simple tokenizer class that uses Spacy to tokenize text.

        Parameteres:
        ------------
            language (str, optional): The language of the text to be tokenized. Defaults to None.
                Supported languages are 'sp' for Spanish and 'en' for English.
        """

        if language == "sp":
            self.nlp = spacy.load("es_core_news_sm")  # load the Spanish Spacy model
        elif language == "en":
            self.nlp = spacy.load("en_core_web_sm")  # load the English Spacy model

    def __call__(self, text: str) -> str:
        """
        Tokenizes a given text using the Spacy tokenizer.

        Args:
            text (str): The text to be tokenized.

        Returns:
            A list of strings representing the tokens in the text.
        """

        return [w.text for w in self.nlp.tokenizer(text)]  # return the text tokens

In [5]:
# Now we a language class that represents a language and its vocabulary
class Lang:
    def __init__(self, name:str, language:str="sp"):
        """
        A class for language preprocessing and encoding. It uses a tokenizer to split text into tokens, and encodes
        these tokens into integer values. It also provides methods to add sentences and words to the vocabulary, and to
        transform text into its encoded form.

        Parameters:
        -----------
        name : str
            A name for the language object.
        language : str, default='sp'
            The language of the text to process. Currently supported languages are 'sp' (Spanish) and 'en' (English).
        """
        
        self.name = name
        self.language = language
        self.word2index = {"<pad>": 0, "<start>": 1, "<end>": 2, "<unk>": 3}
        self.word2count = {}
        self.index2word = {0: "<pad>", 1: "<start>", 2: "<end>", 3: "<unk>"}
        self.n_words = 4  # Count SOS and EOS
        self.tokenizer = Tokenizer(language)

    def addSentence(self, sentence:str):
        """
        Add a sentence to the vocabulary.

        Parameters:
        -----------
        sentence : str
            The sentence to add.
        """
        
        for word in self.tokenizer(sentence):
            self.addWord(word)

    def addWord(self, word:str):
        """
        Add a word to the vocabulary.

        Parameters:
        -----------
        word : str
            The word to add.
        """
        
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

    def fit(self, dataset:List[str]):
        """
        Build the vocabulary from a dataset.

        Parameters:
        -----------
        dataset : list
            A list of sentences to add to the vocabulary.
        """
        
        for data in tqdm(dataset):
            self.addSentence(data)

    def transform(self, text:str, padding:bool=True):
        """
        Transform text into its encoded form.

        Parameters:
        -----------
        text : str
            The text to encode.
        padding : bool, default=True
            Whether to pad the sequence to the maximum sequence length.

        Returns:
        --------
        encoding : list
            A list of integers representing the encoded sequence.
        """

        tokens = self.tokenizer(text)
        if padding:
            tokens = ["<start>"] + tokens + ["<end>"]
            tokens = tokens

        encoding = [self.word2index[tk] if tk in self.word2index.keys() else 3 for tk in tokens]

        return encoding
    
    def inverse_transform(self, tokens:List):
        """
        Decodes the encoded sequence of integers using the vocabulary of the language.

        Parameters:
        -----------
            tokens: list
                The encoded sequence of integers to decode.

        Returns:
        --------
            str: The decoded sentence.
        """
        
        words = [self.index2word[tk] for tk in tokens]

        return " ".join(words)
    
    @staticmethod
    def right_padding_per_batch(batch: tuple):
        """
        Pads the sequence of tokens with 0s to match the sequence length per batch. 
        This method will be pass to the collate_fn argument of the Dataloader class.

        Parameters:
        -----------
            batch: tuple 
                The sequence of tokens to pad.

        Returns:
        --------
            tuple: The padded sequence of tokens in the batch.
        """
    
        en_text_bs, sp_text_bs = [], []

        for en_text, sp_text in batch:
            en_text_bs.append(en_text)
            sp_text_bs.append(sp_text)

        en_text_bs = pad_sequence(en_text_bs, padding_value=0)
        sp_text_bs = pad_sequence(sp_text_bs, padding_value=0)

        return en_text_bs, sp_text_bs

# Data loader

We define a custom data loader that output the token for the sentences in spanish and english

In [6]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, dataset:pd.DataFrame, sp_lang:Lang=None, en_lang:Lang=None):
        """
        A PyTorch custom dataset for language translation.

        Parameters:
        ----------
        dataset : DataFrame
            The dataset containing the English and Spanish sentences.
        sp_lang: Lang
            The language object for the Spanish language. Default None
        en_lang: Lang
            The language object for the English language. Default None
        """
    
        self.dataset = dataset

        if isinstance(sp_lang, Lang) and isinstance(en_lang, Lang):
            self.sp_lang = sp_lang
            self.en_lang = en_lang
            
        else:
            # Initialize language objects for Spanish and English
            self.sp_lang = Lang("sp", language="sp")
            self.sp_lang.fit(dataset.spanish)

            self.en_lang = Lang("en", language="en")
            self.en_lang.fit(dataset.english)

    def __len__(self):
        """
        Returns the number of samples in the dataset.

        Returns:
        -------
        int
            The number of samples in the dataset

        """
        
        return len(self.dataset)

    def __getitem__(self, idx):
        """
        Returns a sample from the dataset.

        Parameters:
        ----------
        idx : int
            The index of the sample to return.

        Returns:
        -------
        tuple of torch.Tensor
            The English sentence and the Spanish sentence as tensors.

        """
        
        # Get the Spanish and English sentences from the dataset
        sp_text = self.dataset.spanish.tolist()[idx]
        en_text = self.dataset.english.tolist()[idx]

        # Transform the Spanish and English sentences using the language objects
        sp_text = self.sp_lang.transform(sp_text)
        en_text = self.en_lang.transform(en_text)

        # Convert the transformed sentences to tensors
        sp_text = torch.Tensor(sp_text).long()
        en_text = torch.Tensor(en_text).long()

        return en_text, sp_text

In [7]:
# test the dataloader
ds_train = CustomDataset(dataset)

  0%|          | 0/139636 [00:00<?, ?it/s]



  0%|          | 0/139636 [00:00<?, ?it/s]

In [8]:
# get the spanish and English vocab size
sp_vocab = ds_train.sp_lang.n_words
en_vocab = ds_train.en_lang.n_words

# Encoder

As encoder we use a GRU layer that returns the hidden state

$$O_t, h_t = GRU(x_t, h_{t-1})$$


Here, $O_t$ is the output at time step $t$, $h_t$ is the hidden state at time step t, $x_t$ is the input at time step $t$, and $h_{t-1}$ is the hidden state from the previous time step.

In [9]:
class Encoder(nn.Module):
    def __init__(
        self, input_size, embedding_dim=100, n_layers=2, hidden_dim=10, dropout=0.5
    ):
        """
        Encode input sequences using a GRU-based neural network.

        Parameters
        ----------
        input_size : int
            The size of the input vocabulary.
        embedding_dim : int, optional (default=100)
            The dimension of the input word embeddings.
        n_layers : int, optional (default=2)
            The number of GRU layers.
        hidden_dim : int, optional (default=10)
            The dimension of the hidden state of the GRU.
        dropout : float, optional (default=0.5)
            The dropout probability to use in the GRU and embedding layers.
        """

        super().__init__()
        self.input_size = input_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Embedding layer
        # input shape: either [batch_size, seq_len] or [seq_len, batch_size]
        self.embeddings = nn.Embedding(input_size, embedding_dim)

        # GRU layers
        # input shape: [batch_size, seq_len, features] if batch_first=True
        # input shape: [seq_len, batch_size, features] if batch_first=False
        self.rnn = nn.GRU(
            embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first=False
        )

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """
        Compute the forward pass of the encoder on input `x`.

        Parameters
        ----------
        x : torch.Tensor
            The input tensor with shape (seq_len, batch_size).

        Returns
        -------
        torch.Tensor
            The output tensor with shape (seq_len, batch_size, num_directions * hidden_dim).
        torch.Tensor
            The hidden state tensor with shape (n_layers * n_directions, batch_size, hidden_dim).
        """
        # x shape: [seq_len, batch_size]

        # Embedding Layer
        # output shape: [seq_len, batch_size, emb_dim]
        x = self.dropout(self.embeddings(x))

        # GRU Layer
        # output(x) shape: [seq_len, batch_size, num_directions * hidden_dim]
        # output(hidden) shape: = [n_layers * n_directions, batch_size, hidden_dim]
        x, hidden = self.rnn(x)
        return x, hidden


In [10]:
# Test encoder
encoder = Encoder(100)

x = torch.tensor([[1, 3, 4], [4, 5, 6]], dtype=torch.long)
x, hidden = encoder(x.T)
print(x.shape)
print(hidden.shape)

torch.Size([3, 2, 10])
torch.Size([2, 2, 10])


# Decoder
The decoder (d) is responsible for generating the translated output sentence in the target language. In this notebook, we will be using a simple decoder architecture that consists of a single GRU layer followed by a multi-layer perceptron (MLP) that outputs logits. The decoder takes as input the previously generated word $y_{t-1}$, the previous hidden state $s_{t-1}$, and the context vector $c$, which is calculated by the encoder. The output of the decoder at each time step $t$ is a set of logits $logits_t$, which are used to compute the probability distribution over the target vocabulary. The decoder also updates its own hidden state $s_t$ at each time step, which is used to condition the generation of subsequent words in the output sequence.

The equations for the decoder can be written as follows:

$$logits_t, s_t = d(y_{t-1}, s_{t-1}, c)$$

where $y_{t-1}$ is the previously generated word, $s_{t-1}$ is the previous hidden state of the decoder, and $c$ is the context vector computed by the encoder. The output of the decoder at time step $t$ is represented by $logits_t$, and the updated hidden state is represented by $s_t$. The computation of the context vector is described in the encoder section.

In [13]:
class Decoder(nn.Module):
    def __init__(
        self,
        input_size: int,
        output_size: int,
        embedding_dim: int = 100,
        n_layers: int = 2,
        hidden_dim: int = 10,
        dropout: float = 0.5,
    ):
        """
        A class representing a decoder in a sequence-to-sequence model.

        Parameters:
        -----------
        input_size : int
            The size of the vocabulary of the input language.
        output_size : int
            The size of the vocabulary of the output language.
        embedding_dim : int
            The dimensionality of the embeddings for the input language. Default is 100.
        n_layers : int
            The number of layers in the GRU. Default is 2.
        hidden_dim : int
            The number of features in the GRU. Default is 10.
        dropout : float
            The probability of dropping out a neuron. Default is 0.5.
        """

        super().__init__()

        self.input_size = input_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        # Embedding layer
        # input shape: either [batch_size, seq_len] or [seq_len, batch_size]
        self.embeddings = nn.Embedding(input_size, embedding_dim)

        # GRU layers
        # input shape: [batch_size, seq_len, features] if batch_first=True
        # input shape: [seq_len, batch_size, features] if batch_first=False
        self.rnn = nn.GRU(
            embedding_dim + hidden_dim,
            hidden_dim,
            n_layers,
            batch_first=False,
            dropout=dropout,
        )

        # Fully conected layer
        self.fc = nn.Linear(hidden_dim, output_size)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, hidden, context):
        """
        Perform forward pass of the decoder.

        Parameters:
        -----------
        x : torch.Tensor
            The input tensor with shape [seq_len=1, batch_size].
        hidden : torch.Tensor
            The hidden state of the GRU with shape [n_layers * n_directions, batch_size, hid_dim].
        context : torch.Tensor
            The context vector with shape [1, batch_size, hid_dim].

        Returns:
        --------
        x : torch.Tensor
            The output tensor with shape [batch_size, output_size].
        hidden : torch.Tensor
            The updated hidden state of the GRU with shape [n_layers * n_directions, batch_size, hidden_dim].
        """

        # x shape: [seq_len=1, batch_size]
        # hidden shape: = [n_layers * n_directions, batch_size, hid_dim]
        # context shape: [1, batch_size, hid_dim]

        # Embeddings
        # output shape: [seq_len=1, batch_size, emb_dim]
        x = self.dropout(self.embeddings(x))

        # Concatenation
        # output shape: [seq_len, batch_size, emb_dim + hid_dim]
        x = torch.cat((x, context), dim=2)

        # GRU Layer
        # output(x) shape: [seq_len=1, batch_size, num_directions * hidden_dim]
        # output(hidden) shape: = [n_layers * n_directions, batch_size, hidden_dim]
        x, hidden = self.rnn(x, hidden)

        # Fully connected layer
        # output shape: [batch_size, output_size]
        x = self.fc(x)

        return x, hidden


### Test the decoder

Now we test that the decoder is working as we expect.

In [17]:
# test Decoder class
bs = 2
hidden_dim = 10
seq_len = 3
n_layers = 2

hidden = torch.rand((n_layers, bs, hidden_dim))
context = torch.rand((1, bs, hidden_dim))
x = torch.tensor([[1], [4]], dtype=torch.long)

decoder = Decoder(100, output_size=100)
x, hidden = decoder(x.T, hidden, context)
print(x.shape)
print(hidden.shape)

torch.Size([1, 2, 100])
torch.Size([2, 2, 10])


# Neuaral Machine Transalation

Now we build a Neural Machine Translation using the encoder-decoder approach without using the attention mechanism

In [18]:
class NMT(nn.Module):
    def __init__(
        self,
        en_vocab,
        sp_vocab,
        en_lang,
        sp_lang,
        embedding_dim,
        n_layers=2,
        hidden_dim=10,
    ):
        super().__init__()

        """
        Neural Machine Translation model (NMT) based on encoder-decoder architecture without attention.

        Parameters
        ----------
        en_vocab : int
            Size of the English vocabulary.
        sp_vocab : int
            Size of the Spanish vocabulary.
        en_lang : object
            English language object.
        sp_lang : object
            Spanish language object.
        embedding_dim : int
            Dimension of the word embedding space.
        n_layers : int, optional
            Number of layers in the encoder and decoder (default is 2).
        hidden_dim : int, optional
            Dimension of the hidden state in the encoder and decoder (default is 10).
        """

        self.en_vocab = en_vocab
        self.sp_vocab = sp_vocab
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.sp_lang = sp_lang
        self.en_lang = en_lang

        # Define Encoder and Decoder
        self.encoder = Encoder(en_vocab, embedding_dim, n_layers, hidden_dim)
        self.decoder = Decoder(sp_vocab, sp_vocab, embedding_dim, n_layers, hidden_dim)

    def forward(self, x, y):
        """
        Forward pass of the NMT model.

        Parameters
        ----------
        x : tensor
            Tensor of shape (seq_len, batch_size) containing the input sequences in English.
        y : tensor
            Tensor of shape (seq_len, batch_size) containing the input sequences in Spanish.

        Returns
        -------
        outputs : tensor
            Tensor of shape (seq_len, batch_size, sp_vocab) containing the predicted Spanish sequences.
        """

        # shape x: [seq_len, batch_size]
        # shape y: [seq_len, batch_size]

        target_len = y.shape[0]
        batch_size = x.shape[1]

        # outputs tensor
        # output shape: [seq_len, batch_size, vocab_size]
        outputs = torch.zeros(target_len, batch_size, self.sp_vocab).to(self.device)

        # Encoder
        # output(y_encoder) shape: [seq_len, batch_size, num_directions * hidden_dim]
        # output(hidden) shape: = [n_layers * n_directions, batch_size, hidden_dim]
        y_encoder, hidden = self.encoder(x)

        # Initial prediction
        # output shape: [1, batch_size]
        x_decoder = y[[0], :]

        # context vector
        # output(hidden) shape: [1, batch_size, num_directions * hidden_size]
        context = y_encoder[[-1], :, :]

        for t in range(1, target_len):
            # output(output) shape: [batch_size, output_size]
            # output(hidden) shape: = [n_layers * n_directions, batch_size, hidden_dim]
            output, hidden = self.decoder(x_decoder, hidden, context)
            outputs[[t], :, :] = output
            y_decoder = output.argmax(-1)
            x_decoder = y[[t], :] if np.random.random() < 0.5 else y_decoder

        # output shape: [seq_len, batch_size, vocab_size]
        return outputs

    def translate_sentence(self, x):
        """
        Translate an English sentence to Spanish.

        Parameters
        ----------
        x : str
            English sentence to translate.

        Returns
        -------
        outputs : list
            List of integers representing the predicted Spanish sequence.
        """

        self.eval()

        # Transform input text to tokens
        x = (
            torch.Tensor(self.en_lang.transform(x))
            .long()
            .reshape(-1, 1)
            .to(self.device)
        )

        # define output array
        outputs = []

        # Initial token <start>
        x_decoder = torch.Tensor([[1]]).long().to(self.device)

        # pass sentence to the encoder
        y_encoder, hidden = self.encoder(x)

        # Define cotext vector
        context = y_encoder[[-1], :, :]

        t = 1

        # this will run until prediction is <end> or t >= 200
        while x_decoder != 2:
            output, hidden = self.decoder(x_decoder, hidden, context)
            outputs.append(output.argmax(-1).item())
            x_decoder = y_decoder = output.argmax(-1)

            if t >= 200:
                break

            t += 1

        return self.sp_lang.inverse_transform(outputs)

    def config_model(self, device: str = "cuda"):
        """
        Configure the NMT model.

        Parameters
        ----------
        device : str, optional
            Device to use (default is "cuda").
        """

        # define device to operate
        self.device = device

        # set model's device
        self.to(self.device)

        # define loss function
        self.loss = nn.CrossEntropyLoss(ignore_index=0)

        # define optimizer
        self.optimizer = torch.optim.Adam(self.parameters())

    def train_one_epoch(self, train_loader):
        """
        Train the NMT model for one epoch.

        Parameters
        ----------
        train_loader : DataLoader
            DataLoader object containing the training data.

        Returns
        -------
        logs : dict
            Dictionary containing the training loss and BLEU score.
        """

        running_loss = 0
        bleu = 0

        self.train()

        bar = tqdm(train_loader, leave=True)

        for step, (x, y) in enumerate(bar, 1):
            self.optimizer.zero_grad()

            # set device
            x, y = x.to(self.device), y.to(self.device)

            # forward pass
            logits = self(x, y)  # shape: [seq_len, batch_size, vocab_size]

            # Remove <start> from target
            y = y[1:]
            logits = logits[1:]

            # Compute loss
            loss = self.loss(logits.reshape(-1, logits.shape[2]), y.reshape(-1))

            # Clip the gradient value is it exceeds > 1
            torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1)

            # Compute gradients
            loss.backward()

            # Update weigths
            self.optimizer.step()

            # compute running loss
            running_loss += loss.item()

            # predictions
            y_pred = logits.argmax(-1).detach().cpu().numpy()
            y_pred = [
                self.sp_lang.inverse_transform(y_pred[:, i]) for i in range(x.shape[1])
            ]

            # true labels
            y = y.detach().cpu().numpy()
            y = [self.sp_lang.inverse_transform(y[:, i]) for i in range(x.shape[1])]

            bleu += bleu_score(y_pred, y).item()

            bar.set_description(
                f"Train loss {round(running_loss/step, 3)}, "
                f"Train BLEU {round(bleu/step, 3)}"
            )

        logs = {
            "Train loss": round(running_loss / step, 3),
            "Train BLEU": round(bleu / step, 3),
        }

        return logs

    def test_one_epoch(self, test_loader):
        """
        Test the NMT model for one epoch.

        Parameters
        ----------
        test_loader : DataLoader
            DataLoader object containing the test data.

        Returns
        -------
        logs : dict
            Dictionary containing the test loss and BLEU score.
        """

        running_loss = 0
        bleu = 0

        self.eval()

        with torch.no_grad():
            bar = tqdm(test_loader, leave=True)

            for step, (x, y) in enumerate(bar, 1):
                self.optimizer.zero_grad()

                # set device
                x, y = x.to(self.device), y.to(self.device)

                # forward pass
                logits = self(x, y)  # shape: [seq_len, batch_size, vocab_size]

                # Remove <start> from target
                y = y[1:]
                logits = logits[1:]

                # Compute loss
                loss = self.loss(logits.reshape(-1, logits.shape[2]), y.reshape(-1))

                # compute running loss
                running_loss += loss.item()

                # predictions
                y_pred = logits.argmax(-1).detach().cpu().numpy()
                y_pred = [
                    self.sp_lang.inverse_transform(y_pred[:, i])
                    for i in range(x.shape[1])
                ]

                # true labels
                y = y.detach().cpu().numpy()
                y = [self.sp_lang.inverse_transform(y[:, i]) for i in range(x.shape[1])]

                bleu += bleu_score(y_pred, y).item()

                bar.set_description(
                    f"Test loss {round(running_loss/step, 3)}, "
                    f"Test BLEU {round(bleu/step, 3)}"
                )

                logs = {
                    "Test loss": round(running_loss / step, 3),
                    "Test BLEU": round(bleu / step, 3),
                }

                return logs

    def fit(self, train_loader, test_loader, epochs: int = 1):
        """
        Train and evalaute the model for N epochs

        Parameteres:
        -----------
        train_loader : DataLoader
            DataLoader object containing the training data.
        test_loader : DataLoader
            DataLoader object containing the test data.
        epochs: int
            Number of epochs to train and evalaute the data loader
        """

        bar = tqdm(range(epochs))

        for epoch in bar:
            train_logs = self.train_one_epoch(train_loader)
            test_logits = self.test_one_epoch(test_loader)

            logs = train_logs
            logs = logs.update(test_logits)

            print(self.translate_sentence("the man who sold the world"))

            bar.set_description(logs)


# Train and Evaluate de NMT model

In [19]:
# split data to trian and test
train_df, test_df = train_test_split(dataset, test_size=0.1, random_state=42)

# define train and test dataset
ds_train = CustomDataset(train_df)
ds_test = CustomDataset(test_df, sp_lang=ds_train.sp_lang, en_lang=ds_train.en_lang)

# define train and test datalaoder
loader_train  = torch.utils.data.DataLoader(ds_train, batch_size=64, num_workers=8, shuffle=True, collate_fn=Lang.right_padding_per_batch)
loader_test  = torch.utils.data.DataLoader(ds_test, batch_size=64, num_workers=8, shuffle=False, collate_fn=Lang.right_padding_per_batch)

#Define spanish and english vocab size
sp_vocab = ds_train.sp_lang.n_words
en_vocab = ds_train.en_lang.n_words

  0%|          | 0/125672 [00:00<?, ?it/s]



  0%|          | 0/125672 [00:00<?, ?it/s]

In [None]:
# instance de NMT model
nmt = NMT(en_vocab, sp_vocab, ds_train.en_lang, ds_train.sp_lang, embedding_dim=300, n_layers=2, hidden_dim=512)
nmt.config_model(device="cuda")

In [None]:
# fit the NMT model
nmt.fit(loader_train, loader_test, epochs = 10)

In [15]:
# test model performance on train set
nmt.test_one_epoch(loader_train)

  0%|          | 0/1955 [00:00<?, ?it/s]

{'Test loss': 1.68, 'Test BLEU': 0.195}

In [16]:
# Test model performance on test set
nmt.test_one_epoch(loader_test)

  0%|          | 0/218 [00:00<?, ?it/s]

{'Test loss': 2.216, 'Test BLEU': 0.126}

In [17]:
# Save the model
torch.save(nmt, "NMT-GRU-no-attention.pth")

In [18]:
# Load the model
nmt = torch.load("NMT-GRU-no-attention.pth")

In [32]:
# test the NMT with dummy a sentence
nmt.translate_sentence("the man who sold the world")

'el hombre dijo el mundo . <end>'

In [31]:
# Test more translations
for _, text in dataset.sample(5).iterrows():
    
    print("------------------------------------------------------------------------------------------------------")
    print("input text: ",text.english)
    print("translation: ",nmt.translate_sentence(text.english))
    print("-----------------------------------------------------------------------------------------------------\n")

------------------------------------------------------------------------------------------------------
input text:  tom is just like me.
translation:  tom es como yo . <end>
-----------------------------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------------
input text:  i have the ace of hearts.
translation:  tengo as de picas . <end>
-----------------------------------------------------------------------------------------------------

------------------------------------------------------------------------------------------------------
input text:  if you drive carefully you'll avoid accidents.
translation:  si te vas con evitarás evitarás evitarás evitarás . <end>
-----------------------------------------------------------------------------------------------------

----------------------------------------------------------------------------------------------------