# Neural Machine Translation

General Reference: https://github.com/nyu-dl/NLP_DL_Lecture_Note/blob/master/lecture_note.pdf <br>
Original Notebook: https://github.com/nyu-dl/AMMI-2019-NLP-Part2

### Install packages

In [None]:
!pip install torch
!pip install subword-nmt
!pip install sacremoses
!pip install googletrans==3.1.0a0
!pip install pandas
!pip install sacrebleu
!pip install matplotlib
!pip install bertviz

### Set up Google Translate API for Comparison

https://github.com/ssut/py-googletrans

In [None]:
from googletrans import Translator
google_translator = Translator()

### Python imports

In [None]:
"""
To run this notebook in Google Colab, you need to do the following first:
1. Go to "Runtime / Change runtime type", then select "GPU" in the "Hardware accelerator" drop-down list
2. Open this link: https://drive.google.com/drive/folders/1E07YaKths98YpoBCH2PjdtTPqOXgfdZB?usp=sharing
3. Then go to "Shared with me" in your Google Drive, right-click the "ALPS2022-NMT" folder
and select "Add shortcut to Drive"
"""

cpu = False
colab = True   # set to False to run locally and not from Google Colab
model_root = 'models'  # where new models will be saved

if colab:
    # Download the python files from the ALPS Github
    !wget https://raw.githubusercontent.com/naverlabseurope/ALPS2021-MT-LAB/ALPS2022/data.py
    !wget https://raw.githubusercontent.com/naverlabseurope/ALPS2021-MT-LAB/ALPS2022/models.py
    # Mount your Google Drive, which should contain a link to "ALPS2022-NMT"
    from google.colab import drive
    drive.flush_and_unmount()
    drive.mount('/content/drive')
    root_dir = '/content/drive/MyDrive/ALPS2022-NMT'
    # model_root = '/content/drive/MyDrive/ALPS2022-models' # uncomment to save your models to your Google Drive
    !ls {root_dir}/*
else:
    # Download the datasets and pre-trained models
    # Modify this script to download data in other language pairs than EN-FR
    !scripts/download-data.sh
    root_dir = '.'

import os
import sys
import data
import models
import numpy as np
import torch
import torch.nn as nn
import time
from collections import OrderedDict
from tqdm.notebook import tqdm
import numpy as np
import matplotlib.pyplot as plt
from subword_nmt.apply_bpe import BPE   # segment text into subword units
%matplotlib inline

## The Dataset

We will work with a small English to French dataset from https://www.manythings.org/anki/. It contains translations of short and simple sentences aimed at foreign language learners (from the [Tatoeba collaborative database](https://tatoeba.org/en/)). Of course, models trained on this data will not perform well on longer, more sophisticated sentences. They also won't be very robust to domain shift and input noise. To train stronger models, some larger datasets can be downloaded from https://www.statmt.org/wmt21/ or https://opus.nlpl.eu/.

In [None]:
source_lang, target_lang = 'en', 'fr'
data_dir = os.path.join(root_dir, 'data')
pretrained_model_dir = os.path.join(root_dir, 'pretrained_models', f'{source_lang}-{target_lang}')
model_dir = os.path.join(model_root, f'{source_lang}-{target_lang}')
!mkdir -p {model_dir}
!head -5 {data_dir}/train.en-fr.en

## Load and preprocess the data

1. Load the BPE model
2. Load the parallel corpora for this language pair (train, valid and test). `load_data` will load a corpus and tokenize it with the BPE model with the `preprocess` function.
3. Create (or load) dictionaries that map BPE tokens to token IDs (`data.load_or_create_dictionary` function)
4. Binarize the data: map source and target text sequences to sequences of IDs, and sort the training set by length (`data.binarize` function)
5. Create batches (`data.BatchIterator` class): group multiple sequence pairs of similar length together, pad them to the maximum length and create numpy arrays that can be used to train our models

In [None]:
# set the random seed: initialize the random number generator for reproducibility
def reset_seed(seed=1234):
    """Re-seed the NumPy and PyTorch random number generators with `seed`."""
    # Both libraries keep their own global generator, so each one is seeded in turn
    for seed_generator in (np.random.seed, torch.manual_seed):
        seed_generator(seed)

#### 1. Load the BPE model (multilingual BPE model, works with French, German and English)

In [None]:
bpe_path = os.path.join(data_dir, 'bpecodes.de-en-fr')

# The merge table covers French and German, so it may contain non-ASCII subwords:
# read it explicitly as UTF-8 rather than with the platform's default encoding.
with open(bpe_path, encoding='utf-8') as bpe_codes:
    bpe_model = BPE(bpe_codes)

def preprocess(line, is_source=True, source_lang=None, target_lang=None):
    """Lowercase `line` and segment it into subword units with the BPE model.

    `is_source`, `source_lang` and `target_lang` are accepted but not used by this version.
    """
    lowercased = line.lower()
    segmented = bpe_model.segment(lowercased)
    return segmented

def postprocess(line):
    """Undo the BPE segmentation of `line` by merging subword units back into words.

    Non-final subword units carry a trailing '@@' marker (e.g. 'hel@@ lo').
    In addition to the '@@ ' separators, a dangling '@@' at the very end of the line is
    removed: it can be left over when an output is cut off in the middle of a word.
    """
    import re   # local import, so that this function does not depend on the import cell

    return re.sub(r'(@@ )|(@@ ?$)', '', line)

def load_data(source_lang, target_lang, split='train', max_size=None):
    """Load one split of the parallel corpus of a language pair with `data.load_dataset`.

    source_lang, target_lang: language codes, used to build the corpus path
    split: corpus name prefix ('train', 'valid' or 'test' in this notebook)
    max_size: max number of sentence pairs in the training corpus (None = all);
        set it to 10000 for fast debugging
    """
    corpus_name = f'{split}.{source_lang}-{target_lang}'
    corpus_path = os.path.join(data_dir, corpus_name)
    return data.load_dataset(
        corpus_path,
        source_lang,
        target_lang,
        preprocess=preprocess,
        max_size=max_size,
    )

#### 2. Load and preprocess the parallel corpora (these are pandas DataFrames)

In [None]:
train_data = load_data(source_lang, target_lang, 'train', max_size=None)   # set max_size to 10000 for fast debugging
valid_data = load_data(source_lang, target_lang, 'valid')
test_data = load_data(source_lang, target_lang, 'test')
print(train_data.iloc[:5])   # to see the first 5 rows of train_data

#### 3. Load or create the dictionaries

In [None]:
source_dict_path = os.path.join(pretrained_model_dir, f'dict.{source_lang}.txt')
target_dict_path = os.path.join(pretrained_model_dir, f'dict.{target_lang}.txt')

source_dict = data.load_or_create_dictionary(
    source_dict_path,
    train_data['source_tokenized'],
    reset=False,    # set reset to True if you're changing the data or the preprocessing
)
print(source_dict.words[:100])   # print the first 100 words in the source vocabulary

target_dict = data.load_or_create_dictionary(
    target_dict_path,
    train_data['target_tokenized'],
    reset=False,
)
print(target_dict.words[:100])

In [None]:
print('source vocab size:', len(source_dict))
print('target vocab size:', len(target_dict))

#### 4. Use the dictionaries to map tokens to indices. The training set is also sorted by length for more efficient batching.

In [None]:
data.binarize(train_data, source_dict, target_dict, sort=True)
data.binarize(valid_data, source_dict, target_dict, sort=False)
data.binarize(test_data, source_dict, target_dict, sort=False)
print(train_data.iloc[:5])  # print the first 5 rows of train_data
# the 'source_bin' and 'target_bin' columns contain the binarized data. Indices of 1 correspond to the EOS token

#### Data statistics:

In [None]:
print('train_size={}, valid_size={}, test_size={}, min_len={}, max_len={}, avg_len={:.1f}'.format(
    len(train_data),
    len(valid_data),
    len(test_data),
    train_data['source_len'].min(),
    train_data['source_len'].max(),
    train_data['source_len'].mean(),
))

print('Train source length distribution:')
# The 90th percentile is the value below which 90% of the data fall.
# We see that 90% of training examples have 14 source words or fewer.
print(train_data['source_len'].quantile([0.5, 0.75, 0.9, 0.95, 0.99, 0.999, 0.9999]))

#### 5. Build batches. The training batches are automatically shuffled before each epoch

In [None]:
max_len = 30       # maximum 30 tokens per sentence (longer sequences will be truncated)
batch_size = 512   # maximum 512 tokens per batch (decrease if you get OOM errors, increase to speed up training)

reset_seed()

train_iterator = data.BatchIterator(train_data, source_lang, target_lang, batch_size=batch_size, max_len=max_len, shuffle=True)
valid_iterator = data.BatchIterator(valid_data, source_lang, target_lang, batch_size=batch_size, max_len=max_len, shuffle=False)
test_iterator = data.BatchIterator(test_data, source_lang, target_lang, batch_size=batch_size, max_len=max_len, shuffle=False)

#### Example of training batch:

In [None]:
print(next(iter(train_iterator)))

The Seq2Seq Model
=================

A Recurrent Neural Network, or RNN, is a network that operates on a
sequence and uses its own output as input for subsequent steps.

A [Sequence to Sequence network](http://arxiv.org/abs/1409.3215), or
seq2seq network, or [Encoder-Decoder network](https://arxiv.org/pdf/1406.1078v3.pdf), is a model
consisting of usually two RNNs called the encoder and decoder. The encoder reads
an input sequence and outputs a single vector, and the decoder reads
that vector to produce an output sequence. Essentially, all we need is some mechanism to read the source sentence and create an encoding and some mechanism to read the encoding and decode it to the target language.

Unlike sequence prediction with a single RNN, where every input
corresponds to an output, the seq2seq model frees us from sequence
length and order, which makes it ideal for translation between two
languages.

Consider the sentence "I am not the
black cat" → "Je ne suis pas le chat noir". Most of the words in the input sentence have a direct
translation in the output sentence, but are in slightly different
orders, e.g. "chat noir" and "black cat". Because of the "ne/pas"
construction there is also one more word in the output sentence. It would
be difficult to produce a correct translation directly from the sequence
of input words.

With a seq2seq model the encoder creates a single vector which, in the
ideal case, encodes the meaning of the input sequence into a single
vector — a single point in some N dimensional space of sentences.


The Encoder
-----------

The encoder is anything which takes in a sentence and gives us a representation for the sentence. 

The encoder of a seq2seq network can be an RNN that outputs some value for
every word from the input sentence. For every input word the encoder
outputs a vector and a hidden state, and uses the hidden state for the
next input word.

However, we will start with a simpler Bag-of-Words encoder and then move on to more complex encoders.

### Bag-of-Words Encoder

In [None]:
bow_encoder = models.BOW_Encoder(
    input_size=len(source_dict),
    hidden_size=512,
    num_layers=1,
    dropout=0.1,
    reduce='sum',
)

In [None]:
print(bow_encoder)

The Decoder
--------------------

The decoder is another network that takes the encoder output vector(s) and outputs a sequence of words to create the translation.

### Decoder without Attention

In the simplest seq2seq decoder we use only the last output of the encoder. This last output is sometimes called the context vector as it encodes context from the entire sequence. This context vector can be used as the initial hidden state for an RNN decoder.

At every step of decoding, the decoder is given an input token and hidden state. The initial input token is the start-of-string `<SOS>` token, and the first hidden state is the context vector (the encoder's last hidden state).

In [None]:
bow_decoder = models.RNN_Decoder(
    output_size=len(target_dict),
    hidden_size=512,
    num_layers=1,
    dropout=0.1,
)

In [None]:
print(bow_decoder)

In [None]:
bow_model = models.EncoderDecoder(
    bow_encoder,
    bow_decoder,
    lr=0.001,
    use_cuda=not cpu,
    target_dict=target_dict,
)

### Training code

In [None]:
def evaluate_model(model, *test_or_valid_iterators, record=False):
    """
    Evaluate a model on one or more test or validation sets, printing the loss and chrF of each.

    model: instance of models.EncoderDecoder
    test_or_valid_iterators: one or more data.BatchIterator, given as separate positional arguments
    record: save scores in the model checkpoint (the loss and chrF of each set are passed to `model.record`)

    Returns the chrF score averaged over all the given sets.
    Raises ValueError if no iterator is given.
    """
    if not test_or_valid_iterators:
        # Fail with a clear message rather than with a ZeroDivisionError when averaging below
        raise ValueError('evaluate_model needs at least one test or validation iterator')

    scores = []

    # Compute chrF over all test or validation sets
    for iterator in test_or_valid_iterators:
        src, tgt = iterator.source_lang, iterator.target_lang

        # Average the loss over the batches of this set
        loss = 0
        for batch in iterator:
            loss += model.eval_step(batch) / len(iterator)

        score = model.translate(iterator, postprocess).score

        print(f'{src}-{tgt}: loss={loss:.2f}, chrF={score:.2f}')

        if record:
            model.record(f'{src}_{tgt}_loss', loss)
            model.record(f'{src}_{tgt}_chrf', score)

        scores.append(score)

    # Average the validation chrF scores
    score = sum(scores) / len(scores)
    if len(scores) > 1:
        print(f'chrF={score:.2f}')

    return score


def train_model(train_iterator, valid_iterators, model, checkpoint_path, epochs=10):
    """
    Train a model for a number of epochs, validating after each epoch and checkpointing the best one.

    train_iterator: instance of data.BatchIterator or data.MultiBatchIterator
    valid_iterators: list of data.BatchIterator
    model: instance of models.EncoderDecoder
    checkpoint_path: path of the model checkpoint (its parent directory is created if it does not exist)
    epochs: iterate this many times over train_iterator

    The model is not reset: epochs are counted from `model.epoch`.
    """
    # `epochs` is a number of additional epochs: turn it into the index of the last epoch
    epochs += model.epoch

    reset_seed()

    # Create the checkpoint directory up front, so that a missing directory is not
    # discovered only when saving, after a full epoch of training
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    # NOTE(review): the best score starts over at each call, so when training is resumed the
    # first epoch overwrites the existing checkpoint even if it scores lower -- confirm this is intended
    best_score = -1
    for epoch in range(model.epoch + 1, epochs + 1):

        start = time.time()
        running_loss = 0

        print(f'Epoch [{epoch}/{epochs}]')

        # Iterate over training batches for one epoch
        with tqdm(enumerate(train_iterator), total=len(train_iterator)) as t:

            for i, batch in t:
                running_loss += model.train_step(batch)
                # Show the running average of the training loss next to the progress bar
                t.postfix = f' loss={running_loss / (i + 1):.3f}'

        # Average training loss for this epoch
        epoch_loss = running_loss / len(train_iterator)

        print(f'loss={epoch_loss:.3f}, time={time.time() - start:.2f}')
        model.record('train_loss', epoch_loss)

        score = evaluate_model(model, *valid_iterators, record=True)

        # Update the model's learning rate based on current performance.
        # This scheduler divides the learning rate by 10 if chrF does not improve.
        model.scheduler_step(score)

        # Save a model checkpoint if it has the best validation chrF so far
        if score > best_score:
            best_score = score
            model.save(checkpoint_path)

        print('=' * 50)

    print(f'Training completed. Best chrF is {best_score:.2f}')

### Train a model with BOW Encoder and RNN Decoder (or load a pre-trained model)

In [None]:
# Set this value to True to train your own model. By default, a pre-trained model will be loaded.
# Tip: you can set "epochs" to a small value (e.g., 2) and re-run this cell several times to continue training your model (`train_model` does not reset the model)
train_again = False

if train_again:
    checkpoint_path = os.path.join(model_dir, 'bow.pt')
else:
    checkpoint_path = os.path.join(pretrained_model_dir, 'bow.pt')

print('checkpoint path:', checkpoint_path)

if os.path.exists(checkpoint_path) and not train_again:
    bow_model.load(checkpoint_path)   # trained for 10 epochs
else:
    train_model(train_iterator, [valid_iterator], bow_model,
                epochs=2,
                checkpoint_path=checkpoint_path)

### Compute chrF on the test set

In [None]:
chrf = evaluate_model(bow_model, test_iterator)

### Interact with the model

In [None]:
def show_attention(input_sentence, output_words, attentions):
    """Plot an encoder-decoder attention matrix as a heatmap.

    The columns are labelled with the whitespace-separated tokens of `input_sentence` and the
    rows with those of `output_words`; `data.EOS_TOKEN` is appended to both lists of labels.
    """
    figure = plt.figure()
    axes = figure.add_subplot(111)
    heatmap = axes.matshow(attentions, cmap='bone', aspect='auto')
    figure.colorbar(heatmap)

    source_labels = [*input_sentence.split(), data.EOS_TOKEN]
    target_labels = [*output_words.split(), data.EOS_TOKEN]

    # One tick per token, with the source tokens written vertically
    axes.set_xticks(range(len(source_labels)))
    axes.set_xticklabels(source_labels, rotation=90)
    axes.set_yticks(range(len(target_labels)))
    axes.set_yticklabels(target_labels)
    plt.show()


def encode_as_batch(sentence, dictionary, source_lang, target_lang):
    """Build a batch dict from a single (already tokenized) sentence.

    `data.EOS_TOKEN` is appended to the sentence, which is then converted with
    `dictionary.txt2vec`; a leading dimension is added to the result with `unsqueeze(0)`.
    'source_len' holds the size of the last dimension of the resulting tensor.
    """
    with_eos = '{} {}'.format(sentence, data.EOS_TOKEN)
    source_ids = dictionary.txt2vec(with_eos).unsqueeze(0)
    length = source_ids.shape[-1]

    batch = {}
    batch['source'] = source_ids
    batch['source_len'] = torch.from_numpy(np.array([length]))
    batch['source_lang'] = source_lang
    batch['target_lang'] = target_lang
    return batch


def get_translation(model, sentence, dictionary, source_lang, target_lang, return_output=False):
    """
    Translate given sentence with given model. Also show translation outputs by Google Translate for comparison.

    model: object with a `decoding_step(batch)` method, which returns a tuple
        (predictions, attention matrices or None, encoder self-attention or None)
    sentence: raw source sentence (it is tokenized here with `preprocess`)
    dictionary: source dictionary, used to convert the tokenized sentence to a tensor
    source_lang, target_lang: language codes, passed to `preprocess`, the batch and Google Translate
    return_output: return a dict with the source, the prediction and the attention values

    The encoder-decoder attention matrix is plotted when the model returns one.
    Returns the results dict if `return_output` is True, None otherwise.
    """
    print('Source:', sentence)
    sentence_tok = preprocess(sentence, is_source=True, source_lang=source_lang, target_lang=target_lang)
    print('Tokenized source:', sentence_tok)
    batch = encode_as_batch(sentence_tok, dictionary, source_lang, target_lang)
    prediction, attn_matrix, enc_self_attn = model.decoding_step(batch)
    prediction = prediction[0]   # the batch contains a single sentence
    prediction_detok = postprocess(prediction)
    print('Prediction:', prediction)
    print('Detokenized prediction:', prediction_detok)

    # Google Translate is only a point of comparison and it goes through the network:
    # if it fails, report it and carry on, so that the model's own output, the attention
    # plot and the returned results are not lost.
    try:
        print('Google Translate ({}->{}): {}'.format(
            source_lang,
            target_lang,
            google_translator.translate(sentence, src=source_lang, dest=target_lang).text,
        ))
        print('Google Translate on prediction ({}->{}): {}'.format(
            target_lang,
            source_lang,
            google_translator.translate(prediction_detok, src=target_lang, dest=source_lang).text,
        ))
    except Exception as error:   # deliberately broad: the kind of failure does not matter here
        print(f'Google Translate is unavailable ({type(error).__name__}: {error})')

    results = {
        'source': sentence,
        # same EOS token as the one appended to the source by `encode_as_batch`
        'source_tokens': sentence_tok.split() + [data.EOS_TOKEN],
        'prediction_detok': prediction_detok,
        'prediction_tokens': prediction.split(),
    }

    if attn_matrix is not None:
        attn_matrix = attn_matrix[0].detach().cpu().numpy()
        results['attention_matrix'] = attn_matrix
        show_attention(sentence_tok, prediction, attn_matrix)

    if enc_self_attn is not None:
        results['encoder_self_attention_list'] = enc_self_attn

    if return_output:
        return results
    return None

In [None]:
get_translation(bow_model, 'hello how are you ?', source_dict, source_lang, target_lang)

The biggest limitation of a Bag-of-Words encoder is that it is insensitive to word order: <br>
when shuffling the words in the previous sentence, you get the same output.

In [None]:
get_translation(bow_model, 'are hello ? how you', source_dict, source_lang, target_lang)

In [None]:
get_translation(bow_model, "she 's five years older than me .", source_dict, source_lang, target_lang)

## RNN Encoder + RNN Decoder

In [None]:
rnn_encoder = models.RNN_Encoder(
    input_size=len(source_dict),
    hidden_size=512,
    num_layers=1,
    dropout=0.1,
)

In [None]:
print(rnn_encoder)

In [None]:
rnn_decoder = models.RNN_Decoder(
    output_size=len(target_dict),
    hidden_size=512,
    num_layers=1,
    dropout=0.1,
)

In [None]:
print(rnn_decoder)

In [None]:
rnn_model = models.EncoderDecoder(
    rnn_encoder,
    rnn_decoder,
    lr=0.001,
    use_cuda=not cpu,
    target_dict=target_dict,
)

### Train a model with RNN Encoder and RNN Decoder (or load a pre-trained model)

In [None]:
# Set this value to True to train your own model. By default, a pre-trained model will be loaded.
# Tip: you can set "epochs" to a small value (e.g., 2) and re-run this cell several times to continue training your model (`train_model` does not reset the model)
train_again = False

if train_again:
    checkpoint_path = os.path.join(model_dir, 'rnn.pt')
else:
    checkpoint_path = os.path.join(pretrained_model_dir, 'rnn.pt')

print('checkpoint path:', checkpoint_path)

if os.path.exists(checkpoint_path) and not train_again:
    rnn_model.load(checkpoint_path)   # trained for 10 epochs
else:
    train_model(train_iterator, [valid_iterator], rnn_model,
                epochs=2,
                checkpoint_path=checkpoint_path)

### Compute chrF on the test set

In [None]:
chrf = evaluate_model(rnn_model, test_iterator)

### Interact with the model

In [None]:
get_translation(rnn_model, 'hello how are you ?', source_dict, source_lang, target_lang)

Contrary to the BoW encoder, an RNN is sensitive to word ordering

In [None]:
get_translation(rnn_model, 'are hello ? how you', source_dict, source_lang, target_lang)

In [None]:
get_translation(rnn_model, "she 's five years older than me .", source_dict, source_lang, target_lang)

In [None]:
get_translation(rnn_model, 'i know that the last thing you want to do is help me .', source_dict, source_lang, target_lang)

## RNN Encoder + RNN Decoder with Encoder-Decoder Attention

In [None]:
rnn_attn_encoder = models.RNN_Encoder(
    input_size=len(source_dict),
    hidden_size=512,
    num_layers=1,
    dropout=0.1,
)

In [None]:
 print(rnn_attn_encoder)

In [None]:
rnn_attn_decoder = models.AttentionDecoder(
    output_size=len(target_dict),
    hidden_size=512,
    num_layers=1,
    dropout=0.1,
)

In [None]:
print(rnn_attn_decoder)

In [None]:
rnn_attn_model = models.EncoderDecoder(
    rnn_attn_encoder,
    rnn_attn_decoder,
    lr=0.0005,
    use_cuda=not cpu,
    target_dict=target_dict,
)

### Train a model with RNN Encoder and RNN Decoder with attention (or load a pre-trained model)

In [None]:
# Set this value to True to train your own model. By default, a pre-trained model will be loaded.
# Tip: you can set "epochs" to a small value (e.g., 2) and re-run this cell several times to continue training your model (`train_model` does not reset the model)
train_again = False

if train_again:
    checkpoint_path = os.path.join(model_dir, 'rnn-attn.pt')
else:
    checkpoint_path = os.path.join(pretrained_model_dir, 'rnn-attn.pt')

print('checkpoint path:', checkpoint_path)

if os.path.exists(checkpoint_path) and not train_again:
    rnn_attn_model.load(checkpoint_path)   # trained for 10 epochs
else:
    train_model(train_iterator, [valid_iterator], rnn_attn_model,
                epochs=2,
                checkpoint_path=checkpoint_path)

### Compute chrF on the test set

In [None]:
chrf = evaluate_model(rnn_attn_model, test_iterator)

### Interact with the model and visualize attention matrices

In [None]:
get_translation(rnn_attn_model, 'hello how are you ?', source_dict, source_lang, target_lang)

In [None]:
get_translation(rnn_attn_model, "she 's five years older than me .", source_dict, source_lang, target_lang)

In [None]:
get_translation(rnn_attn_model, 'i know that the last thing you want to do is help me .', source_dict, source_lang, target_lang)

## Transformer Model

[Transformer](https://arxiv.org/abs/1706.03762) is currently the state-of-the-art for Machine Translation. The encoder uses self-attention over the previous layers. The decoder combines self-attention and encoder-decoder attention.

In [None]:
transformer_encoder = models.TransformerEncoder(
    input_size=len(source_dict),
    hidden_size=512,
    num_layers=2,
    dropout=0.1,
    heads=4,
)

In [None]:
print(transformer_encoder)

In [None]:
transformer_decoder = models.TransformerDecoder(
    output_size=len(target_dict),
    hidden_size=512,
    num_layers=1,
    heads=4,
    dropout=0.1,
)

In [None]:
print(transformer_decoder)

In [None]:
transformer_model = models.EncoderDecoder(
    transformer_encoder,
    transformer_decoder,
    lr=0.0005,
    use_cuda=not cpu,
    target_dict=target_dict,
)

### Train a Transformer model (or load a pre-trained model)

In [None]:
# Set this value to True to train your own model. By default, a pre-trained model will be loaded.
# Tip: you can set "epochs" to a small value (e.g., 2) and re-run this cell several times to continue training your model (`train_model` does not reset the model)
train_again = False

if train_again:
    checkpoint_path = os.path.join(model_dir, 'transformer.pt')
else:
    checkpoint_path = os.path.join(pretrained_model_dir, 'transformer.pt')

print('checkpoint path:', checkpoint_path)

if os.path.exists(checkpoint_path) and not train_again:
    transformer_model.load(checkpoint_path)   # trained for 10 epochs
else:
    train_model(train_iterator, [valid_iterator], transformer_model,
                epochs=2,
                checkpoint_path=checkpoint_path)

### Compute chrF on the test set

In [None]:
chrf = evaluate_model(transformer_model, test_iterator)

### Interact with the model

In [None]:
# bertviz: tool for visualizing attention in the Transformer model
from bertviz import head_view, model_view

In [None]:
%%javascript
require.config({
  paths: {
      d3: '//cdnjs.cloudflare.com/ajax/libs/d3/5.7.0/d3.min',
    jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
  }
});

In [None]:
def show_head_view(results):
    """Show the bertviz head view of the encoder self-attention stored in `results`.

    `results` is a dict with the 'encoder_self_attention_list' and 'source_tokens' keys,
    such as the one returned by `get_translation(..., return_output=True)`.
    """
    head_view(
        results['encoder_self_attention_list'],
        results['source_tokens'],
        None,   # sentence_b_start
    )

def show_model_view(results):
    """Show the bertviz model view of the encoder self-attention stored in `results`.

    `results` is a dict with the 'encoder_self_attention_list' and 'source_tokens' keys,
    such as the one returned by `get_translation(..., return_output=True)`.
    """
    model_view(
        results['encoder_self_attention_list'],
        results['source_tokens'],
        None,   # sentence_b_start
    )

In [None]:
results = get_translation(transformer_model, 'hello how are you ?', source_dict, source_lang, target_lang, return_output=True)

In [None]:
show_head_view(results)

In [None]:
show_model_view(results)

In [None]:
results = get_translation(transformer_model, "she 's five years older than me .", source_dict, source_lang, target_lang, return_output=True)

In [None]:
results = get_translation(transformer_model, 'i know that the last thing you want to do is help me .', source_dict, source_lang, target_lang, return_output=True)

## Multilingual Transformer model

Load a pre-trained **de, fr <-> en** model. The same dictionary and embeddings are shared between all languages, and language codes (`<lang:de>`, `<lang:en>`, `<lang:fr>`) are prepended to each source sequence to identify the target language.

In [None]:
multi_model_dir = os.path.join(root_dir, 'pretrained_models', 'de-en-fr')

multi_dict = data.Dictionary.load(os.path.join(multi_model_dir, 'dict.txt'))

encoder = models.TransformerEncoder(input_size=len(multi_dict), hidden_size=512, num_layers=2, heads=4)
decoder = models.TransformerDecoder(output_size=len(multi_dict), hidden_size=512, num_layers=1, heads=4)
decoder.embedding = encoder.embedding
multi_model = models.EncoderDecoder(encoder, decoder, lr=0.0005, use_cuda=not cpu, target_dict=multi_dict)

checkpoint_path = os.path.join(multi_model_dir, 'transformer.pt')
multi_model.load(checkpoint_path)


### Multilingual evaluation

Modify the `preprocess` function to automatically prepend language codes to all source sequences (when calling `get_translation`, or `load_data`).

And load test sets in all language pairs.

In [None]:
def preprocess(line, is_source=True, source_lang=None, target_lang=None):
    """Lowercase `line`, segment it with the BPE model and tag source lines with a language code.

    When `is_source` is true, `<lang:TARGET_LANG>` is prepended to the segmented line;
    target-side lines are returned without a language code. `source_lang` is not used.
    """
    segmented = bpe_model.segment(line.lower())
    if not is_source:
        return segmented
    return f'<lang:{target_lang}> {segmented}'

test_sets = OrderedDict()

for pair in 'en-fr', 'fr-en', 'en-de', 'de-en', 'de-fr', 'fr-de':
    src, tgt = pair.split('-')
    dataset = load_data(src, tgt, 'test')
    data.binarize(dataset, source_dict=multi_dict, target_dict=multi_dict, sort=False)
    iterator = data.BatchIterator(dataset, src, tgt, batch_size=512, max_len=30, shuffle=False)
    test_sets[pair] = iterator
    
en_centric_test_sets = list(test_sets.values())[:4]
non_en_centric_test_sets = list(test_sets.values())[4:]

In [None]:
chrf = evaluate_model(multi_model, *en_centric_test_sets)

### Interact with the model

In [None]:
get_translation(multi_model, "she 's five years older than me .", multi_dict, source_lang='en', target_lang='fr')

In [None]:
get_translation(multi_model, 'sie ist fünf jahre älter als ich .', multi_dict, source_lang='de', target_lang='en')

### Zero-shot translation

In theory, the model can do **zero-shot** translation, i.e., translate between German and French even though it has never seen German-French sentence pairs during training.

In [None]:
chrf = evaluate_model(multi_model, *non_en_centric_test_sets)

#### However, in practice zero-shot performance is very bad. Interact with the model to understand why.

In [None]:
get_translation(multi_model, 'sie ist fünf jahre älter als ich .', multi_dict, 'de', 'fr')

In [None]:
get_translation(multi_model, 'elle a cinq ans de plus que moi .', multi_dict, 'fr', 'de')

## Your Turn!

Choose one of these exercises, or both!

### Hyper-parameter tuning

Find the best hyper-parameters for Transformer **en-fr**. Share your best test chrF scores on Slack!

*Don't forget to reload the `preprocess` function at the start of the notebook*

- Hyper-parameters: `lr`, `batch_size`, `num_layers`, `hidden_size`, `dropout`, `heads`, etc.
- Other improvements: modify the learning rate scheduler and optimizer in `models.EncoderDecoder`; use different embedding size and hidden size, etc.

### Multilingual NMT

Train your own multilingual NMT model.

Tips:
- Create a multilingual dictionary by concatenating the tokenized data in all languages. Or simply re-use the dictionary of the pre-trained model (`multi_dict`).
- Use the same dictionary for the source and target sides, and share the embeddings between your encoder and decoder (do: `decoder.embedding = encoder.embedding`).
- Use `data.MultiBatchIterator(iterator_list)` to concatenate a list of training iterators (one for each language pair) into a single iterator, which is compatible with `train_model`.
- `train_model` can take a list of several validation iterators, which will let you validate your model on several language pairs.
- Improve your model's performance on **de-fr** and **fr-de** by including training data for these language pairs (`data/train.de-fr.de` and `data/train.de-fr.fr`).

### Train bilingual or multilingual models on other language pairs

- Modify and re-run `scripts/download-data.sh` to download data in new languages, preprocess this data and train BPE models

In [None]:
def plot_loss(model):
    """Plot the per-epoch training loss, validation loss and validation chrF stored in `model.metrics`.

    `model.metrics` is read as a mapping from epoch to a dict of recorded values. For each epoch,
    the validation loss is the mean of the values whose key contains 'loss' (except 'train_loss'),
    and the chrF is the mean of the values whose key contains 'chrf'.
    The losses share the left axis; chrF is drawn against a second axis on the right.
    """
    from statistics import mean
    from matplotlib import pyplot as plt

    history = model.metrics
    epochs = sorted(history.keys())

    def average_where(epoch, keep):
        # Mean of the values recorded at this epoch whose key is accepted by `keep`
        return mean(value for key, value in history[epoch].items() if keep(key))

    train_loss = [history[epoch]['train_loss'] for epoch in epochs]
    valid_loss = [
        average_where(epoch, lambda key: 'loss' in key and key != 'train_loss')
        for epoch in epochs
    ]
    chrf = [average_where(epoch, lambda key: 'chrf' in key) for epoch in epochs]

    _, loss_axis = plt.subplots()
    chrf_axis = loss_axis.twinx()

    loss_axis.plot(epochs, train_loss, linestyle='solid', label='Train loss')
    loss_axis.plot(epochs, valid_loss, linestyle='dashdot', label='Valid loss')
    chrf_axis.plot(epochs, chrf, 'g--', label='Valid chrF')

    loss_axis.set_xlabel('Epochs')
    loss_axis.set_ylabel('Loss')
    chrf_axis.set_ylabel('chrF')

    loss_axis.legend(loc='upper left')
    chrf_axis.legend(loc='upper right')

In [None]:
plot_loss(multi_model)

In [None]:
# Load DE-FR training data
src, tgt = 'de', 'fr'

train_dataset = load_data(src, tgt, 'train', max_size=None)   # set max_size to 10000 for fast debugging
valid_dataset = load_data(src, tgt, 'valid')

data.binarize(train_dataset, source_dict=multi_dict, target_dict=multi_dict, sort=True)
data.binarize(valid_dataset, source_dict=multi_dict, target_dict=multi_dict, sort=False)

train_iterator = data.BatchIterator(train_dataset, src, tgt, batch_size=512, max_len=30, shuffle=True)
valid_iterator = data.BatchIterator(valid_dataset, src, tgt, batch_size=512, max_len=30, shuffle=False)

In [None]:
# Finetune the entire model on DE-FR
new_checkpoint_path = os.path.join(model_root, 'de-en-fr', 'finetuned-transformer.pt')
train_model(train_iterator, [valid_iterator], multi_model, new_checkpoint_path, epochs=1)

In [None]:
# Now evaluate on FR-EN and DE-FR test sets. We see a decrease in FR-EN performance (catastrophic forgetting)
chrf = evaluate_model(multi_model, test_sets['fr-en'], test_sets['de-fr'])

In [None]:
from models import AdapterTransformerDecoder, AdapterTransformerEncoderLayer

class AdapterLayer(nn.Module):
    """
    Residual bottleneck layer: layer norm -> down-projection -> ReLU -> up-projection,
    whose output is added to the layer's input.

    This class definition is just for show. Adapter layers are actually defined in models.py

    input_dim: size of the last dimension of the input (and of the output)
    projection_dim: size of the bottleneck between the two projections
    """
    def __init__(self, input_dim, projection_dim):
        super().__init__()
        self.down = nn.Linear(input_dim, projection_dim)
        self.up = nn.Linear(projection_dim, input_dim)
        self.layer_norm = nn.LayerNorm(input_dim)
        # Tiny weights and zero biases: a freshly created adapter returns (almost exactly) its input
        nn.init.uniform_(self.down.weight, -1e-6, 1e-6)
        nn.init.uniform_(self.up.weight, -1e-6, 1e-6)
        nn.init.zeros_(self.down.bias)
        nn.init.zeros_(self.up.bias)

    def forward(self, x):
        """Return `x` plus the output of the bottleneck applied to the normalized `x`."""
        y = self.layer_norm(x)
        y = self.down(y)
        # `F` (torch.nn.functional) is never imported in this notebook: go through `nn` instead
        y = nn.functional.relu(y)
        y = self.up(y)
        return x + y

class AdapterTransformerEncoder(models.TransformerEncoder):
    """
    Transformer encoder built from AdapterTransformerEncoderLayer layers, in which every
    parameter is frozen except those whose name contains '.adapters.'.

    adapter_ids: passed to each layer (a list such as ['de-fr'] in this notebook)
    projection_dim: passed to each layer (bottleneck dimension of the adapters)
    The remaining arguments are forwarded to models.TransformerEncoder.
    """
    def __init__(self, adapter_ids, projection_dim, *args, **kwargs):
        # Set before calling the parent constructor: `build_layer` reads these attributes
        # (presumably it is called by models.TransformerEncoder.__init__ -- TODO confirm)
        self.adapter_ids = adapter_ids
        self.projection_dim = projection_dim
        super().__init__(*args, **kwargs)
        # Freeze everything but the adapters
        for name, param in self.named_parameters():
            if '.adapters.' not in name:
                param.requires_grad = False

    def select_adapter(self, id):
        """Set `adapter_id` to `id` on every layer (this notebook passes None to deactivate the adapters)."""
        for layer in self.layers:
            layer.adapter_id = id

    # This method can be modified to add adapters only at some layers (e.g., first encoder layer)
    # Use models.TransformerEncoderLayer instead for standard Transformer layers
    def build_layer(self, layer_id):
        """Create an encoder layer with adapters (`layer_id` is not used here)."""
        return AdapterTransformerEncoderLayer(
            self.adapter_ids,
            self.projection_dim,
            self.hidden_size,
            self.heads,
            self.dropout
        )


In [None]:
encoder = AdapterTransformerEncoder(
    adapter_ids=['de-fr'],   # you can create adapters for more than one language pair
    projection_dim=64,       # bottleneck dimension of the adapters
    input_size=len(multi_dict),
    hidden_size=512,
    num_layers=2,
    heads=4,
)
decoder = AdapterTransformerDecoder(
    adapter_ids=['de-fr'],
    projection_dim=64,
    output_size=len(multi_dict),
    hidden_size=512,
    num_layers=1,
    heads=4,
)
decoder.embedding = encoder.embedding
adapter_model = models.EncoderDecoder(encoder, decoder, lr=0.0005, use_cuda=not cpu, target_dict=multi_dict)

pretrained_checkpoint_path = os.path.join(multi_model_dir, 'transformer.pt')
adapter_model.load(pretrained_checkpoint_path, strict=False, reset_optimizer=True)

new_checkpoint_path = os.path.join(model_root, 'de-en-fr', 'adapter-transformer.pt')

# Show the number of trained parameters.
# All Transformer parameters are frozen except the adapter parameters.
total_params = 0
trained_params = 0
for name, param in adapter_model.named_parameters():
    total_params += param.numel()
    if param.requires_grad:
        trained_params += param.numel()
print(f'Total parameters: {total_params}, trained parameters: {trained_params}')

In [None]:
# Activate the DE-FR adapters and train them on the DE-FR data (the other parameters are frozen)
# Note that you can do encoder.select_adapter(None) to train only decoder adapters
encoder.select_adapter('de-fr')
decoder.select_adapter('de-fr')
train_model(train_iterator, [valid_iterator], adapter_model, new_checkpoint_path, epochs=1)

In [None]:
# Activate the DE-FR adapters to translate in the DE-FR direction
encoder.select_adapter('de-fr')
decoder.select_adapter('de-fr')
chrf = evaluate_model(adapter_model, test_sets['fr-en'], test_sets['de-fr'])

In [None]:
# Deactivate the adapters to use the initial model (e.g., to translate in the English-centric directions).
# With just 200k new parameters (1.4% of the initial model's size) we can adapt to the DE-FR direction,
# without hurting performance on the other language pairs.
encoder.select_adapter(None)
decoder.select_adapter(None)
chrf = evaluate_model(adapter_model, test_sets['fr-en'], test_sets['de-fr'])