# **Neural Machine Translation using FairSeq**

The Transformer, introduced in the paper [Attention Is All You Need](https://arxiv.org/abs/1706.03762), is a powerful sequence-to-sequence modeling architecture capable of producing state-of-the-art neural machine translation (NMT) systems.

In [None]:
!pip install fairseq

Collecting fairseq
  Downloading fairseq-0.12.2.tar.gz (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hydra-core<1.1,>=1.0.7 (from fairseq)
  Downloading hydra_core-1.0.7-py3-none-any.whl (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.8/123.8 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting omegaconf<2.1 (from fairseq)
  Downloading omegaconf-2.0.6-py3-none-any.whl (36 kB)
Collecting sacrebleu>=1.4.12 (from fairseq)
  Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
Collecting bitarray (from fairse

In [None]:
%%bash
pip install bitarray fastBPE hydra-core omegaconf regex requests sacremoses subword_nmt

Collecting fastBPE
  Downloading fastBPE-0.1.0.tar.gz (35 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 897.5/897.5 kB 14.0 MB/s eta 0:00:00
Collecting subword_nmt
  Downloading subword_nmt-0.3.8-py3-none-any.whl (27 kB)
Collecting mock (from subword_nmt)
  Downloading mock-5.1.0-py3-none-any.whl (30 kB)
Building wheels for collected packages: fastBPE
  Building wheel for fastBPE (setup.py): started
  Building wheel for fastBPE (setup.py): finished with status 'done'
  Created wheel for fastBPE: filename=fastBPE-0.1.0-cp310-cp310-linux_x86_64.whl size=806587 sha256=d50d83084b184481ddad5402d890a5848680c3d4faf2b6219de33a8474db6b5b
  Stored in directory: /root/.cache/pip/wheels/13/5d/b9/4b8897941ebc9e8c6cc3f3ffd3ea5115731754269205098754
Successfully built fastBPE
Installing collected packages: fastBPE, sacr

To translate from English to French using the model from the paper [Scaling Neural Machine Translation](https://arxiv.org/abs/1806.00187)

In [None]:
import torch

# Load an En-Fr Transformer model trained on WMT'14 data :
en2fr = torch.hub.load('pytorch/fairseq', 'transformer.wmt14.en-fr', tokenizer='moses', bpe='subword_nmt')

# Use the GPU (optional):
en2fr.cuda()

Downloading: "https://github.com/pytorch/fairseq/zipball/main" to /root/.cache/torch/hub/main.zip
  dictionaries = [ (Dictionary.load(f"{label_dir}/dict.{label}.txt") if label is not "" else None ) for label in self.cfg.labels]
INFO:root:running build_ext
INFO:root:building 'fairseq.libbleu' extension
INFO:root:creating build
INFO:root:creating build/temp.linux-x86_64-cpython-310
INFO:root:creating build/temp.linux-x86_64-cpython-310/fairseq
INFO:root:creating build/temp.linux-x86_64-cpython-310/fairseq/clib
INFO:root:creating build/temp.linux-x86_64-cpython-310/fairseq/clib/libbleu
INFO:root:x86_64-linux-gnu-gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/usr/include/python3.10 -c fairseq/clib/libbleu/libbleu.cpp -o build/temp.linux-x86_64-cpython-310/fairseq/clib/libbleu/libbleu.o -std=c++11 -O3 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"

Compiling fairseq/data/data_utils_fast.pyx because it changed.
[1/1] Cythonizing fairseq/data/data_utils_fast.pyx


INFO:root:building 'fairseq.data.data_utils_fast' extension
INFO:root:creating build/temp.linux-x86_64-cpython-310/fairseq/data
INFO:root:x86_64-linux-gnu-gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/usr/local/lib/python3.10/dist-packages/numpy/core/include -I/usr/local/lib/python3.10/dist-packages/numpy/core/include -I/usr/include/python3.10 -c fairseq/data/data_utils_fast.cpp -o build/temp.linux-x86_64-cpython-310/fairseq/data/data_utils_fast.o -std=c++11 -O3 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -DTORCH_EXTENSION_NAME=data_utils_fast -D_GLIBCXX_USE_CXX11_ABI=0
INFO:root:creating build/lib.linux-x86_64-cpython-310/fairseq/data
INFO:root:x86_64-linux-gnu-g++ -shared -Wl,-O1 -Wl,-Bsymbolic-functions -Wl,-Bsymbolic-functions -g -fwrapv -O2 build/temp.linux-x86_64-cpython-310/fairseq/d

Compiling fairseq/data/token_block_utils_fast.pyx because it changed.
[1/1] Cythonizing fairseq/data/token_block_utils_fast.pyx


INFO:root:building 'fairseq.data.token_block_utils_fast' extension
INFO:root:x86_64-linux-gnu-gcc -Wno-unused-result -Wsign-compare -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -g -fwrapv -O2 -fPIC -I/usr/local/lib/python3.10/dist-packages/numpy/core/include -I/usr/local/lib/python3.10/dist-packages/numpy/core/include -I/usr/include/python3.10 -c fairseq/data/token_block_utils_fast.cpp -o build/temp.linux-x86_64-cpython-310/fairseq/data/token_block_utils_fast.o -std=c++11 -O3 -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -DTORCH_EXTENSION_NAME=token_block_utils_fast -D_GLIBCXX_USE_CXX11_ABI=0
INFO:root:x86_64-linux-gnu-g++ -shared -Wl,-O1 -Wl,-Bsymbolic-functions -Wl,-Bsymbolic-functions -g -fwrapv -O2 build/temp.linux-x86_64-cpython-310/fairseq/data/token_block_utils_fast.o -L/usr/lib/x86_64-linux-gnu -o build/lib.linux-x86_64-cpython-310/fairseq/data

GeneratorHubInterface(
  (models): ModuleList(
    (0): TransformerModel(
      (encoder): TransformerEncoderBase(
        (dropout_module): FairseqDropout()
        (embed_tokens): Embedding(44512, 1024, padding_idx=1)
        (embed_positions): SinusoidalPositionalEmbedding()
        (layers): ModuleList(
          (0-5): 6 x TransformerEncoderLayerBase(
            (self_attn): MultiheadAttention(
              (dropout_module): FairseqDropout()
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (dropout_module): FairseqDropout()
            (activation_dropout_module): FairseqDropout()
            (fc1): 

In [None]:
# Translate with beam search:
fr = en2fr.translate('My name is Mehwish!', beam=5)

# Manually tokenize:
en_toks = en2fr.tokenize('My name is Mehwish!')

# Manually apply BPE:
en_bpe = en2fr.apply_bpe(en_toks)

# Manually binarize:
en_bin = en2fr.binarize(en_bpe)

# Generate five translations with top-k sampling:
fr_bin = en2fr.generate(en_bin, beam=5, sampling=True, sampling_topk=20)

# Convert one of the samples to a string and detokenize
fr_sample = fr_bin[0]['tokens']
fr_bpe = en2fr.string(fr_sample)
fr_toks = en2fr.remove_bpe(fr_bpe)
fr = en2fr.detokenize(fr_toks)
print(fr)


INFO:fairseq.tasks.fairseq_task:can_reuse_epoch_itr = False
INFO:fairseq.tasks.fairseq_task:reuse_dataloader = True
INFO:fairseq.tasks.fairseq_task:rebuild_batches = False
INFO:fairseq.tasks.fairseq_task:creating new batches for epoch 1
INFO:fairseq.tasks.fairseq_task:can_reuse_epoch_itr = False
INFO:fairseq.tasks.fairseq_task:reuse_dataloader = True
INFO:fairseq.tasks.fairseq_task:rebuild_batches = False
INFO:fairseq.tasks.fairseq_task:creating new batches for epoch 1


Je m'appelle Mehwish !


## Exercises

In [None]:
# Your turn
#Task 1: Load the dataset
!wget https://www.statmt.org/europarl/v7/fr-en.tgz
!tar -xzf fr-en.tgz

def extract_sentences(file_path, num_sentences):
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for _ in range(num_sentences):
            sentence = file.readline()
            if not sentence:
                break
            sentences.append(sentence.strip())
    return sentences

english_file = 'europarl-v7.fr-en.en'
french_file = 'europarl-v7.fr-en.fr'

num_sentences = 15000
eng_sentences = extract_sentences(english_file, num_sentences)
fre_sentences = extract_sentences(french_file, num_sentences)

assert len(eng_sentences) == len(fre_sentences) == num_sentences

print("Anglais : ", eng_sentences[:3])
print("Français : ", fre_sentences[:3])

--2024-06-18 22:24:21--  https://www.statmt.org/europarl/v7/fr-en.tgz
Resolving www.statmt.org (www.statmt.org)... 129.215.32.28
Connecting to www.statmt.org (www.statmt.org)|129.215.32.28|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 202718517 (193M) [application/x-gzip]
Saving to: ‘fr-en.tgz’


2024-06-18 22:24:37 (12.5 MB/s) - ‘fr-en.tgz’ saved [202718517/202718517]

Anglais :  ['Resumption of the session', 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.', "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful."]
Français :  ['Reprise de la session', 'Je déclare reprise la session du Parlement européen qui avait été interrompue le vendredi 17 décembre dernier et 

In [8]:
#Task 2: Perform NMT - use this pre-trained model
from tqdm import tqdm

def translate_sentence(sentence, model):
    translation = model.translate(sentence)
    return translation

translations = []
for sentence in eng_sentences[:1000]:
  translation = translate_sentence(sentence, en2fr)
  translations.append(translation)


INFO:fairseq.tasks.fairseq_task:can_reuse_epoch_itr = False
INFO:fairseq.tasks.fairseq_task:reuse_dataloader = True
INFO:fairseq.tasks.fairseq_task:rebuild_batches = False
INFO:fairseq.tasks.fairseq_task:creating new batches for epoch 1
INFO:fairseq.tasks.fairseq_task:can_reuse_epoch_itr = False
INFO:fairseq.tasks.fairseq_task:reuse_dataloader = True
INFO:fairseq.tasks.fairseq_task:rebuild_batches = False
INFO:fairseq.tasks.fairseq_task:creating new batches for epoch 1
INFO:fairseq.tasks.fairseq_task:can_reuse_epoch_itr = False
INFO:fairseq.tasks.fairseq_task:reuse_dataloader = True
INFO:fairseq.tasks.fairseq_task:rebuild_batches = False
INFO:fairseq.tasks.fairseq_task:creating new batches for epoch 1
INFO:fairseq.tasks.fairseq_task:can_reuse_epoch_itr = False
INFO:fairseq.tasks.fairseq_task:reuse_dataloader = True
INFO:fairseq.tasks.fairseq_task:rebuild_batches = False
INFO:fairseq.tasks.fairseq_task:creating new batches for epoch 1
INFO:fairseq.tasks.fairseq_task:can_reuse_epoch_itr 

In [13]:
#Task 3: Define the evaluation metrics (BLEU) and evaluate
#You can copy the code you write from RNN_Machine_Translation.ipynb
import collections
import math
!pip install sacrebleu
import sacrebleu

def get_ngrams(segment, max_order):
  """Extracts all n-grams upto a given maximum order from an input segment.

  Args:
    segment: text segment from which n-grams will be extracted.
    max_order: maximum length in tokens of the n-grams returned by this
        methods.

  Returns:
    The Counter containing all n-grams upto max_order in segment
    with a count of how many times each n-gram occurred.
  """
  ngram_counts = collections.Counter()
  for order in range(1, max_order + 1):
    for i in range(0, len(segment) - order + 1):
      ngram = tuple(segment[i:i+order])
      ngram_counts[ngram] += 1
  return ngram_counts


def compute_bleu(reference_corpus, translation_corpus, max_order=4):
  """Computes BLEU score of translated segments against one or more references.

  Args:
    reference_corpus: list of lists of references for each translation. Each
        reference should be tokenized into a list of tokens.
    translation_corpus: list of translations to score. Each translation
        should be tokenized into a list of tokens.
    max_order: Maximum n-gram order to use when computing BLEU score.

  Returns:
    3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
    precisions and brevity penalty.
  """
  matches_by_order = [0] * max_order
  possible_matches_by_order = [0] * max_order
  reference_length = 0
  translation_length = 0
  for (references, translation) in zip(reference_corpus,
                                       translation_corpus):
    reference_length += min(len(r) for r in references)
    translation_length += len(translation)

    merged_ref_ngram_counts = collections.Counter()
    for reference in references:
      merged_ref_ngram_counts |= get_ngrams(reference, max_order)
    translation_ngram_counts = get_ngrams(translation, max_order)
    overlap = translation_ngram_counts & merged_ref_ngram_counts
    for ngram in overlap:
      matches_by_order[len(ngram)-1] += overlap[ngram]
    for order in range(1, max_order+1):
      possible_matches = len(translation) - order + 1
      if possible_matches > 0:
        possible_matches_by_order[order-1] += possible_matches

  precisions = [0] * max_order
  for i in range(0, max_order):
      if possible_matches_by_order[i] > 0:
        precisions[i] = (float(matches_by_order[i]) /
                         possible_matches_by_order[i])
      else:
        precisions[i] = 0.0

  if min(precisions) > 0:
    ## TO_DO: compute the geometric mean of all modified precision scores
      geo_mean = math.exp(sum(math.log(p) for p in precisions) / max_order)
  else:
      geo_mean = 0


  ## TO_DO: compute the brevity penalty (BP)
  ratio = translation_length / reference_length
  if ratio > 1.0:
      bp = 1.0
  else:
      bp = math.exp(1.0 - 1.0 / ratio)

  # final bleu score
  bleu = geo_mean * bp

  return (bleu, precisions, bp, ratio, translation_length, reference_length)

def compute_bleu_score(predictions, references, max_order=4):

    if isinstance(references[0], str):
        references = [[ref] for ref in references]

    # Calculer le score BLEU pour les prédictions
    bleu = sacrebleu.corpus_bleu(predictions, references)

    return {
        "bleu": bleu.score,
        "precisions": bleu.precisions,
        "brevity_penalty": bleu.bp,
        "length_ratio": bleu.sys_len / bleu.ref_len,
        "translation_length": bleu.sys_len,
        "reference_length": bleu.ref_len,
    }




In [14]:
compute_bleu_score(translations, fre_sentences[:1000], max_order=2)

{'bleu': 100.00000000000004,
 'precisions': [100.0, 100.0, 100.0, 100.0],
 'brevity_penalty': 1.0,
 'length_ratio': 1.0,
 'translation_length': 4,
 'reference_length': 4}