### Packages to Install

In [None]:
!pip install transformers --quiet
!pip install datasets --quiet
!pip install sentencepiece --quiet
!pip install sacrebleu --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

### Imports

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import MBartForConditionalGeneration, MBartTokenizer, MBart50TokenizerFast
from datasets import load_dataset
from sacrebleu.metrics import BLEU, CHRF, TER
import pandas as pd
import nltk

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Model Pipeline

- get translated data from M2M-100, ChatGPT, GPT3/2, LibreTranslate
- add training data to mBART and train it
  - feed both english to swahili and sw to en?
- evaluate on FLORES swahili dataset


## Model training

In [None]:
# english to swahili tokenizer
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-50")
en_sw_tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-50", src_lang = 'en_XX', tgt_lang='sw_KE') 

base_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
trained_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")

### Checking model final layer weights before and after training

In [None]:
base_weights = base_model.model.encoder.layers[-1].final_layer_norm.weight
base_weights

Parameter containing:
tensor([0.5078, 0.5190, 0.5303,  ..., 0.4971, 0.5322, 0.4968],
       requires_grad=True)

In [None]:
trained_weights = trained_model.model.encoder.layers[-1].final_layer_norm.weight
trained_weights

Parameter containing:
tensor([0.5078, 0.5190, 0.5303,  ..., 0.4971, 0.5322, 0.4968],
       requires_grad=True)

In [None]:
src_text = "Soko letu pia huuza vyakula kama samaki viazi soda na kadhalika"
tgt_text =  "Our market also sell foods like fish potato soda and so on"

model_inputs = en_sw_tokenizer(src_text, return_tensors="pt")
with en_sw_tokenizer.as_target_tokenizer():
    labels = en_sw_tokenizer(tgt_text, return_tensors="pt").input_ids

trained_model(**model_inputs, labels=labels) # forward pass

Seq2SeqLMOutput(loss=tensor(13.1764, grad_fn=<NllLossBackward0>), logits=tensor([[[59.1219, -1.4567, 36.8504,  ...,  5.7102, -0.9631, 15.0751],
         [11.0984, -0.1581, 20.5313,  ...,  0.4556, -0.1175,  7.7549],
         [ 9.3955, -0.3197, 26.1438,  ...,  1.6232,  2.4265, 12.0634],
         ...,
         [31.4119, -0.5005, 45.3707,  ...,  3.0150,  2.1092, 20.0592],
         [34.7658, -0.6142, 46.8380,  ...,  1.5435,  4.9164, 20.6402],
         [38.9224, -0.5262, 49.4893,  ...,  2.4021,  5.2619, 22.6424]]],
       grad_fn=<AddBackward0>), past_key_values=None, decoder_hidden_states=None, decoder_attentions=None, cross_attentions=None, encoder_last_hidden_state=tensor([[[ 9.8109e-03,  2.1326e-03, -2.3268e-02,  ..., -5.6636e-02,
          -1.0402e-02,  7.6869e-03],
         [-1.0054e+00,  4.1868e-01, -1.9250e+00,  ..., -5.0504e-01,
           4.5150e-01,  3.6374e-01],
         [-6.1976e-01, -3.2688e-01, -1.2343e+00,  ...,  7.7926e-01,
          -8.9562e-01,  5.3846e-01],
         ...,


In [None]:
def train_model(src_texts, tgt_texts, model, tokenizer):

  # maybe do batch sizes of 8 to save training time?

  for i in range(len(src_texts)):


    model_inputs = tokenizer(src_texts[i], return_tensors="pt")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(tgt_texts[i], return_tensors="pt").input_ids

    trained_model(**model_inputs, labels=labels) # forward pass
  
  return trained_model

In [None]:
# train model on new data

# training on 11k sentence pairs took 
data_path = '/content/drive/MyDrive/swahili_to_english - swahili_sentences.csv'
data = pd.read_csv(data_path)
src_texts = data['english'].values[:100]
tgt_texts = data['swahili'].values[:100]

trained_model = train_model(src_texts, tgt_texts, trained_model, en_sw_tokenizer)
trained_weights = trained_model.model.encoder.layers[-1].final_layer_norm.weight
trained_weights


Parameter containing:
tensor([0.5078, 0.5190, 0.5303,  ..., 0.4971, 0.5322, 0.4968],
       requires_grad=True)

In [None]:
base_weights = base_model.model.encoder.layers[-1].final_layer_norm.weight
base_weights

Parameter containing:
tensor([0.5078, 0.5190, 0.5303,  ..., 0.4971, 0.5322, 0.4968],
       requires_grad=True)

In [None]:
trained_model.save_pretrained('/content/drive/MyDrive/saved_swahili_model')

## Model evaluation

In [None]:
sw_test_flores = load_dataset("facebook/flores", 'swh_Latn')
en_test_flores = load_dataset("facebook/flores", 'eng_Latn')

Downloading builder script:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/11.7k [00:00<?, ?B/s]

Downloading and preparing dataset flores/swh_Latn to /root/.cache/huggingface/datasets/facebook___flores/swh_Latn/1.0.0/2a1174c8c4991ca09a9cb5b9a367cb2e049b073852cb4097456164d4612391ef...


Downloading data:   0%|          | 0.00/25.6M [00:00<?, ?B/s]

Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Dataset flores downloaded and prepared to /root/.cache/huggingface/datasets/facebook___flores/swh_Latn/1.0.0/2a1174c8c4991ca09a9cb5b9a367cb2e049b073852cb4097456164d4612391ef. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading and preparing dataset flores/eng_Latn to /root/.cache/huggingface/datasets/facebook___flores/eng_Latn/1.0.0/2a1174c8c4991ca09a9cb5b9a367cb2e049b073852cb4097456164d4612391ef...


Generating dev split: 0 examples [00:00, ? examples/s]

Generating devtest split: 0 examples [00:00, ? examples/s]

Dataset flores downloaded and prepared to /root/.cache/huggingface/datasets/facebook___flores/eng_Latn/1.0.0/2a1174c8c4991ca09a9cb5b9a367cb2e049b073852cb4097456164d4612391ef. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# sw_test_flores['dev'][0:5]['sentence']

In [None]:
def evaluate_model(src_texts, tgt_texts, model, tokenizer):
  # use model to predict based on flores english dataset and output swahili
  # compare predicted against labeled flores swahili dataset
  
  bleu = BLEU()
  predicted = []
  nltk_bleu_scores = []
  sacre_bleu_socres = []
  for i in range(len(src_texts)):
    print('source text: ', src_texts[i])
    tokenizer.src_lang = 'en_XX'
    inputs = tokenizer(src_texts[i], return_tensors="pt")
    translated_tokens = model.generate(**inputs, forced_bos_token_id=tokenizer.lang_code_to_id["sw_KE"])
    output = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    print('translated text: ', output)
    predicted.append(output)

    # BLEU scores
    # sacre_bleu = bleu.corpus_score(output.split(), [tgt_texts[i].split()])
    # sacre_bleu_socres.append(sacre_bleu)
    # print(output.split(), [tgt_texts[i].split()])
    # print(sacre_bleu)

    BLEUscore = nltk.translate.bleu_score.sentence_bleu([tgt_texts[i].split()], output.split())
    nltk_bleu_scores.append(BLEUscore)
    print([tgt_texts[i].split()], output.split())
    print(BLEUscore)
    
  # returns BLEU scores
  return nltk_bleu_scores

In [None]:
# evaluate base model on test data

src_texts = en_test_flores['dev'][:1]['sentence']
tgt_texts = sw_test_flores['dev'][:1]['sentence']

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
nltk_bleu_scores = evaluate_model(src_texts, tgt_texts, model, tokenizer)
nltk_bleu_scores

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

source text:  On Monday, scientists from the Stanford University School of Medicine announced the invention of a new diagnostic tool that can sort cells by type: a tiny printable chip that can be manufactured using standard inkjet printers for possibly about one U.S. cent each.




translated text:  tori from the Stanford University School of Medicine announced the invention of a new diagnostic tool that can sort cells by type: a tiny printable chip that can be manufactured using standard inkjet printers for possibly for about one U.S.
[['Mnamo', 'Jumatatu,', 'wanasayansi', 'kutoka', 'Shule', 'ya', 'Tiba', 'ya', 'Chuo', 'Kikuu', 'cha', 'Stanford', 'walitangaza', 'uvumbuzi', 'wa', 'kifaa', 'kipya', 'cha', 'utambuzi', 'ambacho', 'kinaweza', 'kupanga', 'seli', 'kwa', 'aina:', 'kidude', 'kidogo', 'kinachoweza', 'kuchapwa,', 'na', 'ambacho', 'kinaweza', 'kutengenezwa', 'kwa', 'kutumia', 'printa', 'ya', 'kawaida', 'ya', 'kupuliza', 'rangi,', 'yawezekana', 'kwa', 'takribani', 'senti', 'moja', 'ya', 'Marekani', 'kwa', 'kila', 'moja.']] ['tori', 'from', 'the', 'Stanford', 'University', 'School', 'of', 'Medicine', 'announced', 'the', 'invention', 'of', 'a', 'new', 'diagnostic', 'tool', 'that', 'can', 'sort', 'cells', 'by', 'type:', 'a', 'tiny', 'printable', 'chip', 'that',

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


[5.502529039374988e-232]

In [None]:
tokenizer

MBart50TokenizerFast(name_or_path='facebook/mbart-large-50', vocab_size=250054, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>', 'additional_special_tokens': ['ar_AR', 'cs_CZ', 'de_DE', 'en_XX', 'es_XX', 'et_EE', 'fi_FI', 'fr_XX', 'gu_IN', 'hi_IN', 'it_IT', 'ja_XX', 'kk_KZ', 'ko_KR', 'lt_LT', 'lv_LV', 'my_MM', 'ne_NP', 'nl_XX', 'ro_RO', 'ru_RU', 'si_LK', 'tr_TR', 'vi_VN', 'zh_CN', 'af_ZA', 'az_AZ', 'bn_IN', 'fa_IR', 'he_IL', 'hr_HR', 'id_ID', 'ka_GE', 'km_KH', 'mk_MK', 'ml_IN', 'mn_MN', 'mr_IN', 'pl_PL', 'ps_AF', 'pt_XX', 'sv_SE', 'sw_KE', 'ta_IN', 'te_IN', 'th_TH', 'tl_XX', 'uk_UA', 'ur_PK', 'xh_ZA', 'gl_ES', 'sl_SI']})

In [None]:
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX")
article = "UN Chief Says There Is No Military Solution in Syria"
inputs = tokenizer(article, return_tensors="pt")
translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]

In [None]:
article_hi = "संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है"
article_ar = "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا."

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# translate Hindi to French
tokenizer.src_lang = "hi_IN"
encoded_hi = tokenizer(article_hi, return_tensors="pt")
generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"])
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "Le chef de l 'ONU affirme qu 'il n 'y a pas de solution militaire en Syria."

# translate Arabic to English
tokenizer.src_lang = "ar_AR"
encoded_ar = tokenizer(article_ar, return_tensors="pt")
generated_tokens = model.generate(**encoded_ar, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "The Secretary-General of the United Nations says there is no military solution in Syria."

# Loading in M2M-100 https://huggingface.co/facebook/m2m100_418M

In [None]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

hi_text = "जीवन एक चॉकलेट बॉक्स की तरह है।"
chinese_text = "生活就像一盒巧克力。"

model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

# translate Hindi to French
tokenizer.src_lang = "hi"
encoded_hi = tokenizer(hi_text, return_tensors="pt")
generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.get_lang_id("fr"))
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "La vie est comme une boîte de chocolat."

# translate Chinese to English
tokenizer.src_lang = "zh"
encoded_zh = tokenizer(chinese_text, return_tensors="pt")
generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en"))
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "Life is like a box of chocolate."


Downloading (…)lve/main/config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

Downloading (…)ncepiece.bpe.model";:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]



['Life is like a box of chocolate.']

## CC Matrix

In [None]:
cc_matrix = load_dataset("yhavinga/ccmatrix", 'en-zh') # after 17 minutes I stopped it

In [None]:
load in a language pair that has low BLEU scores, doesn't have to involve English
Armenian <-> Romanian
cc_matrix_ar_ro = load_dataset("yhavinga/ccmatrix", 'ar-ro') # took 7 minutes to load in 5M rows

In [None]:
cc_aligned = load_dataset("ccaligned_multilingual", 'en-zh')

# Baseline Model on English <-> Chinese

In [None]:
article_en = "I wonder how good is the quality of this machine translation model"
article_zh_ref = '我想知道这个翻译模型的质量有多好'

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# translate English to Chinese
tokenizer.src_lang = "en_XX"
encoded_hi = tokenizer(article_en, return_tensors="pt")
generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["zh_CN"])
decoded_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
decoded_text

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]



['我想知道这台机器翻译模型的质量有多好']

In [None]:
article_zh = "今天的天气真好，我希望这个周末不会下雨"
article_en_ref = "Today's weather is great, I hope it doesn't rain this weekend."

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# translate English to Chinese
tokenizer.src_lang = "zh_CN"
encoded_hi = tokenizer(article_zh, return_tensors="pt")
generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
english_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
english_text

["It's really nice today, and I hope it won't rain this weekend"]

In [None]:
bleu = BLEU()
bleu.corpus_score(english_text[0].split(), [article_en_ref.split()])

BLEU = 0.00 8.3/50.0/0.0/0.0 (BP = 0.920 ratio = 0.923 hyp_len = 12 ref_len = 13)

In [None]:
# BLEU doesn't work on chinese?
bleu = BLEU()
refs = [article_zh_ref.split()]
bleu.corpus_score(decoded_text[0].split(), refs)

# bleu.corpus_score(decoded_text[0].split(), [article_zh_ref.split()])

BLEU = 0.00 0.0/0.0/0.0/0.0 (BP = 1.000 ratio = 1.000 hyp_len = 1 ref_len = 1)


# Baseline Model on English <-> Russian

In [None]:
article_ru = "Сегодня отличная погода, я надеюсь, в выходные не будет дождя."
article_en_ref = "Today's weather is great, I hope it doesn't rain this weekend."

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# translate English to Russian
tokenizer.src_lang = "en_XX"
encoded_hi = tokenizer(article_en_ref, return_tensors="pt")
generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["ru_RU"])
decoded_russian = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
decoded_russian

['Сегодняшняя погода очень хороша, я надеюсь, что не будет дождя на выходные.']

In [None]:
# translate Russian to English
tokenizer.src_lang = "ru_RU"
encoded_hi = tokenizer(article_ru, return_tensors="pt")
generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
decoded_english = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
decoded_english

['Today is excellent weather, I hope there will be no rain on weekends.']

In [None]:
bleu = BLEU()
bleu.corpus_score(decoded_english[0].split(), [article_en_ref.split()])

BLEU = 0.00 25.0/50.0/0.0/0.0 (BP = 0.920 ratio = 0.923 hyp_len = 12 ref_len = 13)

In [None]:
bleu = BLEU()
bleu.corpus_score(decoded_russian[0].split(), [article_ru.split()])

BLEU = 0.00 25.0/25.0/0.0/0.0 (BP = 0.920 ratio = 0.923 hyp_len = 12 ref_len = 13)

# Baseline Model on Swahili <-> English

In [None]:
article_sw = "Hali ya hewa ya leo ni nzuri, natumai mvua haitanyesha wikendi hii."
article_en_ref = "Today's weather is great, I hope it doesn't rain this weekend."

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

# translate Swahili to English
tokenizer.src_lang = "sw_KE"
encoded_hi = tokenizer(article_sw, return_tensors="pt")
generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
decoded_english = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
decoded_english

In [None]:
bleu = BLEU()
bleu.corpus_score(decoded_english[0].split(), [article_en_ref.split()])