# Training an English-to-Arabic (EN_TO_AR) Neural Machine Translation Model Using T5-small

In [None]:
!pip install transformers datasets evaluate

In [61]:
!pip install sacrebleu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 KB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Collecting portalocker
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.7.0 sacrebleu-2.3.1


In [19]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and checkpoint paths below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
# Project root on the mounted Drive; later cells build data and checkpoint paths from it.
base = "/content/drive/MyDrive/NLP-MT/"

In [36]:
import pandas as pd
import numpy as np 
from torch import nn
import torch

In [32]:
import os

# Raw tab-separated corpus and the CSV copy that the dataset loader reads.
path = base + 'ara_eng.txt'
path_csv = base + 'ara_eng.csv'

# One-time conversion, guarded so the notebook runs from a fresh kernel even
# when the CSV has not been generated yet (and is a no-op once it exists).
if not os.path.exists(path_csv):
    df = pd.read_csv(path, delimiter="\t", names=["eng", "ar"])
    df.to_csv(path_csv, sep=',', index=False)

In [39]:
from datasets import load_dataset

# Read the sentence-pair CSV through the generic "csv" dataset builder.
dataset = load_dataset(path="csv", data_files=path_csv)



  0%|          | 0/1 [00:00<?, ?it/s]

In [40]:
# Show the loaded DatasetDict (splits, column names, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['eng', 'ar'],
        num_rows: 24638
    })
})

In [41]:
# 80/20 train/test split with a fixed seed so the split is reproducible.
# The guard keeps the cell idempotent: without it, re-running would split the
# already-reduced "train" subset again and silently shrink the training data.
if "test" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['eng', 'ar'],
        num_rows: 19710
    })
    test: Dataset({
        features: ['eng', 'ar'],
        num_rows: 4928
    })
})

In [47]:
# Peek at the English side of the first training example.
dataset["train"][0]['eng']

'mexico was a good year global voices.'

In [42]:
from transformers import AutoTokenizer

# Tokenizer matching the t5-small checkpoint that is fine-tuned below.
# NOTE(review): the recorded training run reports BLEU 0.0. Presumably the
# t5-small SentencePiece vocabulary does not cover Arabic script, so targets
# may collapse to unknown tokens -- confirm, and consider a multilingual
# checkpoint (the model load must then be switched to the same checkpoint).
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="t5-small")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [56]:
# Dataset column names for each side of the pair, and the task prefix that is
# prepended to every English source sentence before tokenization.
source_lang, target_lang = "eng", "ar"
prefix = "translate English to Arabic: "


def preprocess_function(examples, max_length=128):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        examples: A batch mapping column name -> list of values; the
            ``source_lang`` and ``target_lang`` columns are read.
        max_length: Truncation length passed to the tokenizer for both the
            inputs and the targets (defaults to the original value, 128).

    Returns:
        The tokenizer output for the prefixed source sentences, with the
        tokenized targets supplied through ``text_target``.
    """
    # Iterate over the column directly instead of indexing by position.
    inputs = [prefix + sentence for sentence in examples[source_lang]]
    targets = list(examples[target_lang])
    return tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True)

# Tokenize every split batch-wise; the original text columns are kept alongside the new fields.
tokenized_data = dataset.map(function=preprocess_function, batched=True)


Map:   0%|          | 0/19710 [00:00<?, ? examples/s]

Map:   0%|          | 0/4928 [00:00<?, ? examples/s]

In [58]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Pretrained encoder-decoder weights to fine-tune; must match the tokenizer's checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path="t5-small")


Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [59]:
from transformers import DataCollatorForSeq2Seq

# Batch collator for seq2seq training, built from the model and its tokenizer.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

In [62]:
import evaluate

# SacreBLEU metric object used by compute_metrics during evaluation.
sacrebleu = evaluate.load(path="sacrebleu")

In [65]:
# computing sacrebleu score

import numpy as np


def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded texts.

    Returns a ``(predictions, references)`` tuple where each reference is
    wrapped in its own single-element list.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with SacreBLEU.

    Args:
        eval_preds: A ``(predictions, labels)`` pair. If ``predictions`` is a
            tuple, only its first element is used.

    Returns:
        A dict with ``"bleu"`` (the SacreBLEU ``score``) and ``"gen_len"``
        (mean number of non-pad tokens per prediction), rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is an ignore sentinel, not a vocabulary id, so it cannot be decoded.
    # Labels were already masked this way; predictions can carry the same
    # sentinel when batches are padded together, which would break decoding
    # and also count the padding towards gen_len below.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
import os
# Fine-tuning configuration; checkpoints are written under <base>/checkpoints.
training_args = Seq2SeqTrainingArguments(
    output_dir=os.path.join(base, "checkpoints"),
    # optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    fp16=True,
    # evaluation: once per epoch, scoring generated sequences
    evaluation_strategy="epoch",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    # checkpoint retention
    save_total_limit=3,
)

# Wire the model, tokenized splits, collator and BLEU metric into the trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    compute_metrics=compute_metrics,
)

trainer.train()

# Training only writes checkpoint-<step> subfolders. Save the final model (and
# tokenizer) to the root of output_dir so it can be loaded directly from
# base + "checkpoints", which is the path the inference cell uses.
trainer.save_model()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: ar, eng. If ar, eng are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 19710
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2464
  Number of trainable parameters = 60506624
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is f

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.1043,0.086726,0.0,14.461


Saving model checkpoint to /content/drive/MyDrive/NLP-MT/checkpoints/checkpoint-500
Configuration saved in /content/drive/MyDrive/NLP-MT/checkpoints/checkpoint-500/config.json
Configuration saved in /content/drive/MyDrive/NLP-MT/checkpoints/checkpoint-500/generation_config.json
Model weights saved in /content/drive/MyDrive/NLP-MT/checkpoints/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/NLP-MT/checkpoints/checkpoint-500/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/NLP-MT/checkpoints/checkpoint-500/special_tokens_map.json
Saving model checkpoint to /content/drive/MyDrive/NLP-MT/checkpoints/checkpoint-1000
Configuration saved in /content/drive/MyDrive/NLP-MT/checkpoints/checkpoint-1000/config.json
Configuration saved in /content/drive/MyDrive/NLP-MT/checkpoints/checkpoint-1000/generation_config.json
Model weights saved in /content/drive/MyDrive/NLP-MT/checkpoints/checkpoint-1000/pytorch_model.bin
tokenizer config file 

## Inference

In [None]:
from transformers import pipeline

# The prompt carries the same task prefix that was prepended to the training inputs.
text = "translate English to Arabic: I don't think this is good."
# Loads from the root of the training output directory, so a model must have
# been saved there (intermediate checkpoints live in checkpoint-<step> subfolders).
translator = pipeline("translation", model=base+"checkpoints")
translator(text)