# Introduction

In [None]:
# Hub id of the pretrained checkpoint that the baseline pipeline, the
# tokenizer and the model to fine-tune are all loaded from.
model_checkpoint = "Helsinki-NLP/opus-mt-ar-en"

In [None]:
from transformers import pipeline

# Baseline translator built from the untouched pretrained checkpoint; the
# cells below use it to show translations before any fine-tuning.
translator = pipeline(task='translation', model=model_checkpoint)

In [None]:
translator('أنت شخصية توكسيك')

In [None]:
translator('أنا مبفكرش في عيالي؟')

In [None]:
translator('‫عشان خاطري.‬')

In [None]:
translator('يا علا!')

In [None]:
translator('طب وأنتي كويسه؟')

# Data Loading

In [None]:
from datasets import load_dataset

# Fetch the parallel corpus from the Hub; later cells read its "EGY" and
# "ENG" columns from the "train" split.
raw_datasets = load_dataset(path="HeshamHaroon/ArzEn-MultiGenre")

In [None]:
raw_datasets

# Data Preparation

In [None]:
# Keep 90% of the original "train" split for training and hold out the rest;
# the fixed seed makes the split reproducible.
split_datasets = raw_datasets['train'].train_test_split(train_size=0.9, seed=20)
split_datasets

In [None]:
# Rename the held-out "test" split to "validation", the key the trainers
# below use as their evaluation set.
split_datasets['validation'] = split_datasets.pop('test')
split_datasets

In [None]:
split_datasets['train'][1]

## Tokenizer Loading

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the checkpoint. `return_tensors` is an
# argument of the tokenizer *call*, not of loading, so it is not passed here;
# the tokenizer calls in this notebook return plain Python lists and leave
# tensor conversion to the data collator.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Tokenize one sentence pair to inspect what the tokenizer produces.
# Index the row first and the column second: the other order reads an entire
# column just to take its first element.
eg_sentence = split_datasets['train'][0]['EGY']
en_sentence = split_datasets['train'][0]['ENG']

# `text_target` makes the tokenizer encode the English side as the labels.
inputs = tokenizer(eg_sentence, text_target=en_sentence)
inputs

In [None]:
split_datasets["train"][0]

In [None]:
import re

# Invisible Unicode bidirectional-control characters (LRM/RLM, the
# embedding/override controls and the isolate controls). They carry no
# linguistic content, so they are deleted before tokenization.
_BIDI_CONTROLS = dict.fromkeys(
    map(ord, "\u200e\u200f\u202a\u202b\u202c\u202d\u202e\u2066\u2067\u2068\u2069")
)


def clean_text(text):
    """Normalize one sentence before tokenization.

    The value is converted to ``str``, invisible bidirectional-control
    characters are removed, every run of whitespace is collapsed to a single
    space, and leading/trailing whitespace is stripped.

    Args:
        text: The raw sentence; any object is accepted and passed through
            ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    text = str(text).translate(_BIDI_CONTROLS)
    # Strip last: deleting a control character at either end can expose
    # whitespace that an earlier strip would not have seen.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Upper bound, in tokens, applied to both the source and the target side.
max_length = 128


def preprocess_function(examples):
    """Clean and tokenize one batch of sentence pairs.

    Args:
        examples: A batch indexed by column name; the ``"EGY"`` entries are
            the source sentences and the ``"ENG"`` entries the targets.

    Returns:
        The tokenizer output for the cleaned sources, with the cleaned
        targets encoded through ``text_target``; both sides are truncated
        to ``max_length`` tokens.
    """
    sources = list(map(clean_text, examples["EGY"]))
    references = list(map(clean_text, examples["ENG"]))

    return tokenizer(
        sources,
        text_target=references,
        truncation=True,
        max_length=max_length,
    )


In [None]:
# Run the preprocessing over every split in batches, dropping the original
# columns so that only the fields returned by preprocess_function remain.
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)

In [None]:
tokenized_datasets

## Data Collation

##### `DataCollatorWithPadding` only pads the inputs, so we use `DataCollatorForSeq2Seq`, which also pads the labels. Labels are padded with -100 so that the padded positions are ignored in the loss computation.

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# The collator receives the model, so the model has to exist before this cell
# runs. The notebook otherwise first creates `model` in the fine-tuning
# section further down, which made this cell fail with a NameError when the
# notebook was executed top to bottom.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Pads inputs and labels per batch (see the note above on -100 label padding).
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Collate two training examples (indices 1 and 2) to inspect a padded batch.
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])

In [None]:
batch["labels"]

# **Fine-tuning the model - Trainer API**

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Pretrained sequence-to-sequence model that the final trainer fine-tunes.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
!pip install evaluate sacrebleu

In [None]:
import evaluate

# SacreBLEU metric; compute_metrics below reports its "score" field as BLEU.
metric = evaluate.load("sacrebleu")

In [None]:
# Toy example showing the input format the metric expects: a flat list of
# predicted strings and, for each prediction, a list of reference strings.
predictions = [
    "The cat is on the mat",
    "There is a cat sitting on the floor"
]

references = [
    ["The cat is on the mat"],  # each reference list can contain multiple valid translations
    ["A cat is sitting on the floor"]
]

# `res` is the metric's result; compute_metrics below reads its "score" entry.
res = metric.compute(predictions=predictions, references=references)
res

In [None]:
import numpy as np


def compute_metrics(eval_preds):
    """Score generated translations against the references with SacreBLEU.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple, in which case its first
            element is used.

    Returns:
        A dict with a single ``"bleu"`` entry holding the metric's score.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks padded / ignored positions and cannot be decoded. It can
    # show up in the predictions as well as in the labels, so replace it with
    # the pad token in both before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing; the metric takes a list of references per
    # prediction, hence the one-element lists.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

## Hyperparameter Search with Optuna

In [None]:
# !pip install optuna

In [None]:
def model_init():
    """Load and return a new model instance from ``model_checkpoint``.

    Passed to the search trainer as ``model_init`` so that a model can be
    created from the pretrained weights on demand instead of reusing one
    shared instance.
    """
    fresh_model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
    return fresh_model

In [None]:
import optuna

def hp_space_optuna(trial):
    """Sample one hyperparameter configuration from the search space.

    Args:
        trial: Object providing ``suggest_float``, ``suggest_int`` and
            ``suggest_categorical`` (an Optuna trial).

    Returns:
        A dict with the sampled ``learning_rate`` (log scale, 1e-5 to 5e-4),
        ``warmup_ratio`` (0.05 to 0.2), ``num_train_epochs`` (3 to 6) and
        ``lr_scheduler_type`` (linear, cosine or polynomial).
    """
    scheduler_choices = ["linear", "cosine", "polynomial"]

    space = {}
    space["learning_rate"] = trial.suggest_float(
        "learning_rate", 1e-5, 5e-4, log=True
    )
    space["warmup_ratio"] = trial.suggest_float("warmup_ratio", 0.05, 0.2)
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 3, 6)
    space["lr_scheduler_type"] = trial.suggest_categorical(
        "lr_scheduler_type", scheduler_choices
    )
    return space

In [None]:
from transformers import Seq2SeqTrainingArguments

# Arguments shared by every trial of the hyperparameter search. Learning rate,
# warmup ratio, epoch count and scheduler type are left unset here because
# hp_space_optuna proposes them.
training_args = Seq2SeqTrainingArguments(
    'marian-finetuned-ArzEn-MultiGenre-egy-to-en',  # output directory
    eval_strategy='epoch',
    save_strategy='no',  # no checkpoints are written during the search
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    predict_with_generate=True,  # compute_metrics decodes generated token ids
    fp16=True,
    report_to='none',
    load_best_model_at_end=False,
)

In [None]:
from transformers import Seq2SeqTrainer

# Trainer used only for the hyperparameter search: it takes `model_init`
# rather than a model instance, so the model comes from calling that factory.
trainer = Seq2SeqTrainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Search the space defined by hp_space_optuna. The objective is the
# "eval_bleu" entry of the evaluation metrics, i.e. the "bleu" value returned
# by compute_metrics.
best_run = trainer.hyperparameter_search(
    direction="maximize",  # we want highest BLEU
    hp_space=hp_space_optuna,
    compute_objective=lambda metrics: metrics["eval_bleu"],
    n_trials=10,  # try 10 different combinations
)
print("Best run:", best_run)

In [None]:
best_args = best_run.hyperparameters
best_args

In [None]:
from transformers import Seq2SeqTrainingArguments

# Arguments for the final fine-tuning run.
# NOTE(review): learning_rate, lr_scheduler_type, warmup_ratio and
# num_train_epochs are hard-coded; presumably they were copied from a previous
# search result. Confirm they still match `best_args` after re-running it.
final_args = Seq2SeqTrainingArguments(
    'marian-finetuned-ArzEn-MultiGenre-egy-to-en',  # output directory
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=0.00013501528621462957,
    lr_scheduler_type='polynomial',
    warmup_ratio=0.07805866417161107,
    num_train_epochs=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    predict_with_generate=True,  # compute_metrics decodes generated token ids
    fp16=True,
    report_to="none",
    # Keep the checkpoint with the highest "bleu" from compute_metrics.
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
)

In [None]:
from transformers import Seq2SeqTrainer

# Trainer for the final run: unlike the search trainer it receives the
# already-loaded `model` instance together with `final_args`.
final_trainer = Seq2SeqTrainer(
    model,
    args=final_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



## Before fine-tuning

In [None]:
# Baseline score on the validation split, taken before final_trainer.train().
final_trainer.evaluate(max_length=max_length)

## Train the Model

In [None]:
final_trainer.train()

## After fine-tuning

In [None]:
# Evaluate the model that was just fine-tuned. This must be `final_trainer`;
# `trainer` is the hyperparameter-search trainer and does not hold the
# fine-tuned model.
final_trainer.evaluate(max_length=max_length)

## Push the model to the Hugging Face Hub

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from the notebook before pushing the model.
notebook_login()

In [None]:
# NOTE(review): the commit message hard-codes a BLEU value; update it to the
# score actually obtained before pushing.
final_trainer.push_to_hub(commit_message="Improved BLEU to 23.0 with 10 trials by optuna")


# Try the model

In [None]:
from transformers import pipeline

# Rebuild the translator from the fine-tuned checkpoint on the Hub.
# NOTE: this rebinds the global `model_checkpoint`, which model_init and the
# earlier loading cells also read; re-running those cells afterwards would
# start from the fine-tuned weights instead of the original checkpoint.
model_checkpoint = 'NEldin10/marian-finetuned-ArzEn-MultiGenre-egy-to-en'
translator = pipeline("translation", model=model_checkpoint)

In [None]:
translator('أنت شخصية توكسيك')

In [None]:
translator('أنا مبفكرش في عيالي؟')

In [None]:
translator('‫عشان خاطري.‬')

In [None]:
translator('يا علا!')

In [None]:
translator('طب وأنتي كويسه؟')

In [None]:
translator('إيه يا سليم، بتدور على إيه يا حبيبي؟‬')