# **Fine Tuning NMT Transformer**

## models names

In [1]:
# Helsinki-NLP/opus-mt-tc-big-en-ar
# Helsinki-NLP/opus-mt-tc-big-ar-en
# Helsinki-NLP/opus-mt-en-ar
# marefa-nlp/marefa-mt-ar-en

## Installing Dependencies

In [2]:
#! pip install --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host files.pythonhosted.org <package_name>

In [None]:
! pip install datasets sacrebleu torch transformers sentencepiece transformers[sentencepiece]

In [None]:
! pip install accelerate -U

## Required Imports

In [1]:
import warnings
import numpy as np
import pandas as pd

import torch
import transformers

from datasets import Dataset, load_metric

from tqdm import tqdm
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

warnings.filterwarnings("ignore")

## Constants

In [5]:
BATCH_SIZE = 16
BLEU = "bleu"
ENGLISH = "en"
ENGLISH_TEXT = "English"
EPOCH = "epoch"
INPUT_IDS = "input_ids"
FILENAME = "translation_train.csv"
GEN_LEN = "gen_len"
MAX_INPUT_LENGTH = 128
MAX_TARGET_LENGTH = 128
MODEL_CHECKPOINT = "Helsinki-NLP/opus-mt-ar-en"
MODEL_NAME = MODEL_CHECKPOINT.split("/")[-1]
LABELS = "labels"
PREFIX = ""
PORTUGUESE = "ar"
PORTUGUESE_TEXT = "Arabic"
SCORE = "score"
SOURCE_LANG = "ar"
TARGET_LANG = "en"
TRANSLATION = "translation"

## Helper Functions

In [6]:
def postprocess_text(preds: list, labels: list) -> tuple:
    """Performs post processing on the prediction text and labels"""

    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def prep_data_for_model_fine_tuning(source_lang: list, target_lang: list) -> list:
    """Takes the input data lists and converts into translation list of dicts"""

    data_dict = dict()
    data_dict[TRANSLATION] = []

    for sr_text, tr_text in zip(source_lang, target_lang):
        temp_dict = dict()
        temp_dict[PORTUGUESE] = sr_text
        temp_dict[ENGLISH] = tr_text

        data_dict[TRANSLATION].append(temp_dict)

    return data_dict


def generate_model_ready_dataset(dataset: list, source: str, target: str,
                                 model_checkpoint: str,
                                 tokenizer: AutoTokenizer):
    """Makes the data training ready for the model"""

    preped_data = []

    for row in dataset:
        inputs = PREFIX + row[source]
        targets = row[target]

        model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH,
                                 truncation=True, padding=True)

        model_inputs[TRANSLATION] = row

        # setup the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, max_length=MAX_INPUT_LENGTH,
                                 truncation=True, padding=True)
            model_inputs[LABELS] = labels[INPUT_IDS]

        preped_data.append(model_inputs)

    return preped_data



def compute_metrics(eval_preds: tuple) -> dict:
    """computes bleu score and other performance metrics """

    metric = load_metric("sacrebleu")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {BLEU: result[SCORE]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]

    result[GEN_LEN] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}

    return result

## Loading and Preparing The Dataset

In [7]:
translation_data = pd.read_csv(FILENAME)
translation_data.head()

Unnamed: 0,English,Arabic
0,I have been dealt four aces.,لقد رميت اربع اوراق من نوع أس.
1,rejoice over the verdict.,ابتهاج نتيجة اصدار الحكم
2,today the people of south africa marched in su...,اليوم شعب جنوب افريقيا يدعم الفلسطينيين الذين ...
3,earlier this month the ministry of health decl...,وقد صرحت وزارة الصحة باكرا هذا الشهر ان اكثر م...
4,sharek posted this video titled sharek partici...,نشر هذا الفيديو تحت عنوان شارك داعيا الشعب للا...


## Train, Test & Validation Split of Data

In [8]:
X = translation_data[PORTUGUESE_TEXT]
y = translation_data[ENGLISH_TEXT]

In [9]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.10, shuffle=True, random_state=100)

print("INITIAL X-TRAIN SHAPE: ", x_train.shape)
print("INITIAL Y-TRAIN SHAPE: ", y_train.shape)
print("X-TEST SHAPE: ", x_test.shape)
print("Y-TEST SHAPE: ", y_test.shape)

INITIAL X-TRAIN SHAPE:  (21065,)
INITIAL Y-TRAIN SHAPE:  (21065,)
X-TEST SHAPE:  (2341,)
Y-TEST SHAPE:  (2341,)


In [10]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.20, shuffle=True, random_state=100)

print("FINAL X-TRAIN SHAPE: ", x_train.shape)
print("FINAL Y-TRAIN SHAPE: ", y_train.shape)
print("X-VAL SHAPE: ", x_val.shape)
print("Y-VAL SHAPE: ", y_val.shape)

FINAL X-TRAIN SHAPE:  (16852,)
FINAL Y-TRAIN SHAPE:  (16852,)
X-VAL SHAPE:  (4213,)
Y-VAL SHAPE:  (4213,)


## Load Tokenizer from AutoTokenizer Class

In [11]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

## Prepare the model ready dataset

In [12]:
training_data = prep_data_for_model_fine_tuning(x_train.values, y_train.values)

validation_data = prep_data_for_model_fine_tuning(x_val.values, y_val.values)

test_data = prep_data_for_model_fine_tuning(x_test.values, y_test.values)

In [13]:
train_data = generate_model_ready_dataset(dataset=training_data[TRANSLATION],
                                          tokenizer=tokenizer,
                                          source=PORTUGUESE,
                                          target=ENGLISH,
                                          model_checkpoint=MODEL_CHECKPOINT)

validation_data = generate_model_ready_dataset(dataset=validation_data[TRANSLATION],
                                               tokenizer=tokenizer,
                                               source=PORTUGUESE,
                                               target=ENGLISH,
                                               model_checkpoint=MODEL_CHECKPOINT)

test_data = generate_model_ready_dataset(dataset=test_data[TRANSLATION],
                                               tokenizer=tokenizer,
                                               source=PORTUGUESE,
                                               target=ENGLISH,
                                               model_checkpoint=MODEL_CHECKPOINT)

In [14]:
train_df = pd.DataFrame.from_records(train_data)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16852 entries, 0 to 16851
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   attention_mask  16852 non-null  object
 1   input_ids       16852 non-null  object
 2   labels          16852 non-null  object
 3   translation     16852 non-null  object
dtypes: object(4)
memory usage: 526.8+ KB


In [15]:
validation_df = pd.DataFrame.from_records(validation_data)
validation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4213 entries, 0 to 4212
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   attention_mask  4213 non-null   object
 1   input_ids       4213 non-null   object
 2   labels          4213 non-null   object
 3   translation     4213 non-null   object
dtypes: object(4)
memory usage: 131.8+ KB


In [16]:
test_df = pd.DataFrame.from_records(test_data)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2341 entries, 0 to 2340
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   attention_mask  2341 non-null   object
 1   input_ids       2341 non-null   object
 2   labels          2341 non-null   object
 3   translation     2341 non-null   object
dtypes: object(4)
memory usage: 73.3+ KB


## Convert dataframe to Dataset Class object

In [17]:
train_dataset = Dataset.from_pandas(train_df)
train_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'translation'],
    num_rows: 16852
})

In [18]:
validation_dataset = Dataset.from_pandas(validation_df)
validation_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'translation'],
    num_rows: 4213
})

In [19]:
test_dataset = Dataset.from_pandas(test_df)
test_dataset

Dataset({
    features: ['attention_mask', 'input_ids', 'labels', 'translation'],
    num_rows: 2341
})

## Load model, Create Model Training Args and Data Collator

In [20]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [21]:
model_args = Seq2SeqTrainingArguments(
    f"{MODEL_NAME}-finetuned-{SOURCE_LANG}-to-{TARGET_LANG}",
    evaluation_strategy=EPOCH,
    learning_rate=2e-4,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.02,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True
)

In [22]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Fine Tuning the Model

In [23]:
trainer = Seq2SeqTrainer(
    model,
    model_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,2.3204,2.068144,27.0143,24.0071


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,2.3204,2.068144,27.0143,24.0071
2,1.5305,1.956574,28.7288,23.7885


TrainOutput(global_step=2108, training_loss=1.9815307197353644, metrics={'train_runtime': 1275.6535, 'train_samples_per_second': 26.421, 'train_steps_per_second': 1.652, 'total_flos': 738843465351168.0, 'train_loss': 1.9815307197353644, 'epoch': 2.0})

## Saving the Fine Tuned Transformer

In [25]:
trainer.save_model("FineTunedTransformer")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62833]], 'forced_eos_token_id': 0}


## Perform Translation on Test Datset

In [26]:
test_results = trainer.predict(test_dataset)

In [27]:
print("Test Bleu Score: ", test_results.metrics["test_bleu"])

Test Bleu Score:  30.1198


## Generate Prediction Sentences

In [28]:

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(62834, 512, padding_idx=62833)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(62834, 512, padding_idx=62833)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [31]:
predictions = []
test_input = test_dataset[TRANSLATION]

for input_text in tqdm(test_input):
    source_sentence = input_text[PORTUGUESE]
    encoded_source = tokenizer(source_sentence,
                               return_tensors='pt',
                               padding=True,
                               truncation=True)
    encoded_source.to(device)  # Move input tensor to the same device as the model

    translated = model.generate(**encoded_source)

    predictions.append([tokenizer.decode(t, skip_special_tokens=True) for t in translated][0])

# Move the model back to CPU if needed
model.to("cpu")

100%|██████████| 2341/2341 [09:24<00:00,  4.15it/s]


MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(62834, 512, padding_idx=62833)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(62834, 512, padding_idx=62833)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [32]:
y_true_en = []
y_true_pt = []

for input_text in tqdm(test_input):
    y_true_pt.append(input_text[PORTUGUESE])
    y_true_en.append(input_text[ENGLISH])

100%|██████████| 2341/2341 [00:00<00:00, 663526.53it/s]


In [33]:
output_df = pd.DataFrame({"y_true_port": y_true_pt, "y_true_eng": y_true_en, "predicted_text": predictions})
output_df

Unnamed: 0,y_true_port,y_true_eng,predicted_text
0,هل ستغني؟,Are you going to sing?,Will you sing?
1,لماذا لايحب الالمان اليهود؟,why didn t germans like the jews.,why don t germans like jews.
2,يا لسوء حظي.,How unlucky I am!,How unfortunate I am.
3,البحث الجديد,new research.,new research.
4,السرير الذي نمت عليه بالأمس لم يكن مريحاً.,The bed I slept in last night wasn't very comf...,The bed you slept in yesterday was uncomfortable.
...,...,...,...
2336,وغرد فرشد فاريابي عن زيارة وزير خارجية السويد,farshad faryabi tweeted.,frahd farabi tweeted about the swedish foreign...
2337,أنا مازلت أحاول ان أعزف على الجيتار مثل إريك ك...,I'm still trying to play guitar like Eric Clap...,I'm still trying to play the guitar like eric ...
2338,العديد من المواطنين يدونون ويرسلون رسايل تويتر...,many citizens are blogging and tweeting about ...,many citizens blog and send twitter messages a...
2339,وبالرغم من ان الحكومة السابقة منحت تراخيص لشرك...,the previous government however granted licens...,although the previous government has granted l...


## Loading the stored Model and using it for translation

In [34]:
ft_model_tokenizer = AutoTokenizer.from_pretrained("FineTunedTransformer")
ft_model = AutoModelForSeq2SeqLM.from_pretrained("FineTunedTransformer")

In [42]:
ft_prediction = []

for sentence in tqdm(x_test):
    encoded_text = ft_model_tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    translated = ft_model.generate(**encoded_text)
    ft_prediction.append([tokenizer.decode(t, skip_special_tokens=True) for t in translated][0])

100%|██████████| 2341/2341 [48:13<00:00,  1.24s/it]


In [43]:
ft_prediction

['Will you sing?',
 'why don t germans like jews.',
 'How unfortunate I am.',
 'new research.',
 'The bed you slept in yesterday was uncomfortable.',
 'My goal is to be a doctor.',
 'lebanon a wedding global voices.',
 'You disappointed me in you.',
 "Let's ask her when she gets home.",
 'the decision has been made in the last few minutes.',
 'I have only one brother.',
 'this is just a quick glimpse of the situation in lebanon without the presidency next week we will have more.',
 'instead of grapes world yasu haley silassi makes wine from khat paper a subject that people chew widely in east africa and the middle east for drug effects.',
 "There's one apple on the desk.",
 'how to enjoy traffic global voices.',
 'united states and uae have set up a clothing center to counter ismas s online propaganda.',
 "Take care of what you're saying, Tom.",
 'the global voices citizen media summit is open for registration.',
 'trinidad tobago voting process global voices.',
 'How can you not say a

In [39]:
!zip -r FineTunedTransformer.zip /content/FineTunedTransformer

  adding: content/FineTunedTransformer/ (stored 0%)
  adding: content/FineTunedTransformer/special_tokens_map.json (deflated 35%)
  adding: content/FineTunedTransformer/target.spm (deflated 49%)
  adding: content/FineTunedTransformer/vocab.json (deflated 77%)
  adding: content/FineTunedTransformer/training_args.bin (deflated 51%)
  adding: content/FineTunedTransformer/config.json (deflated 61%)
  adding: content/FineTunedTransformer/tokenizer_config.json (deflated 68%)
  adding: content/FineTunedTransformer/source.spm (deflated 55%)
  adding: content/FineTunedTransformer/generation_config.json (deflated 43%)
  adding: content/FineTunedTransformer/model.safetensors (deflated 7%)
