In [1]:
import os
from datasets import load_dataset
import numpy as np
from transformers import MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer, MarianTokenizer, DataCollatorForSeq2Seq
import evaluate
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = load_dataset("json", data_files={"train": "train.json", "validation": "val.json"})

In [3]:
print("CUDA_VISIBLE_DEVICES:", os.environ.get("CUDA_VISIBLE_DEVICES"))
print(torch.cuda.is_available())  # Should be True if ROCm is set up
print(torch.cuda.device_count())  # Should show number of GPUs
print(torch.cuda.get_device_name(0))  # Should print your AMD GPU
print(torch.version.hip)

CUDA_VISIBLE_DEVICES: 0
True
1
AMD Radeon Graphics
6.4.43482-0f2d60242


In [4]:
model_checkpoint = "rajbhirud/eng-to-fra-model"
tokenizer = MarianTokenizer.from_pretrained(model_checkpoint)

max_length = 512

def preprocess(batch):
    inputs = tokenizer(batch["en"], truncation=True, padding="max_length", max_length=max_length)
    targets = tokenizer(batch["fr"], truncation=True, padding="max_length", max_length=max_length)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_data = data.map(preprocess, batched=True)



In [5]:
model = MarianMTModel.from_pretrained(model_checkpoint)
metric = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return metric.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])

training_args = Seq2SeqTrainingArguments(
    output_dir="./eng-fra-finetuned_0.0001_2e-5",         # Directory where model checkpoints and final model will be saved.
                                               # Change this if you want to save outputs elsewhere or run multiple experiments.

    learning_rate=2e-5,                        # The initial learning rate for the AdamW optimizer.
                                               # Lower values (e.g., 1e-5) can lead to slower but potentially more stable training.
                                               # Higher values (e.g., 1e-4) may speed up training but risk overshooting minima.

    per_device_train_batch_size=16,            # Number of samples per batch on each device (GPU/CPU) during training.
                                               # Increase for faster training if you have enough memory, decrease if you get OOM errors.

    per_device_eval_batch_size=32,             # Number of samples per batch on each device during evaluation.
                                               # Can usually be set higher than training batch size if memory allows.

    weight_decay=0.0001,                       # L2 regularization coefficient; helps prevent overfitting. The larger the weights, the more the model tends to overfit to the training data, not generalize
                                               # Setting weight_decay=0.01 means 1% of each weight’s value is added to the gradient during backpropagation (before the learning rate is applied).
                                               # Increase to regularize more, decrease to regularize less.
                                               # Higher weight_decay: Stronger regularization, less risk of overfitting, but if set too high, the model may underfit (fail to learn enough from the data). 
                                               # Lower weight_decay (or zero): Weaker or no regularization, which can lead to overfitting, especially on small datasets.

    save_total_limit=1,                        # Maximum number of checkpoints to keep.
                                               # Older checkpoints are deleted. Increase to keep more history, decrease to save disk space.

    num_train_epochs=3,                        # Number of times to iterate over the entire training dataset.
                                               # Increase for more training (may improve results with enough data), decrease for faster runs.

    predict_with_generate=True,                # Use model.generate() for evaluation and prediction.
                                               # Should be True for seq2seq tasks (like translation) to generate output sequences.
                                               # Set to False if you only care about loss, not generated text.
    dataloader_num_workers=4,
    eval_steps=500
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    data_collator=data_collator,
    #tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss
500,0.793
1000,0.3966
1500,0.3478




TrainOutput(global_step=1524, training_loss=0.5099525313990636, metrics={'train_runtime': 1401.2412, 'train_samples_per_second': 17.402, 'train_steps_per_second': 1.088, 'total_flos': 3306310921617408.0, 'train_loss': 0.5099525313990636, 'epoch': 3.0})

In [None]:
# results = trainer.evaluate()
# print(f"BLEU score: {results}")

KeyboardInterrupt: 

In [None]:
import textwrap
input_text = "As part of Canada’s second Action Plan on Open Government, the Government of Canada has committed to the disclosure of contracting data via a centralized, machine-readable database available to the public. This commitment reinforces the proactive publication of contracts, which reflects broader government commitments to transparency and strengthened accountability within the public sector originally announced in Budget 2004."
reference_text = "Dans le cadre du deuxième Plan d’action national pour un gouvernement ouvert du Canada, le gouvernement du Canada s’est engagé à la divulgation des données sur la passation de marchés au moyen d’une base de données publique à la fois centralisée et lisible par machine. Cet engagement renforce la publication proactive des marchés, qui tient compte des engagements élargis du gouvernement envers la transparence et la responsabilisation accrue dans le secteur public, annoncés initialement dans le budget fédéral de 2004."
original_prediction = "Dans le cadre du deuxième Plan d'action du Canada pour un gouvernement ouvert, le gouvernement du Canada s'est engagé à communiquer des données sous-traitées via une base de données centralisée, lisible par la machine, accessible au public. Cet objectif renforce la publication proactive des marchés, qui reflète l'ensemble des promesses gouvernementales en matière de transparence et de redevabilité au sein du secteur public initialement annoncé dans le budget de 2004."
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length)
inputs = {key: value.to(model.device) for key, value in inputs.items()}  # Move inputs to the same device as the model
translated = model.generate(**inputs)
print("Input: ", input_text)
#print(textwrap.fill(input_text, width=120))
print("--------------------------")
print("Prediction (Post fine tuning): ", tokenizer.decode(translated[0], skip_special_tokens=True))
#print(textwrap.fill(tokenizer.decode(translated[0], skip_special_tokens=True), width=120))
print("--------------------------")
print("Reference: ", reference_text)
#print(textwrap.fill(reference_text, width=120))
print("--------------------------")
print("Original Prediction (Pre fine tuning): ", original_prediction)
#print(textwrap.fill(original_prediction, width=120))