In [None]:
from datasets import load_dataset
# Load the "train" and "validation" splits from local JSON files.
data = load_dataset(
    path="json",
    data_files=dict(train="train.json", validation="val.json"),
)

# Quick sanity check: show the first training record.
print(data["train"][0])

In [None]:
from transformers import MarianTokenizer

# Sequence length (in tokens) used for truncation and padding in preprocessing.
max_length = 128

# Checkpoint name handed to from_pretrained for the tokenizer.
model_checkpoint = "rajbhirud/eng-to-fra-model"
tokenizer = MarianTokenizer.from_pretrained(model_checkpoint)

def preprocess(batch):
    """Tokenize a batch of sentence pairs into model inputs and labels.

    Args:
        batch: Mapping with an "en" list (source sentences) and an "fr"
            list (target sentences) of equal length.

    Returns:
        The tokenizer encoding of the source sentences with an added
        "labels" entry holding the target token ids. Every sequence is
        truncated/padded to ``max_length``; padded label positions are
        set to -100.
    """
    # text_target makes the tokenizer encode the French side in target mode
    # (Marian keeps separate source/target vocab handling) and stores the
    # resulting ids under "labels".
    model_inputs = tokenizer(
        batch["en"],
        text_target=batch["fr"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

    # -100 is the ignore index of the cross-entropy loss: without this the
    # model would be trained to predict padding tokens. Anything that decodes
    # these labels later has to map -100 back to the pad id first.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs

# Apply preprocess to every split, feeding it batches of examples.
tokenized_data = data.map(function=preprocess, batched=True)

In [None]:
import numpy as np
from transformers import MarianMTModel, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate

# Metric object used by compute_metrics to score translations.
metric = evaluate.load("sacrebleu")

# Model weights come from the same checkpoint name as the tokenizer.
model = MarianMTModel.from_pretrained(model_checkpoint)

def compute_metrics(eval_pred):
    """Decode predictions and references and score them with ``metric``.

    Args:
        eval_pred: Pair ``(predictions, labels)`` supplied by the trainer.
            ``predictions`` may be generated token ids (2-D, which is what
            the trainer passes when ``predict_with_generate=True``), raw
            logits (3-D), or a tuple whose first element is one of those.

    Returns:
        The dictionary produced by ``metric.compute``.
    """
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    # Only raw logits need an argmax. Generated sequences are already token
    # ids, and taking argmax over them would collapse each sentence to a
    # single meaningless index.
    if preds.ndim == 3:
        preds = np.argmax(preds, axis=-1)

    # -100 marks ignored/padded positions and is not a valid token id;
    # swap it for the pad id so decoding can drop it as a special token.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = [
        text.strip()
        for text in tokenizer.batch_decode(preds, skip_special_tokens=True)
    ]
    decoded_labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]

    # The metric is called with one list of references per prediction.
    references = [[label] for label in decoded_labels]
    return metric.compute(predictions=decoded_preds, references=references)

training_args = Seq2SeqTrainingArguments(
    # Where checkpoints are written, and how many are kept.
    output_dir="./eng-fra-finetuned",
    save_total_limit=3,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate on generated sequences.
    predict_with_generate=True,
)

# Wire the model, arguments, tokenized splits and metric function together.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
)

# Run fine-tuning.
trainer.train()

In [None]:
# Evaluate on the validation split; the trainer prefixes metric keys with
# "eval_", so the sacrebleu score is expected under "eval_score".
results = trainer.evaluate()

# Print the BLEU value itself rather than the whole metrics dictionary;
# fall back to the full dictionary if the expected key is absent.
print(f"BLEU score: {results.get('eval_score', results)}")

# Show the remaining evaluation metrics as the cell output.
results