In [1]:
!pip install transformers torch



In [2]:
import torch
from transformers import MBartForConditionalGeneration, MBartTokenizer

In [3]:
model_name = 'facebook/mbart-large-50-many-to-many-mmt'
tokenizer = MBartTokenizer.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [4]:
text = "Hello, how are you?"

inputs = tokenizer(text, return_tensors='pt')
print(inputs)

{'input_ids': tensor([[ 35378,      4,   3642,    621,    398,     32,      2, 250004]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


In [5]:
src_lang = 'en_XX'
tgt_lang = 'fr_XX'

forced_bos_token_id = tokenizer.lang_code_to_id[tgt_lang]
print("Target language ID : ", forced_bos_token_id)

Target language ID :  250008


In [6]:
output = model.generate(**inputs, forced_bos_token_id = forced_bos_token_id)
print("Generated token IDs :", output)

Generated token IDs : tensor([[     2, 250008,  84602,      4,   6868,    307,      9,     18,      9,
            379,     32,      2]])


In [7]:
translated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Translated text : ", translated_text)

Translated text :  Bonjour, comment va-t-il?


## Fine-tuning on OPUS books dataset

In [8]:
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

In [9]:
dataset = load_dataset('Helsinki-NLP/opus_books', 'en-fr')
dataset

README.md:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/127085 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 127085
    })
})

In [10]:
dataset = dataset['train'].select(range(1000))
dataset

Dataset({
    features: ['id', 'translation'],
    num_rows: 1000
})

In [11]:
split_dataset = dataset.train_test_split(test_size=0.2, seed = 42)
print(split_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 800
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 200
    })
})


In [12]:
split_dataset['train'][0]

{'id': '911',
 'translation': {'en': 'The old lady, shaky and worn with age, never ceased chatting and laughing.',
  'fr': 'La vieille dame, cassée, tremblante, ne cessait de causer gaiement et de rire.'}}

In [13]:
def preprocess_data(examples):
    inputs = [ex['en'] for ex in examples['translation']]
    targets = [ex['fr'] for ex in examples['translation']]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding = "max_length")
    labels = tokenizer(targets, max_length = 128, truncation =True, padding = "max_length")

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_datasets = split_dataset.map(preprocess_data, batched=True)


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [14]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 800
    })
    test: Dataset({
        features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 200
    })
})

In [15]:
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

In [16]:
training_args = TrainingArguments(
    output_dir="/kaggle/working/",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    # logging_dir="/kaggle/working/logs/",
    logging_steps=20,
    report_to="tensorboard",
    save_total_limit=1
    # push_to_hub=False,  # Set to True if you want to push to Hugging Face Hub
)



In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.5049,0.432628
2,0.2753,0.4202
3,0.2034,0.437498




TrainOutput(global_step=600, training_loss=0.997893956899643, metrics={'train_runtime': 370.5236, 'train_samples_per_second': 6.477, 'train_steps_per_second': 1.619, 'total_flos': 650138930380800.0, 'train_loss': 0.997893956899643, 'epoch': 3.0})

In [18]:
trainer.save_model("/kaggle/working/mbart-finetuned-en-fr")

In [20]:
import torch

# Ensure model is on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Test Translation
text = "This is an amazing book."
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

# Move input tensors to the same device as the model
inputs = {key: value.to(device) for key, value in inputs.items()}

# Set the target language
forced_bos_token_id = tokenizer.lang_code_to_id[tgt_lang]

# Generate translation
output = model.generate(**inputs, forced_bos_token_id=forced_bos_token_id)
translated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Translated Text:", translated_text)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Translated Text: C’est un livre extraordinaire.
