In [1]:
from datasets import Dataset
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch

In [2]:
# Load the parallel English/Guarani corpus: line i of the .en file is assumed
# to be the translation of line i of the .gn file.
#
# * The encoding is pinned so decoding does not depend on the platform default
#   (assumes the corpus files are UTF-8 -- TODO confirm against the data source).
# * Lines are split on real newlines only. str.splitlines() also breaks on
#   characters such as U+2028 / U+0085 / form feed, which may occur inside a
#   sentence and would shift one side of the corpus relative to the other.
with open('NLLB.en-gn/NLLB.en-gn.en', 'r', encoding='utf-8') as f:
    ENs = [line.rstrip('\n') for line in f]
with open('NLLB.en-gn/NLLB.en-gn.gn', 'r', encoding='utf-8') as f:
    GNs = [line.rstrip('\n') for line in f]

# Fail loudly on a misaligned corpus instead of silently pairing wrong sentences.
if len(ENs) != len(GNs):
    raise ValueError(
        f"Parallel corpus is misaligned: {len(ENs)} English lines vs {len(GNs)} Guarani lines"
    )

In [3]:
# Keep only the first 3000 sentence pairs. Both lists are cut at the same
# index so English and Guarani entries stay paired by position.
ENs, GNs = ENs[:3000], GNs[:3000]

In [4]:
# Custom language-code token used to mark Guarani text.
new_lang_code = "gn_XX"

# One {"src", "tgt"} record per sentence pair; each string starts with its
# language code followed by a single space.
data = [
    {"src": f"en_XX {en_text}", "tgt": f"{new_lang_code} {gn_text}"}
    for en_text, gn_text in zip(ENs, GNs)
]

# Wrap the records in a datasets.Dataset so they can be mapped/tokenized later.
raw_dataset = Dataset.from_list(data)

In [5]:
# Load the pretrained mBART-50 checkpoint and its matching fast tokenizer.
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50")

# Register the Guarani language code as an extra special token.
# replace_additional_special_tokens=False *extends* the tokenizer's existing
# list of additional special tokens; with the default (True) that list is
# overwritten, so the tokens already in it stay in the vocabulary but are no
# longer flagged as special.
tokenizer.add_special_tokens(
    {'additional_special_tokens': [new_lang_code]},
    replace_additional_special_tokens=False,
)

# Grow the embedding matrix to cover the newly added token id. Kept as the
# last expression so the notebook displays the resized embedding module.
model.resize_token_embeddings(len(tokenizer))

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


MBartScaledWordEmbedding(250055, 1024, padding_idx=1)

In [6]:
from datasets import Dataset
from transformers import AutoTokenizer

# Upper bound on the token length (language code and EOS included) of every
# encoded source or target sequence.
max_length = 128


def _encode_prefixed(text):
    """Tokenize ``text`` as ``[lang_code] tokens [eos]`` without padding.

    ``text`` is expected to already start with its language-code token
    (``"en_XX ..."`` / ``"gn_XX ..."``), as the ``src``/``tgt`` fields do.
    The tokenizer's own special-token template is therefore switched off:
    it would prepend the tokenizer's default source-language code a second
    time, which for targets puts the *source* code in front of the label
    sequence. EOS is appended by hand instead.

    Args:
        text: A single sentence prefixed with its language-code token.

    Returns:
        The tokenizer output with ``input_ids`` and ``attention_mask`` of at
        most ``max_length`` entries, EOS included.
    """
    encoded = tokenizer(
        text,
        add_special_tokens=False,
        truncation=True,
        max_length=max_length - 1,  # reserve one position for EOS
    )
    encoded["input_ids"].append(tokenizer.eos_token_id)
    encoded["attention_mask"].append(1)
    return encoded


def preprocess(example):
    """Turn one ``{"src", "tgt"}`` record into model inputs plus ``labels``.

    No padding is applied here. Padding labels to a fixed length with the pad
    token id would make the loss count padding positions; leaving sequences
    unpadded lets a seq2seq data collator pad each batch and fill label
    padding with its ignore index.

    Args:
        example: A record with string fields ``"src"`` and ``"tgt"``.

    Returns:
        The encoded source (``input_ids``, ``attention_mask``) with the
        encoded target ids stored under ``"labels"``.
    """
    inputs = _encode_prefixed(example["src"])
    inputs["labels"] = _encode_prefixed(example["tgt"])["input_ids"]
    return inputs


# The raw "src"/"tgt" columns are kept in the mapped dataset; pass
# remove_columns=raw_dataset.column_names to drop them.
tokenized_dataset = raw_dataset.map(preprocess, batched=False)


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [24]:
# Use the Apple-silicon GPU (MPS) when available, otherwise fall back to CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

if device.type == "mps":
    # Only touch the MPS allocator when the backend actually exists; calling
    # this unconditionally would defeat the CPU fallback above.
    # A fraction of 0.0 removes the per-process memory cap (see the PyTorch
    # docs: unlimited allocations, at the risk of exhausting system memory).
    torch.mps.set_per_process_memory_fraction(0.0)

# Last expression of the cell: moves the model and displays its structure.
model.to(device)

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250055, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250055, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=

In [None]:
# Token id of the Guarani language code. Stored on the generation config so
# that generate() -- used by the trainer because predict_with_generate=True --
# forces it as the first generated token.
forced_bos_token_id = tokenizer.convert_tokens_to_ids(new_lang_code)
model.generation_config.forced_bos_token_id = forced_bos_token_id

# Data collator: pads each batch dynamically; label padding uses -100 so the
# padded positions are ignored by the loss. Passing the model lets the
# collator build decoder_input_ids from the labels.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    pad_to_multiple_of=8
)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart50-finetuned-gn",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=3e-5,
    num_train_epochs=1,  # NOTE: overridden by max_steps below
    gradient_accumulation_steps=2,  # effective batch size per device: 4 * 2 = 8
    weight_decay=0.01,
    save_strategy="no",  # no intermediate checkpoints are written
    logging_dir="./logs",
    fp16=torch.cuda.is_available(),  # mixed precision only on CUDA devices
    predict_with_generate=True,
    #generation_max_length=64,
    #generation_num_beams=4,
    #report_to="none",
    max_steps=500,  # a positive max_steps takes precedence over num_train_epochs
)

# Trainer. `processing_class` replaces the deprecated `tokenizer` argument of
# Seq2SeqTrainer (the old keyword emits a FutureWarning).
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
    data_collator=data_collator
)

# Train
trainer.train()

  trainer = Seq2SeqTrainer(


Step,Training Loss
