In [None]:
from datasets import Dataset
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch

In [None]:
# Open English and Guarani NLLB corpus.
# The two files are read as a parallel corpus: line i of the English file is
# paired with line i of the Guarani file further down.

# Number of sentence pairs to keep; can be changed depending on processing capabilities
MAX_SENTENCES = 10000

# encoding is explicit so decoding does not depend on the platform default.
# Iterating over the file splits on real newlines only; str.splitlines() also
# breaks on characters such as U+2028, U+0085 or form feed, and a single such
# character inside a sentence would shift one file against the other.
with open('/content/drive/MyDrive/NLLB/NLLB.en-gn.en', 'r', encoding='utf-8') as f: # English
    ENs = [line.rstrip('\n') for line in f]
with open('/content/drive/MyDrive/NLLB/NLLB.en-gn.gn', 'r', encoding='utf-8') as f: # Guarani
    GNs = [line.rstrip('\n') for line in f]

ENs = ENs[:MAX_SENTENCES]
GNs = GNs[:MAX_SENTENCES]

In [None]:
# Wrap each sentence pair in a nested {"translation": {"en": ..., "gn": ...}}
# record, then turn the list of records into a Dataset.
# zip() stops at the shorter of the two lists.
data = [
    {"translation": dict(en=source, gn=target)}
    for source, target in zip(ENs, GNs)
]
raw_dataset = Dataset.from_list(data)

In [None]:
# Set model and tokenizer

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50")

# mBART-50 only ships language-code tokens for its 50 pretraining languages and
# Guarani is not one of them. Without registering "gn_XX" the code resolves to
# the <unk> id, so target sequences would be prefixed with <unk> instead of a
# dedicated language token. Register it once and grow the embedding matrix so
# the new id has a row to train.
if tokenizer.convert_tokens_to_ids("gn_XX") == tokenizer.unk_token_id:
    tokenizer.add_tokens(["gn_XX"], special_tokens=True)
    model.resize_token_embeddings(len(tokenizer))

tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "gn_XX"

In [None]:
# map dataset
def preprocess(examples):
    """Tokenize one batch of English/Guarani pairs for seq2seq training.

    Args:
        examples: a batch as passed by ``Dataset.map(..., batched=True)``;
            ``examples["translation"]`` is a list of dicts with the keys
            ``"en"`` (source sentence) and ``"gn"`` (target sentence).

    Returns:
        The tokenizer output for the English sentences, with the token ids of
        the Guarani sentences stored under ``"labels"``.
    """
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["gn"] for ex in examples["translation"]]

    # Sequences are truncated but deliberately NOT padded here. Padding labels
    # to max_length fills them with pad-token ids, which the loss then treats
    # as real targets. Leaving them ragged lets DataCollatorForSeq2Seq pad each
    # batch dynamically and fill label padding with -100, which the loss ignores.
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager and writes the target ids to "labels" directly.
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)

    return model_inputs

tokenized_dataset = raw_dataset.map(preprocess, batched=True)


In [None]:

# Training configuration: 3 epochs, batch size 8 per device, no evaluation
# during training, checkpoints saved with the "epoch" strategy.
training_args = Seq2SeqTrainingArguments(
    output_dir="./mbart50-gn-model",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=3e-5,
    eval_strategy="no",
    save_strategy="epoch",
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),  # half precision only when a GPU is present
)

# Collator that assembles seq2seq batches for this model/tokenizer pair
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Trainer over the tokenized training set (no evaluation set is supplied)
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

# Run the fine-tuning
trainer.train()

In [None]:
# Make generation start with the Guarani language token.
# The id is looked up through the vocabulary rather than
# tokenizer.lang_code_to_id: that dict only lists mBART-50's built-in language
# codes, which do not include Guarani, so indexing it with "gn_XX" raises
# KeyError. convert_tokens_to_ids resolves to whatever id the tokenizer uses
# for the "gn_XX" target prefix.
model.config.forced_bos_token_id = tokenizer.convert_tokens_to_ids("gn_XX")

In [None]:
# Individual test for if it produces anything

def translate(text, max_length=64):
    """Translate one English sentence to Guarani with the in-memory model.

    Args:
        text: English input text.
        max_length: value forwarded to ``model.generate(max_length=...)``.

    Returns:
        The first decoded sequence, with special tokens stripped.
    """
    # generate() does not switch the module to eval mode on its own; right
    # after trainer.train() the model may still be in training mode, which
    # would leave dropout active while decoding.
    model.eval()
    tokenizer.src_lang = "en_XX"
    encoded = tokenizer(text, return_tensors="pt").to(model.device)
    # Vocabulary lookup instead of lang_code_to_id["gn_XX"]: Guarani is not a
    # built-in mBART-50 language code, so the dict lookup raises KeyError.
    guarani_token_id = tokenizer.convert_tokens_to_ids("gn_XX")
    generated = model.generate(**encoded, forced_bos_token_id=guarani_token_id, max_length=max_length)
    return tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

#Test it out
translate("Where are you going?")

In [None]:
# Past this point is chrF++ testing

In [None]:
import evaluate
# Load the "chrf" metric from the `evaluate` library; `chrf.compute(...)` is
# called at the end of the notebook to score the translations.
chrf = evaluate.load("chrf")

In [None]:
# translation widget for extra testing ###Developed by another team member###
import os

import ipywidgets as widgets
from IPython.display import display
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers.trainer_utils import get_last_checkpoint
import torch

# === 1. Load model and tokenizer from saved directory ===

# `model_dir` was never defined, so this cell failed with NameError.
# The training cell writes to "./mbart50-gn-model" with save_strategy="epoch",
# i.e. into checkpoint sub-directories; use the most recent one and fall back
# to the output directory itself when no checkpoint is found.
OUTPUT_DIR = "./mbart50-gn-model"  # must match output_dir in the training arguments
last_checkpoint = get_last_checkpoint(OUTPUT_DIR) if os.path.isdir(OUTPUT_DIR) else None
model_dir = last_checkpoint or OUTPUT_DIR

tokenizer = MBart50TokenizerFast.from_pretrained(model_dir)
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "gn_XX"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MBartForConditionalGeneration.from_pretrained(model_dir).to(device)

def translate(text, max_new_tokens=64):
    """Translate English text to Guarani with the reloaded model (beam search).

    Args:
        text: English input text.
        max_new_tokens: value forwarded to ``model.generate(max_new_tokens=...)``.

    Returns:
        The first decoded sequence, with special tokens stripped.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True).to(device)

    # Vocabulary lookup instead of lang_code_to_id["gn_XX"]: Guarani is not a
    # built-in mBART-50 language code, so the dict lookup raises KeyError.
    guarani_token_id = tokenizer.convert_tokens_to_ids("gn_XX")

    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            forced_bos_token_id=guarani_token_id,
            max_new_tokens=max_new_tokens,
            num_beams=4,
            do_sample=False
        )

    return tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0]

# Widgets that make up the translation form: a text field pre-filled with an
# example sentence, a button that triggers the translation, and an output
# area that receives the printed result.
text_input = widgets.Text(
    description='English:',
    value='Where are you going?',
    placeholder='Type a sentence...',
    layout=widgets.Layout(width='90%'),
)

translate_button = widgets.Button(button_style='success', description='Translate to Guarani')
output_box = widgets.Output()

def on_translate_clicked(b):
    """Button callback: translate the text field's content into the output box.

    Clears the previous output, then prints either a prompt (when the field
    is empty or whitespace only) or the translation. ``b`` is the argument
    supplied by the button's click callback and is not used.
    """
    output_box.clear_output()
    text = text_input.value.strip()
    with output_box:
        if text:
            print("Guarani:", translate(text))
        else:
            print("Please enter a sentence.")

# Register the click handler, then render the form as a vertical stack:
# input field, button, output area.
translate_button.on_click(on_translate_clicked)

display(widgets.VBox(children=[text_input, translate_button, output_box]))


In [None]:
# read in test corpus Flores-200
# Line i of the English file is scored against line i of the Guarani file,
# so both must be split into lines the same way. Iterating over the file
# splits on real newlines only, whereas str.splitlines() also breaks on
# characters such as U+2028 or form feed and could shift the two lists
# against each other. The encoding is explicit so decoding does not depend
# on the platform default.
with open('Flores/eng_Latn.dev', 'r', encoding='utf-8') as f: # English
    ENGs = [line.rstrip('\n') for line in f]
with open('Flores/grn_Latn.dev', 'r', encoding='utf-8') as f: # Guarani
    GRNs = [line.rstrip('\n') for line in f]

In [None]:
def translate_all(source, translate_fn=None): #Translate a list of sentences
    """Translate every sentence in ``source`` and return the results in order.

    Args:
        source: iterable of sentences to translate.
        translate_fn: optional callable mapping one sentence to its
            translation. Defaults to the notebook-level ``translate``,
            looked up when this function is called.

    Returns:
        A list with one translation per input sentence (empty for empty input).
    """
    if translate_fn is None:
        translate_fn = translate

    # The original built this list but never returned it, so callers got None.
    return [translate_fn(sent) for sent in source]


In [None]:
# Inputs for the metric:
#   predictions - the result of translating every English test sentence
#   references  - one single-element list per Guarani test sentence
predictions = translate_all(ENGs)
references = [[gold] for gold in GRNs]

In [None]:
# Score the predictions against the references; word_order=2 is the setting
# used here to obtain the chrF++ variant of the metric.
results = chrf.compute(
    predictions=predictions,
    references=references,
    word_order=2,
)
print(results)