In [None]:
'''
Commented out lines must be uncommented if running in a Colab environment
'''

In [None]:
!pip install -U transformers datasets evaluate accelerate sacrebleu --quiet

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
import torch
import os
import datasets
import evaluate
import tqdm
from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import SGD

In [None]:
def _load_aligned_pairs(en_path, gn_path):
    """Read two line-aligned UTF-8 text files into an ``{"en", "gn"}`` dataset.

    Sentences are paired by physical line number whenever both files have the
    same number of lines; a pair is dropped if either side is blank after
    stripping. Filtering blank lines per file (the previous behaviour) could
    shift every following sentence against its translation without tripping
    the length check.

    If the raw line counts differ (for example one file ends with extra blank
    lines), blank lines are removed from each file independently and the
    remaining counts must match.

    Raises:
        ValueError: if the files cannot be aligned line-for-line.
    """
    with open(en_path, encoding="utf-8") as f_en:
        en_raw = [line.strip() for line in f_en]
    with open(gn_path, encoding="utf-8") as f_gn:
        gn_raw = [line.strip() for line in f_gn]

    if len(en_raw) == len(gn_raw):
        # Line i of one file translates line i of the other: filter pairs, not files.
        pairs = [(en, gn) for en, gn in zip(en_raw, gn_raw) if en and gn]
    else:
        en_lines = [en for en in en_raw if en]
        gn_lines = [gn for gn in gn_raw if gn]
        if len(en_lines) != len(gn_lines):
            raise ValueError(f"Line mismatch: {len(en_lines)} English vs {len(gn_lines)} Guarani.")
        pairs = list(zip(en_lines, gn_lines))

    return datasets.Dataset.from_list([{"en": en, "gn": gn} for en, gn in pairs])


def load_parallel_text_data(en_path, gn_path):
    """Load the parallel training corpus.

    Args:
        en_path: Path to the English side (one sentence per line, UTF-8).
        gn_path: Path to the Guarani side, line-aligned with ``en_path``.

    Returns:
        A ``datasets.Dataset`` with string columns ``"en"`` and ``"gn"``.

    Raises:
        ValueError: if the two files cannot be aligned line-for-line.
    """
    return _load_aligned_pairs(en_path, gn_path)


def load_flores_for_bleu(english_path, guarani_path):
    """Load the evaluation sentence pairs.

    Uses the same reader as ``load_parallel_text_data`` so that the training
    and evaluation data are cleaned and aligned identically.

    Args:
        english_path: Path to the English file (one sentence per line, UTF-8).
        guarani_path: Path to the Guarani file, line-aligned with it.

    Returns:
        A ``datasets.Dataset`` with string columns ``"en"`` and ``"gn"``.

    Raises:
        ValueError: if the two files cannot be aligned line-for-line.
    """
    return _load_aligned_pairs(english_path, guarani_path)

# Folder that holds the corpus files. Leave it empty when the files sit next
# to the notebook. In Colab, mount Drive first and point it at the Drive
# folder, e.g. base_path = "/content/drive/MyDrive/BYT5 Files".
base_path = ""

train_en_path = os.path.join(base_path, "NLLB.en-gn.en")
train_gn_path = os.path.join(base_path, "NLLB.en-gn.gn")
en_devtest_path = os.path.join(base_path, "eng_Latn.devtest")
gn_devtest_path = os.path.join(base_path, "grn_Latn.devtest")

# Subsampling settings. The fixed seed makes the shuffled training subset the
# same on every Restart & Run All.
SEED = 42
TRAIN_SUBSET_SIZE = 10000
TEST_SUBSET_SIZE = 500

train_dataset = load_parallel_text_data(train_en_path, train_gn_path)
test_dataset = load_flores_for_bleu(en_devtest_path, gn_devtest_path)

# Clamp the subset sizes so a corpus smaller than the requested size is used
# in full instead of raising an index error in select().
train_dataset = train_dataset.shuffle(seed=SEED).select(
    range(min(TRAIN_SUBSET_SIZE, len(train_dataset)))
)
test_dataset = test_dataset.select(range(min(TEST_SUBSET_SIZE, len(test_dataset))))


In [None]:
# Preview the first five sentence pairs of the training subset.
train_dataset[:5]

In [None]:
# Preview the first five sentence pairs of the evaluation subset.
test_dataset[:5]

In [None]:
# Load the pretrained "google/byt5-small" checkpoint: its tokenizer and the
# T5 conditional-generation model that the cells below fine-tune.
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")
model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")

In [None]:
def preprocess_function(examples, max_input_length=128, max_target_length=128):
    """Tokenize a batch of English/Guarani pairs for seq2seq fine-tuning.

    Args:
        examples: Batch mapping with ``"en"`` and ``"gn"`` lists of strings,
            as supplied by ``Dataset.map(..., batched=True)``.
        max_input_length: Length the prefixed English inputs are padded or
            truncated to.
        max_target_length: Length the Guarani targets are padded or truncated to.

    Returns:
        The tokenizer output for the prefixed English inputs, with an added
        ``"labels"`` entry holding the target token ids. Padding positions in
        the labels are set to -100.
    """
    inputs = [f"translate English to Guarani: {ex}" for ex in examples["en"]]
    targets = examples["gn"]

    model_inputs = tokenizer(
        inputs, max_length=max_input_length, padding="max_length", truncation=True
    )
    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets, max_length=max_target_length, padding="max_length", truncation=True
    )

    # Mask label padding with -100, the value the data collator in this
    # notebook also uses for label padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in label]
        for label in labels["input_ids"]
    ]
    return model_inputs


In [None]:
# Tokenize both splits with preprocess_function; the raw "en"/"gn" text
# columns are removed from the mapped datasets.
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=["en", "gn"])
tokenized_test = test_dataset.map(preprocess_function, batched=True, remove_columns=["en", "gn"])

# Collator that assembles batches for the seq2seq model; labels are padded
# with -100, the same value preprocess_function writes over label padding.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100
)
# chrF metric from the `evaluate` library, called by compute_metrics below.
chrf = evaluate.load("chrf")

def compute_metrics(eval_preds):
    """Decode generated ids and labels and score them with chrF.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case its first
            element is used.

    Returns:
        The dictionary returned by ``chrf.compute``.
    """
    import numpy as np  # local import: numpy is not imported at the top of this notebook

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a real token id, so it
    # must be mapped back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return chrf.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])



In [None]:
# Use the GPU when CUDA is available, otherwise the CPU. This global is read
# by translate() further down.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Fine-tuning configuration: batch size 8 for training and evaluation, three
# epochs, a checkpoint saved every epoch with at most one kept, generation-based
# prediction, and no experiment-tracker reporting.
# NOTE(review): output_dir is a Colab Drive path although the drive.mount()
# call near the top of the notebook is commented out.
# NOTE(review): the next cell assigns `training_args`, `data_collator` and
# `trainer` again, so this trainer (the only one given eval_dataset and
# compute_metrics) is replaced before `trainer.train()` runs.
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/byt5-gn-output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    predict_with_generate=True,
    save_strategy="epoch",
    save_total_limit=1,
    report_to=[]
)

# Trainer wired with the tokenized train/test splits, the collator and the
# chrF-based compute_metrics defined earlier.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


In [None]:
# These three names are already imported in the import cell at the top of the
# notebook; the import is repeated here.
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# Second definition of the training configuration. It overwrites the one from
# the previous cell and is the configuration in effect when `trainer.train()`
# is called. Unlike the previous cell it sets no save_total_limit.
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/byt5-gn-output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_strategy="epoch",
    predict_with_generate=True,
    report_to=[]  # disables wandb/tensorboard
)

# Rebuilds the collator without passing label_pad_token_id explicitly.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# This trainer receives no eval_dataset and no compute_metrics, so it is set
# up for training only.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Move the model to the device selected by the training arguments. As the last
# expression of the cell, the value returned by .to() is echoed as cell output.
model.to(training_args.device)

In [None]:
# Run fine-tuning with the trainer built in the previous cell.
trainer.train()

In [None]:
# Persist the fine-tuned model and its tokenizer side by side.
# NOTE(review): the target is a hard-coded Colab Drive folder; outside Colab
# (or without Drive mounted) this path needs to be changed.
trainer.save_model("/content/drive/MyDrive/BYT5 Files")
tokenizer.save_pretrained("/content/drive/MyDrive/BYT5 Files")

In [None]:
import ipywidgets as widgets
from IPython.display import display
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
import torch

def translate(text, max_new_tokens=64):
    """Translate one English sentence to Guarani with the fine-tuned model.

    Args:
        text: English sentence to translate.
        max_new_tokens: Upper bound on the number of generated tokens. The
            checkpoint is ByT5, which works on UTF-8 bytes, so the default
            limits the output to roughly 64 bytes; pass a larger value for
            long sentences.

    Returns:
        The decoded translation as a string.
    """
    prefix = "translate English to Guarani: "
    # Place the encoded prompt on the device the model actually lives on,
    # rather than on a separately computed global device.
    inputs = tokenizer(prefix + text, return_tensors="pt", padding=True).to(model.device)

    # The model may still be in training mode after trainer.train(); switch to
    # eval mode so dropout does not perturb generation.
    model.eval()
    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_beams=4,
            do_sample=False
        )

    return tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0]

# Text field for the English sentence, pre-filled with an example.
text_input = widgets.Text(
    value='Where are you going?',
    placeholder='Type a sentence...',
    description='English:',
    layout=widgets.Layout(width='90%')
)

# Output area that receives the printed translation, and the button that
# triggers it.
output_box = widgets.Output()
translate_button = widgets.Button(
    description='Translate to Guarani',
    button_style='success'
)

def on_translate_clicked(b):
    """Button callback: translate the text field's content into the output box.

    The output box is cleared first. A blank (whitespace-only) entry produces
    a prompt to enter a sentence; anything else is passed to ``translate``
    and the result is printed. The button argument ``b`` is not used.
    """
    output_box.clear_output()
    query = text_input.value.strip()
    with output_box:
        if query:
            print("Guarani:", translate(query))
        else:
            print("Please enter a sentence.")

# Run on_translate_clicked whenever the button is pressed.
translate_button.on_click(on_translate_clicked)

# Show the input field, the button and the output area stacked vertically.
display(widgets.VBox([
    text_input,
    translate_button,
    output_box
]))

In [None]:
from evaluate import load

def tokenize_eval_inputs(examples):
    """Tokenize the English side of a batch with the training-time prompt prefix.

    Named so that it does not shadow Python's built-in ``eval``.

    Args:
        examples: Batch mapping with an ``"en"`` list of strings.

    Returns:
        The tokenizer output, padded/truncated to 128 tokens.
    """
    inputs = [f"translate English to Guarani: {x}" for x in examples["en"]]
    return tokenizer(inputs, max_length=128, truncation=True, padding="max_length")


# Generation settings for scoring.
EVAL_BATCH_SIZE = 8
# The model is ByT5 (one token per UTF-8 byte) and was trained with targets of
# up to 128 tokens; a 64-token cap would cut most predictions short while the
# references stay full length.
EVAL_MAX_NEW_TOKENS = 128

# Stored under its own name so `tokenized_test` (which carries labels and is
# referenced by the trainer setup) is not overwritten.
tokenized_eval_inputs = test_dataset.map(
    tokenize_eval_inputs, batched=True, remove_columns=["en", "gn"]
)

references = [ex["gn"] for ex in test_dataset]
predictions = []

# The model may still be in training mode after trainer.train(); eval mode
# turns dropout off so the scores are deterministic.
model.eval()
for start in range(0, len(tokenized_eval_inputs), EVAL_BATCH_SIZE):
    # Slicing a Dataset yields a dict of column lists. Every row is already
    # padded to the same length, so the lists convert directly to tensors.
    batch = tokenized_eval_inputs[start:start + EVAL_BATCH_SIZE]
    input_ids = torch.tensor(batch["input_ids"]).to(model.device)
    attention_mask = torch.tensor(batch["attention_mask"]).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=EVAL_MAX_NEW_TOKENS,
        )

    predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))

assert len(predictions) == len(references), "one prediction per reference expected"

# word_order=2 is what makes the metric chrF++; the default (0) is plain chrF.
score = chrf.compute(
    predictions=predictions,
    references=[[r] for r in references],
    word_order=2,
)
print(f"ChrF++ score: {score['score']:.2f}")