In [None]:
!pip install -U transformers datasets evaluate accelerate sacrebleu --quiet # install if not installed already


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# mount drive
# The dataset paths used by later cells live under /content/drive, so the
# Drive has to be mounted before they run (google.colab is Colab-specific).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# initialization block
import os
import xml.etree.ElementTree as ET
from datasets import Dataset
import torch
import evaluate
from transformers import (
    NllbTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

# Locations of the input files on the mounted Drive.
base_path = "/content/drive/MyDrive/Colab Notebooks/CompLing" # change if needed to file path where datasets are stored
tmx_path = os.path.join(base_path, "en-gn.tmx")  # TMX file loaded as the training set
en_devtest_path = os.path.join(base_path, "eng_Latn.devtest")  # English side of the evaluation set
gn_devtest_path = os.path.join(base_path, "grn_Latn.devtest")  # Guarani side of the evaluation set

# load .tmx into a huggingface set for training
def load_tmx_as_dataset_fast(tmx_path, source_lang="en", target_lang="gn"):
    """Stream a TMX file and collect its aligned segment pairs into a Dataset.

    Every ``<tu>`` element is inspected once it has been completely parsed.
    Its ``<tuv>`` children are keyed by their ``xml:lang`` attribute, compared
    case-insensitively, and the unit is kept only when both requested
    languages have non-blank ``<seg>`` text.

    Args:
        tmx_path: Path of the TMX file to read.
        source_lang: Language code of the source side; also used as the key
            of the source text in each output row.
        target_lang: Language code of the target side; also used as the key
            of the target text in each output row.

    Returns:
        A ``Dataset`` built from rows shaped like
        ``{"translation": {source_lang: ..., target_lang: ...}}``.
    """
    xml_lang_attr = "{http://www.w3.org/XML/1998/namespace}lang"
    # Languages read from the file are stored lower-cased, so the requested
    # codes must be lower-cased too or e.g. "EN" would never match.
    source_key = source_lang.lower()
    target_key = target_lang.lower()

    examples = []
    for _event, elem in ET.iterparse(tmx_path, events=("end",)):
        if elem.tag != "tu":
            continue
        segs = {}
        for tuv in elem.findall("tuv"):
            lang = tuv.attrib.get(xml_lang_attr)
            seg = tuv.find("seg")
            if not lang or seg is None or not seg.text:
                continue
            text = seg.text.strip()
            # A whitespace-only segment is truthy before stripping; skip it so
            # no pair with an empty side reaches the training data.
            if text:
                segs[lang.lower()] = text
        if source_key in segs and target_key in segs:
            examples.append({
                "translation": {
                    source_lang: segs[source_key],
                    target_lang: segs[target_key],
                }
            })
        # Drop the unit's children once processed to keep memory use down.
        elem.clear()
    return Dataset.from_list(examples)

#load flores for evaluation
def load_flores_for_bleu(english_path: str, guarani_path: str) -> "Dataset":
    """Load two line-aligned text files as an English/Guarani evaluation set.

    Line ``i`` of one file is treated as the translation of line ``i`` of the
    other. When both files have the same number of lines, a pair is dropped
    if either side is blank, so the remaining pairs stay aligned. When the
    raw counts differ, blank lines are ignored per file and the remaining
    counts must match.

    Args:
        english_path: UTF-8 text file with one English sentence per line.
        guarani_path: UTF-8 text file with one Guarani sentence per line.

    Returns:
        A ``Dataset`` of rows shaped like
        ``{"translation": {"en": ..., "gn": ...}}``.

    Raises:
        ValueError: If the files cannot be paired line by line.
    """
    def _read_stripped(path):
        with open(path, encoding="utf-8") as handle:
            return [line.strip() for line in handle]

    en_lines = _read_stripped(english_path)
    gn_lines = _read_stripped(guarani_path)

    if len(en_lines) == len(gn_lines):
        # Filtering blanks independently per file would shift one side
        # against the other; dropping whole pairs keeps the alignment intact.
        pairs = [(en, gn) for en, gn in zip(en_lines, gn_lines) if en and gn]
    else:
        # Raw counts differ (e.g. stray blank lines in only one file):
        # ignore blanks per file and insist that the rest lines up.
        en_lines = [line for line in en_lines if line]
        gn_lines = [line for line in gn_lines if line]
        if len(en_lines) != len(gn_lines):
            raise ValueError(f"Line mismatch: {len(en_lines)} English vs {len(gn_lines)} Guarani.")
        pairs = list(zip(en_lines, gn_lines))

    data = [{"translation": {"en": en, "gn": gn}} for en, gn in pairs]
    return Dataset.from_list(data)
# extract translation columns
def extract_translation_columns(dataset):
    """Map each row to its "en" and "gn" strings taken from row["translation"]."""
    def _flatten(row):
        pair = row["translation"]
        return {"en": pair["en"], "gn": pair["gn"]}

    return dataset.map(_flatten)
# initialize data
# Load each split and expose its "en"/"gn" texts through extract_translation_columns.
train_dataset = extract_translation_columns(
    load_tmx_as_dataset_fast(tmx_path, source_lang="en", target_lang="gn")
)
test_dataset = extract_translation_columns(
    load_flores_for_bleu(en_devtest_path, gn_devtest_path)
)
# normalize orthographies for glottal stop between test and train sets
def normalize_glottal_stop(text):
    """Return ``text`` with every apostrophe, backslash-escaped or plain, written as "ʼ"."""
    # Handle the escaped form first so its backslash is consumed together
    # with the apostrophe instead of being left behind.
    without_escaped = text.replace("\\'", "ʼ")
    return without_escaped.replace("'", "ʼ")

def normalize_dataset(dataset):
    """Map each row to its "en" text unchanged and its "gn" text passed through normalize_glottal_stop."""
    def _normalize_row(row):
        return {"en": row["en"], "gn": normalize_glottal_stop(row["gn"])}

    return dataset.map(_normalize_row)

# Apply the glottal-stop normalization to both splits so their orthographies agree.
train_dataset = normalize_dataset(train_dataset)
test_dataset = normalize_dataset(test_dataset)

# select smaller range for testing
# The sizes are capped at the number of available rows so that a corpus with
# fewer examples than requested does not make select() fail on missing indices.
train_dataset = train_dataset.select(range(min(10000, len(train_dataset))))
test_dataset = test_dataset.select(range(min(500, len(test_dataset))))

Map:   0%|          | 0/2959122 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

Map:   0%|          | 0/2959122 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

In [None]:
# model block
model_name = "facebook/nllb-200-distilled-600M"  # pretrained checkpoint that gets fine-tuned
SRC_LANG = "eng_Latn"  # language code assigned to the tokenizer as source language
TGT_LANG = "grn_Latn"  # language code assigned to the tokenizer as target language
# get tokenizer for NLLB
tokenizer = NllbTokenizer.from_pretrained(model_name)
tokenizer.src_lang = SRC_LANG
tokenizer.tgt_lang = TGT_LANG
# load the seq2seq model from the same checkpoint as the tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# preprocessing function
def preprocess(batch, max_length=64):
    """Tokenize a batch of English/Guarani sentence pairs for seq2seq training.

    Args:
        batch: Mapping with parallel lists of strings under "en" (source
            sentences) and "gn" (target sentences).
        max_length: Number of tokens every source and label sequence is
            truncated or padded to.

    Returns:
        The tokenizer output for the source sentences, with the tokenized
        target sentences stored under "labels".
    """
    # `text_target` tokenizes the targets in the same call and stores them
    # under "labels"; it replaces the deprecated `as_target_tokenizer()`
    # context manager. Fixed-length padding is kept because the evaluation
    # cell batches these rows with a plain DataLoader, which needs
    # equally sized tensors.
    model_inputs = tokenizer(
        batch["en"],
        text_target=batch["gn"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    return model_inputs
# tokenize data to be passed into training loop
tokenized_train = train_dataset.map(preprocess, batched=True, remove_columns=["en", "gn"])
tokenized_test = test_dataset.map(preprocess, batched=True, remove_columns=["en", "gn"])


def mask_label_padding(batch):
    """Replace pad-token ids in the labels with -100 so the loss skips padded positions."""
    pad_id = tokenizer.pad_token_id
    return {
        "labels": [
            [-100 if token_id == pad_id else token_id for token_id in label_ids]
            for label_ids in batch["labels"]
        ]
    }


# preprocess() pads the labels to a fixed length with the pad token. Left as
# is, the model would also be trained to predict that padding. Only the
# training split is masked: the evaluation cell decodes the test labels back
# to text, which requires real token ids.
tokenized_train = tokenized_train.map(mask_label_padding, batched=True)

tokenized_train.set_format("torch")
tokenized_test.set_format("torch")

# training arguments with hyperparameters, do not change
# NOTE(review): no evaluation strategy is set here, so eval_steps may have no
# effect during training — confirm against the installed transformers version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./nllb_en2gn",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    max_steps=2000,
    weight_decay=0.01,
    save_total_limit=1,
    prediction_loss_only=True,
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    logging_steps=100,
    eval_steps=500,
    save_steps=1000,
    report_to="none"
)

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=8
)
# initialize trainer
# `processing_class` is the current name of the deprecated `tokenizer` argument.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    processing_class=tokenizer,
    data_collator=data_collator
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


In [None]:
# Train block: run the fine-tuning loop with the trainer configured in the model block
trainer.train()

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss
100,7.9306
200,5.4657
300,4.3687
400,3.441
500,2.6087
600,1.8316
700,1.2003
800,0.829
900,0.6835
1000,0.5828




TrainOutput(global_step=2000, training_loss=1.6962905406951905, metrics={'train_runtime': 2598.8901, 'train_samples_per_second': 24.626, 'train_steps_per_second': 0.77, 'total_flos': 8655415780835328.0, 'train_loss': 1.6962905406951905, 'epoch': 6.3904})

In [None]:
# Saving Block: write the fine-tuned model and the tokenizer to Drive.
# The demo cell further down loads both back from this same directory.
output_dir = "/content/drive/MyDrive/Colab Notebooks/CompLing/nllb_en2gn"
trainer.save_model(output_dir)  # model held by the trainer
tokenizer.save_pretrained(output_dir)  # tokenizer files go to the same directory

('/content/drive/MyDrive/Colab Notebooks/CompLing/nllb_en2gn/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/CompLing/nllb_en2gn/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/CompLing/nllb_en2gn/sentencepiece.bpe.model',
 '/content/drive/MyDrive/Colab Notebooks/CompLing/nllb_en2gn/added_tokens.json')

In [None]:
# eval block
from torch.utils.data import DataLoader
from transformers import GenerationConfig
from evaluate import load
import torch
# make sure it can predict for evaluation
trainer.args.prediction_loss_only = False
# empty cache for memory
torch.cuda.empty_cache()
tokenized_test.set_format(type="torch")
eval_dataloader = DataLoader(tokenized_test, batch_size=4)

# Use the GPU when there is one instead of failing on a hard-coded "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"
model.eval()
model.to(device)

# NLLB picks its output language from the first generated token, so force the
# Guarani language code rather than relying on the model to choose it.
target_lang_id = tokenizer.convert_tokens_to_ids(TGT_LANG)
pad_id = tokenizer.pad_token_id

generated_preds = []
reference_labels = []
# manual evaluation loop
for batch in eval_dataloader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            forced_bos_token_id=target_lang_id,
            max_new_tokens=64,
            num_beams=1,
            do_sample=False
        )

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # -100 marks label positions ignored by the loss and is not a real token
    # id; map any such entries back to padding before decoding.
    label_ids = batch["labels"]
    label_ids = label_ids.masked_fill(label_ids == -100, pad_id)
    labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    generated_preds.extend(decoded)
    # the metric expects a list of references per prediction
    reference_labels.extend([[label] for label in labels])

# ChrF++ Evaluation
# word_order=2 is what turns chrF into chrF++; the default (0) computes plain
# chrF, which would not match the label printed below.
chrf = load("chrf")
score = chrf.compute(
    predictions=generated_preds,
    references=reference_labels,
    word_order=2
)

print(f"ChrF++: {score['score']:.2f}")

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

ChrF++: 38.61


In [None]:
import ipywidgets as widgets
from IPython.display import display
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
import torch

# Load model and tokenizer
# Both are read back from the directory the saving block wrote to; this
# rebinds the notebook-level names `tokenizer` and `model`.
model_path = "/content/drive/MyDrive/Colab Notebooks/CompLing/nllb_en2gn"

tokenizer = NllbTokenizer.from_pretrained(model_path)
tokenizer.src_lang = "eng_Latn"  # source language code
tokenizer.tgt_lang = "grn_Latn"  # target language code

device = "cuda" if torch.cuda.is_available() else "cpu" # use GPU if able
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
# define a translate function
def translate(text, max_new_tokens=64):
    """Translate English text into Guarani with the loaded model.

    Args:
        text: Text handed to the tokenizer. Only the first decoded sequence
            is returned.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The first decoded output sequence with special tokens removed.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True).to(device)

    # NLLB picks its output language from the first generated token, so force
    # the tokenizer's target language code instead of leaving the choice to
    # the model.
    target_lang_id = tokenizer.convert_tokens_to_ids(tokenizer.tgt_lang)

    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            forced_bos_token_id=target_lang_id,
            max_new_tokens=max_new_tokens,
            num_beams=4,
            do_sample=False
        )

    return tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0]
# widget initialization
# Single-line field for the English sentence, pre-filled with an example.
text_input = widgets.Text(
    description='English:',
    placeholder='Type a sentence...',
    value='Where are you going?',
    layout=widgets.Layout(width='90%'),
)

# Button that starts the translation.
translate_button = widgets.Button(
    button_style='success',
    description='Translate to Guarani',
)

# Area the click handler prints its result into.
output_box = widgets.Output()

def on_translate_clicked(b):
    """Click handler: clear the output area, then print the translation or a prompt.

    Args:
        b: The object passed by the click callback; it is not used.
    """
    output_box.clear_output()
    entered = text_input.value.strip()
    with output_box:
        if entered:
            print("Guarani:", translate(entered))
        else:
            # Nothing left after trimming whitespace: ask for input instead.
            print("Please enter a sentence.")

# Run the handler whenever the button is clicked.
translate_button.on_click(on_translate_clicked)

# Show the input field, the button and the result area stacked in one box.
display(widgets.VBox([
    text_input,
    translate_button,
    output_box
]))

VBox(children=(Text(value='Where are you going?', description='English:', layout=Layout(width='90%'), placehol…