In [1]:
!pip install -U transformers datasets evaluate accelerate sacrebleu --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
import transformers
print(transformers.__version__)

4.51.3


In [3]:
import torch
import os
import datasets
import evaluate
import tqdm
from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import SGD


import evaluate
import torch


In [4]:
def load_parallel_text_data(en_path, gn_path):
    with open(en_path, encoding="utf-8") as f_en:
        en_lines = [line.strip() for line in f_en if line.strip()]
    with open(gn_path, encoding="utf-8") as f_gn:
        gn_lines = [line.strip() for line in f_gn if line.strip()]
    if len(en_lines) != len(gn_lines):
        raise ValueError(f"Line mismatch: {len(en_lines)} English vs {len(gn_lines)} Guarani.")
    return datasets.Dataset.from_list([{"en": en, "gn": gn} for en, gn in zip(en_lines, gn_lines)])

def load_flores_for_bleu(english_path, guarani_path):
    with open(english_path, encoding="utf-8") as f_en:
        en_lines = [line.strip() for line in f_en if line.strip()]
    with open(guarani_path, encoding="utf-8") as f_gn:
        gn_lines = [line.strip() for line in f_gn if line.strip()]
    if len(en_lines) != len(gn_lines):
        raise ValueError(f"Line mismatch: {len(en_lines)} English vs {len(gn_lines)} Guarani.")
    data = [{"en": en, "gn": gn} for en, gn in zip(en_lines, gn_lines)]
    return datasets.Dataset.from_list(data)

base_path = "/content/drive/MyDrive/BYT5 Files"
train_en_path = f"{base_path}/NLLB.en-gn.en"
train_gn_path = f"{base_path}/NLLB.en-gn.gn"
en_devtest_path = f"{base_path}/eng_Latn.devtest"
gn_devtest_path = f"{base_path}/grn_Latn.devtest"

train_dataset = load_parallel_text_data(train_en_path, train_gn_path)
test_dataset = load_flores_for_bleu(en_devtest_path, gn_devtest_path)

def normalize_glottal_stop(text):
    return text.replace("\'", "ʼ").replace("'", "ʼ")

def normalize_dataset(dataset):
    return dataset.map(lambda x: {
        "en": x["en"],
        "gn": normalize_glottal_stop(x["gn"])
    })

train_dataset = normalize_dataset(train_dataset)
test_dataset = normalize_dataset(test_dataset)

train_dataset = train_dataset.shuffle().select(range(10000))
test_dataset = test_dataset.select(range(500))


Map:   0%|          | 0/2959122 [00:00<?, ? examples/s]

Map:   0%|          | 0/1012 [00:00<?, ? examples/s]

In [5]:
train_dataset[:5]

{'en': ["the town's.",
  'God tells us that "the blood of Jesus His Son purifies us from all sin" (1 John 1:7).',
  'There is no specific time for marriage - it\'s not something you can be late for," she says.',
  'As such, it is wise',
  'Let me tell you my concerns.'],
 'gn': ['tava rehegua.',
  'Javyʼa jaikuaávo la Biblia heʼiha: "Jesús ruguy ñanemopotĩ opa ñane rembiapo vaikuégui" (1 Juan 1:7).',
  'Neĩ peteĩ hendaʼi py ma voi ejapo eʼỹ eme ndeayvuague - heʼi.',
  'Upéva hína arandu',
  'ehejána taʼe ndéve che pyʼapype ohasáva']}

In [6]:
test_dataset[:5]

{'en': ['"We now have 4-month-old mice that are non-diabetic that used to be diabetic," he added.',
  'Dr. Ehud Ur, professor of medicine at Dalhousie University in Halifax, Nova Scotia and chair of the clinical and scientific division of the Canadian Diabetes Association cautioned that the research is still in its early days.',
  'Like some other experts, he is skeptical about whether diabetes can be cured, noting that these findings have no relevance to people who already have Type 1 diabetes.',
  'On Monday, Sara Danius, permanent secretary of the Nobel Committee for Literature at the Swedish Academy, publicly announced during a radio program on Sveriges Radio in Sweden the committee, unable to reach Bob Dylan directly about winning the 2016 Nobel Prize in Literature, had abandoned its efforts to reach him.',
  'Danius said, "Right now we are doing nothing. I have called and sent emails to his closest collaborator and received very friendly replies. For now, that is certainly enough

In [7]:
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")
model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
def preprocess_function(examples):
    inputs = [f"translate English to Guarani: {ex}" for ex in examples["en"]]
    targets = examples["gn"]

    model_inputs = tokenizer(
        inputs, max_length=128, padding="max_length", truncation=True
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, max_length=128, padding="max_length", truncation=True
        )

    labels["input_ids"] = [
        [token if token != tokenizer.pad_token_id else -100 for token in label]
        for label in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [9]:
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=["en", "gn"])
tokenized_test = test_dataset.map(preprocess_function, batched=True, remove_columns=["en", "gn"])

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100
)
chrf = evaluate.load("chrf")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return chrf.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])



Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/byt5-gn-output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    predict_with_generate=True,
    save_strategy="epoch",
    save_total_limit=1,
    report_to=[]
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer = Seq2SeqTrainer(


In [13]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/byt5-gn-output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    save_strategy="epoch",
    predict_with_generate=True,
    report_to=[]  # disables wandb/tensorboard
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    data_collator=data_collator
)

model.to(training_args.device)

  trainer = Seq2SeqTrainer(


T5ForConditionalGeneration(
  (shared): Embedding(384, 1472)
  (encoder): T5Stack(
    (embed_tokens): Embedding(384, 1472)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1472, out_features=384, bias=False)
              (k): Linear(in_features=1472, out_features=384, bias=False)
              (v): Linear(in_features=1472, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=1472, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1472, out_features=3584, bias=False)
              (wi_1): Linear(in_features=1472, out_features=3584, bias=False)
              (w

In [14]:
trainer.train()

🚀 Training on 10k samples...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,2.2827
1000,1.7663
1500,1.6513
2000,1.5817
2500,1.5427
3000,1.5192
3500,1.4981


TrainOutput(global_step=3750, training_loss=1.6783177001953125, metrics={'train_runtime': 2578.2306, 'train_samples_per_second': 11.636, 'train_steps_per_second': 1.454, 'total_flos': 6890621829120000.0, 'train_loss': 1.6783177001953125, 'epoch': 3.0})

In [18]:
trainer.save_model("/content/drive/MyDrive/BYT5 Files")
tokenizer.save_pretrained("/content/drive/MyDrive/BYT5 Files")

('/content/drive/MyDrive/BYT5 Files/tokenizer_config.json',
 '/content/drive/MyDrive/BYT5 Files/special_tokens_map.json',
 '/content/drive/MyDrive/BYT5 Files/added_tokens.json')

In [17]:
import ipywidgets as widgets
from IPython.display import display
from transformers import AutoModelForSeq2SeqLM, NllbTokenizer
import torch

def translate(text, max_new_tokens=64):
    prefix = "translate English to Guarani: "
    inputs = tokenizer(prefix + text, return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        output_tokens = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            num_beams=4,
            do_sample=False
        )

    return tokenizer.batch_decode(output_tokens, skip_special_tokens=True)[0]

text_input = widgets.Text(
    value='Where are you going?',
    placeholder='Type a sentence...',
    description='English:',
    layout=widgets.Layout(width='90%')
)

output_box = widgets.Output()
translate_button = widgets.Button(
    description='Translate to Guarani',
    button_style='success'
)

def on_translate_clicked(b):
    output_box.clear_output()
    sentence = text_input.value.strip()
    with output_box:
        if not sentence:
            print("Please enter a sentence.")
        else:
            translation = translate(sentence)
            print("Guarani:", translation)

translate_button.on_click(on_translate_clicked)

display(widgets.VBox([
    text_input,
    translate_button,
    output_box
]))

VBox(children=(Text(value='Where are you going?', description='English:', layout=Layout(width='90%'), placehol…

In [21]:
from evaluate import load

chrf = load("chrf")

def preprocess_eval(examples):
    inputs = [f"translate English to Guarani: {x}" for x in examples["en"]]
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    return model_inputs

tokenized_test = test_dataset.map(preprocess_eval, batched=True, remove_columns=["en", "gn"])

predictions = []
references = [ex["gn"] for ex in test_dataset]

for batch in tokenized_test:
    input_ids = torch.tensor(batch["input_ids"]).unsqueeze(0).to(model.device)
    attention_mask = torch.tensor(batch["attention_mask"]).unsqueeze(0).to(model.device)

    with torch.no_grad():
        output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=64)

    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    predictions.append(decoded)

score = chrf.compute(predictions=predictions, references=[[r] for r in references])
print(f"ChrF++ score: {score['score']:.2f}")

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

ChrF++ score: 10.17
