# Fine-tune NLLB-200 on Rutooro
This Colab notebook demonstrates how to fine-tune `facebook/nllb-200-distilled-600M` for English↔Rutooro translation.

In [None]:
# Install the notebook's third-party dependencies (quietly, via the IPython
# shell escape). Versions are not pinned, so the latest releases are used.
!pip install -q transformers datasets evaluate sacrebleu gradio

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from evaluate import load as load_metric

In [None]:
# Download the English-Tooro sentence-pair corpus from the Hugging Face Hub
# and keep a handle on its "train" split for preprocessing below.
dataset = load_dataset("michsethowusu/english-tooro_sentence-pairs_mt560")
train_ds = dataset["train"]

In [None]:
# Load the pretrained NLLB-200 checkpoint and its matching tokenizer.
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Language tags the tokenizer attaches to source text and to labels.
tokenizer.src_lang = "eng_Latn"
tokenizer.tgt_lang = "ttj_Latn"

# NOTE(review): "ttj_Latn" does not appear in the published NLLB-200 language
# list, in which case the tag would silently resolve to <unk>. Register it as
# a dedicated special token when the tokenizer does not already know it; this
# branch is a no-op if the code is in fact present.
tgt_lang_id = tokenizer.convert_tokens_to_ids(tokenizer.tgt_lang)
if tgt_lang_id == tokenizer.unk_token_id:
    tokenizer.add_tokens([tokenizer.tgt_lang], special_tokens=True)
    # Only grow the embedding matrix; never shrink it below the checkpoint size.
    if len(tokenizer) > model.get_input_embeddings().num_embeddings:
        model.resize_token_embeddings(len(tokenizer))
    tgt_lang_id = tokenizer.convert_tokens_to_ids(tokenizer.tgt_lang)

# Force generation (used for evaluation with predict_with_generate=True) to
# start with the target-language tag instead of letting the model pick one.
model.generation_config.forced_bos_token_id = tgt_lang_id

In [None]:
def preprocess(example):
    """Tokenize a batch of English/Rutooro pairs for seq2seq training.

    Args:
        example: A batch from the dataset with an "english" column (source
            text) and a "rutooro" column (target text).

    Returns:
        The tokenizer output for the source text, with the tokenized target
        ids stored under "labels".
    """
    # `text_target=` tokenizes the targets in target-language mode in the same
    # call; it replaces the deprecated `tokenizer.as_target_tokenizer()`
    # context manager and produces the same "labels" field.
    return tokenizer(
        example["english"],
        text_target=example["rutooro"],
        truncation=True,
    )

# Tokenize the whole training split; batched=True passes `preprocess` a batch
# of rows at a time rather than a single example.
processed = train_ds.map(preprocess, batched=True)

In [None]:
# Training configuration: checkpoints go to ./model, evaluation runs once per
# epoch, and evaluation uses generate() so BLEU is computed on real decodes.
args = Seq2SeqTrainingArguments(
    output_dir="./model",
    # `evaluation_strategy` was renamed to `eval_strategy` (transformers >= 4.41);
    # the old keyword is deprecated and rejected by recent releases, which is
    # what the unpinned `pip install` in this notebook pulls in.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    predict_with_generate=True,
)

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# SacreBLEU metric, used by compute_metrics to score generated translations.
metric = load_metric("sacrebleu")

In [None]:
def compute_metrics(eval_preds):
    """Compute corpus-level SacreBLEU for generated translations.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays as
            passed by the trainer. ``predictions`` may itself be a tuple, in
            which case its first element is used.

    Returns:
        A dict with a single key, ``"bleu"``, holding the SacreBLEU score.
    """
    import numpy as np  # local import keeps this cell self-contained

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore-index used for padding; it is not a valid token id and
    # makes batch_decode fail. It can appear in the predictions as well as the
    # labels when batches are padded together, so replace it in both.
    pad_id = tokenizer.pad_token_id
    preds = np.where(np.asarray(preds) != -100, preds, pad_id)
    labels = np.where(np.asarray(labels) != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip stray whitespace left over from decoding before scoring.
    predictions = [text.strip() for text in decoded_preds]
    # SacreBLEU expects a list of reference lists (one list per prediction).
    references = [[text.strip()] for text in decoded_labels]

    bleu = metric.compute(predictions=predictions, references=references)
    return {"bleu": bleu["score"]}

# Hold out 10% of the tokenized data for evaluation. Scoring on the very
# examples the model is trained on gives an inflated BLEU that says nothing
# about generalization, and runs generation over the full corpus every epoch.
# The fixed seed keeps the split reproducible across notebook runs.
splits = processed.train_test_split(test_size=0.1, seed=42)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=splits["train"],
    eval_dataset=splits["test"],
    data_collator=data_collator,
    # `processing_class` supersedes the deprecated `tokenizer=` keyword
    # (transformers >= 4.46); the tokenizer is still saved with the model.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the configuration defined above.
trainer.train()

In [None]:
# Write the final fine-tuned model to ./model (same directory as output_dir).
trainer.save_model("./model")

In [None]:
import gradio as gr
from app.gradio_demo import translate

# Minimal text-in/text-out Gradio demo around the project's `translate` helper.
# NOTE(review): "en-ttj" is presumably the English->Rutooro direction key
# understood by app.gradio_demo.translate — confirm against that module.
iface = gr.Interface(fn=lambda txt: translate(txt, "en-ttj"), inputs="text", outputs="text")
iface.launch()