# Fine-tune NLLB-200 on Rutooro
This Colab notebook demonstrates how to fine-tune `facebook/nllb-200-distilled-600M` for English↔Rutooro translation.

## Setup: Clone the repository

In [None]:
# Clone latest repo version for runtime usage
!git clone https://github.com/nyacly/rutooro-mt-model.git
%cd rutooro-mt-model
!git pull origin main


## Mount Google Drive and set up folders

In [None]:
# Attach Google Drive so data, checkpoints and outputs persist across sessions
import os
from google.colab import drive

drive.mount('/content/drive')

# Persistent locations on Drive used by the remaining cells
data_dir = '/content/drive/MyDrive/rutooro-mt-data'
model_dir = '/content/drive/MyDrive/rutooro-mt-models'
output_dir = '/content/drive/MyDrive/rutooro-mt-outputs'

labelled_dirs = (
    ('Data directory:', data_dir),
    ('Model directory:', model_dir),
    ('Output directory:', output_dir),
)

# Create every folder first (no error if it already exists), then report them
for _, folder in labelled_dirs:
    os.makedirs(folder, exist_ok=True)
for label, folder in labelled_dirs:
    print(label, folder)


## Install dependencies

In [None]:
# Install all required dependencies
!pip install transformers datasets evaluate gradio sacrebleu


## Check GPU availability

In [None]:
import torch
# Report whether PyTorch can see a CUDA device in this runtime
cuda_ready = torch.cuda.is_available()
print('GPU available:', cuda_ready)


## Load and preprocess the dataset

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from evaluate import load as load_metric

# Only the 'train' split of the Hub dataset is loaded; the validation and
# test sets are carved out of it below.
dataset = load_dataset('michsethowusu/english-tooro_sentence-pairs_mt560')['train']

# 80:10:10 split: hold out 20 %, then halve the held-out part.
# The fixed seed keeps the partition reproducible between runs.
splits = dataset.train_test_split(test_size=0.2, seed=42)
val_test = splits['test'].train_test_split(test_size=0.5, seed=42)
train_ds, val_ds, test_ds = splits['train'], val_test['train'], val_test['test']


## Initialize model and tokenizer

In [None]:
# Base checkpoint to fine-tune
model_name = 'facebook/nllb-200-distilled-600M'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# NOTE(review): 'ttj_Latn' does not appear to be one of the language codes
# shipped with NLLB-200 - confirm against the checkpoint's language list.
# An unknown code would be mapped to the <unk> id, so every target sequence
# would be tagged with <unk> instead of a dedicated Rutooro language token.
# Register the code as a special token only when the tokenizer lacks it.
if tokenizer.convert_tokens_to_ids('ttj_Latn') == tokenizer.unk_token_id:
    tokenizer.add_tokens(['ttj_Latn'], special_tokens=True)
    # Grow the embedding matrix only if the new id does not fit; never shrink it.
    if len(tokenizer) > model.get_input_embeddings().num_embeddings:
        model.resize_token_embeddings(len(tokenizer))

# Translation direction used for tokenization: English -> Rutooro
tokenizer.src_lang = 'eng_Latn'
tokenizer.tgt_lang = 'ttj_Latn'


### Tokenization helper

In [None]:
def preprocess(example):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        example: A batch from the dataset with an 'english' column (source
            text) and a 'rutooro' column (target text).

    Returns:
        The tokenizer output for the English text, with the token ids of the
        Rutooro text stored under the 'labels' key.
    """
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager: the targets are encoded in target-language mode and returned
    # as `labels` in a single call.
    return tokenizer(
        example['english'],
        text_target=example['rutooro'],
        truncation=True,
    )

# Tokenize each split in batched mode with the helper defined in this cell
train_enc, val_enc, test_enc = (
    split.map(preprocess, batched=True)
    for split in (train_ds, val_ds, test_ds)
)


## Save datasets to Google Drive

In [None]:
# Persist the tokenized splits under data_dir so they can be reused later
encoded_targets = (
    (train_enc, f'{data_dir}/train_enc'),
    (val_enc, f'{data_dir}/val_enc'),
    (test_enc, f'{data_dir}/test_enc'),
)
for encoded_split, target_path in encoded_targets:
    encoded_split.save_to_disk(target_path)


## Training setup

In [None]:
# Hyper-parameters and checkpointing policy for fine-tuning.
args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    evaluation_strategy='epoch',
    # load_best_model_at_end requires the save strategy to match the
    # evaluation strategy; the default ('steps') makes this constructor raise.
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    save_total_limit=2,
    # Evaluate with generate() so compute_metrics receives generated token ids
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU runtimes.
    # `torch` is imported in the GPU-check cell.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    # Choose the best checkpoint (and drive early stopping) by the BLEU score
    # reported by compute_metrics rather than by the default evaluation loss.
    metric_for_best_model='bleu',
    greater_is_better=True,
)

# Pads inputs and labels per batch
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# sacreBLEU implementation from the `evaluate` package
metric = load_metric('sacrebleu')


## Train and evaluate

In [None]:
from transformers import EarlyStoppingCallback

def compute_metrics(eval_preds):
    """Compute the corpus BLEU score for one evaluation pass.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id batches as
            passed by the trainer. ``predictions`` may be a tuple, in which
            case its first element is used.

    Returns:
        A dict with a single key ``'bleu'`` holding the sacreBLEU score.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    def _replace_ignore_index(batch):
        # -100 is not a valid token id and cannot be decoded; swap it for the
        # pad token, which skip_special_tokens then drops.
        return [[int(tok) if tok != -100 else pad_id for tok in seq] for seq in batch]

    # Predictions can also carry -100 padding (not only labels), so both
    # sides are cleaned before decoding.
    decoded_preds = tokenizer.batch_decode(_replace_ignore_index(preds), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(_replace_ignore_index(labels), skip_special_tokens=True)

    # sacreBLEU expects a list of references per prediction
    bleu = metric.compute(
        predictions=decoded_preds,
        references=[[ref] for ref in decoded_labels],
    )
    return {'bleu': bleu['score']}

# Stop training once the monitored metric has not improved for 3 evaluations
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Collect the trainer configuration in one place before building the trainer
trainer_config = dict(
    model=model,
    args=args,
    train_dataset=train_enc,
    eval_dataset=val_enc,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)
trainer = Seq2SeqTrainer(**trainer_config)


### Start training

In [None]:
# Run fine-tuning with the trainer built in the previous cell
trainer.train()

## Save the model

In [None]:
# Save the trained model into model_dir (the Google Drive models folder);
# the loading cell below reads both the model and the tokenizer from this path.
trainer.save_model(model_dir)


## Load model from Google Drive

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Reload the tokenizer and the fine-tuned weights from the Drive folder
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)


## Run the Gradio demo

In [None]:
import gradio as gr
import app.gradio_demo as demo

# Point the repository's demo module at the fine-tuned model on Drive.
# NOTE(review): this presumes app.gradio_demo reads MODEL_DIR when translate()
# runs rather than at import time - confirm against the module.
demo.MODEL_DIR = model_dir
from app.gradio_demo import translate

# Text-in / text-out UI that always translates in the 'en-ttj' direction
iface = gr.Interface(
    fn=lambda txt: translate(txt, 'en-ttj'),
    inputs='text',
    outputs='text',
)
iface.launch()


### Next steps
You can now use the model saved in your Google Drive model directory (`/content/drive/MyDrive/rutooro-mt-models`), or run the demo above to interactively translate between English and Rutooro.