<a href="https://colab.research.google.com/github/nyacly/rutooro-mt-model/blob/main/notebooks/train_nllb_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tune NLLB-200 on Rutooro
This Colab notebook demonstrates how to fine-tune `facebook/nllb-200-distilled-600M` for English↔Rutooro translation.

## Setup: Clone the repository

In [1]:
# Clone latest repo version for runtime usage
!git clone https://github.com/nyacly/rutooro-mt-model.git
%cd rutooro-mt-model
!git pull origin main


Cloning into 'rutooro-mt-model'...
remote: Enumerating objects: 50, done.[K
remote: Counting objects: 100% (50/50), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 50 (delta 17), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (50/50), 31.37 KiB | 1.49 MiB/s, done.
Resolving deltas: 100% (17/17), done.
/content/rutooro-mt-model
From https://github.com/nyacly/rutooro-mt-model
 * branch            main       -> FETCH_HEAD
Already up to date.


## Mount Google Drive and set up folders

In [2]:
# Mount Google Drive for persistent storage
from google.colab import drive
drive.mount('/content/drive')

import os
data_dir = '/content/drive/MyDrive/rutooro-mt-data'
model_dir = '/content/drive/MyDrive/rutooro-mt-models'
output_dir = '/content/drive/MyDrive/rutooro-mt-outputs'
for d in [data_dir, model_dir, output_dir]:
    os.makedirs(d, exist_ok=True)
print('Data directory:', data_dir)
print('Model directory:', model_dir)
print('Output directory:', output_dir)


Mounted at /content/drive
Data directory: /content/drive/MyDrive/rutooro-mt-data
Model directory: /content/drive/MyDrive/rutooro-mt-models
Output directory: /content/drive/MyDrive/rutooro-mt-outputs


## Install dependencies

In [3]:
# Install all required dependencies
!pip install transformers datasets evaluate gradio sacrebleu


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: po

## Check GPU availability

In [12]:
import torch
print('GPU available:', torch.cuda.is_available())


GPU available: True


## Load and preprocess the dataset

In [13]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from evaluate import load as load_metric

dataset = load_dataset('michsethowusu/english-tooro_sentence-pairs_mt560')['train']
# Split 80:10:10
splits = dataset.train_test_split(test_size=0.2, seed=42)
val_test = splits['test'].train_test_split(test_size=0.5, seed=42)
train_ds = splits['train']
val_ds = val_test['train']
test_ds = val_test['test']


## Initialize model and tokenizer

In [14]:
model_name = 'facebook/nllb-200-distilled-600M'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer.src_lang = 'eng_Latn'
tokenizer.tgt_lang = 'ttj_Latn'


### Tokenization helper

In [8]:
print(train_ds[0])
print(train_ds.column_names)

{'eng': 'But if we are not careful , in time we could forget how valuable these treasures are .', 'ttj': "Baitu kakuba tutegendereza , obwire nibusobora kuhika nitwebwa ngu ebintu binu by 'omuhendo ."}
['eng', 'ttj']


In [17]:
def preprocess(example):
    inputs = example['eng']
    targets = example['ttj']
    model_inputs = tokenizer(inputs, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, truncation=True)
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

train_enc = train_ds.map(preprocess, batched=True)
val_enc = val_ds.map(preprocess, batched=True)
test_enc = test_ds.map(preprocess, batched=True)


Map:   0%|          | 0/13153 [00:00<?, ? examples/s]

Map:   0%|          | 0/1644 [00:00<?, ? examples/s]

Map:   0%|          | 0/1645 [00:00<?, ? examples/s]

## Save datasets to Google Drive

In [16]:
# Store processed datasets for later reuse
train_enc.save_to_disk(f'{data_dir}/train_enc')
val_enc.save_to_disk(f'{data_dir}/val_enc')
test_enc.save_to_disk(f'{data_dir}/test_enc')


NameError: name 'data_dir' is not defined

## Training setup

In [6]:
!pip install --upgrade transformers datasets evaluate sacrebleu



In [2]:
import transformers
print(transformers.__file__)
print(transformers.__version__)


/usr/local/lib/python3.11/dist-packages/transformers/__init__.py
4.54.0


In [6]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    output_dir=model_dir,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    save_total_limit=2,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
    save_strategy="epoch",
    eval_strategy="epoch",
)


## Train and evaluate

In [18]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "facebook/nllb-200-distilled-600M"   # or your desired checkpoint

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)


In [19]:
# After defining preprocess and loading train_ds/val_ds
train_enc = train_ds.map(preprocess, batched=True)
val_enc = val_ds.map(preprocess, batched=True)
test_enc = test_ds.map(preprocess, batched=True)


Map:   0%|          | 0/13153 [00:00<?, ? examples/s]

TypeError: 'NoneType' object is not iterable

In [10]:
from transformers import EarlyStoppingCallback

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = [[(l if l != -100 else tokenizer.pad_token_id) for l in label] for label in labels]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    bleu = metric.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])
    return {'bleu': bleu['score']}

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_enc,
    eval_dataset=val_enc,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)


NameError: name 'train_enc' is not defined

### Start training

In [None]:
trainer.train()

## Save the model

In [None]:
trainer.save_model(model_dir)


## Load model from Google Drive

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)


## Run the Gradio demo

In [None]:
import gradio as gr
import app.gradio_demo as demo
demo.MODEL_DIR = model_dir
from app.gradio_demo import translate
iface = gr.Interface(fn=lambda txt: translate(txt, 'en-ttj'), inputs='text', outputs='text')
iface.launch()


### Next steps
You can now use the saved model in `./model` or run the demo above to interactively translate between English and Rutooro.