In [None]:
!pip install transformers datasets evaluate --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# ----------------------------
# 1️⃣ Load Parallel Corpus
# ----------------------------
# NLLB language codes for the source (English) and target (isiXhosa) sides.
SRC_LANG = "eng_Latn"
TGT_LANG = "xho_Latn"

# Fixed seed so the train/test partition is identical on every run; without it
# the held-out set changes after each kernel restart, which makes BLEU numbers
# incomparable and lets a reloaded model be evaluated on sentences it trained on.
SPLIT_SEED = 42

# Each file holds one sentence per line; line i of both files forms a pair.
with open("/content/drive/MyDrive/English-isiXhosa/English1_cleaned_new.txt", encoding="utf-8") as f:
    en = f.read().strip().split("\n")
with open("/content/drive/MyDrive/English-isiXhosa/isiXhosa1_cleaned_new.txt", encoding="utf-8") as f:
    xh = f.read().strip().split("\n")

# The corpus is only usable if both sides are aligned line-for-line.
assert len(en) == len(xh), "❌ Mismatch in line counts!"

df = pd.DataFrame({"en": en, "xh": xh})
raw = Dataset.from_pandas(df)
# Hold out 10% of the sentence pairs for evaluation.
dataset = raw.train_test_split(test_size=0.1, seed=SPLIT_SEED)

print(f"✅ Dataset loaded: {len(dataset['train'])} train / {len(dataset['test'])} test samples")

✅ Dataset loaded: 72519 train / 8058 test samples


In [None]:
model_name = "facebook/nllb-200-distilled-600M"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Resolve the isiXhosa language code to its token id; generation is forced to
# start with this token so the decoder emits the target language.
forced_bos_token_id = tokenizer.convert_tokens_to_ids(TGT_LANG)

# Check if target language is in tokenizer's vocabulary.
# convert_tokens_to_ids maps unknown tokens to the <unk> id instead of None,
# so the lookup result has to be compared against unk_token_id as well.
if forced_bos_token_id is None or forced_bos_token_id == tokenizer.unk_token_id:
    raise ValueError(f"Target language '{TGT_LANG}' not found in tokenizer vocabulary.")

tokenizer.src_lang = SRC_LANG
tokenizer.tgt_lang = TGT_LANG
print(f"Using source language: {SRC_LANG}")
print(f"Using target language: {TGT_LANG}")

# Store the forced BOS token on the model's generation config so that every
# generate() call — including the ones issued during evaluation while
# training — decodes into isiXhosa without having to pass it explicitly.
model.generation_config.forced_bos_token_id = forced_bos_token_id
print("✅ Forced BOS Token ID:", forced_bos_token_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Using source language: eng_Latn
Using target language: xho_Latn
✅ Forced BOS Token ID: 256196


In [None]:
def preprocess(batch, max_length=128):
    """Tokenize a batch of English/isiXhosa sentence pairs for seq2seq training.

    Sequences are truncated to ``max_length`` but deliberately NOT padded here.
    Padding labels with the pad token id at this stage would make the loss
    count every padding position as a real target. Padding is left to the
    batch-time collator (``DataCollatorForSeq2Seq``), which pads inputs with
    the pad token and labels with -100 so padded positions are ignored.

    Args:
        batch: mapping with an ``"en"`` list of source sentences and an
            ``"xh"`` list of target sentences of the same length.
        max_length: maximum number of tokens kept per source/target sequence.

    Returns:
        The tokenizer output for the batch: source encodings plus a
        ``"labels"`` entry holding the target token ids.
    """
    # Explicitly set target language for tokenizer
    tokenizer.tgt_lang = TGT_LANG

    # A single call encodes both sides; `text_target` is tokenized in
    # target-language mode and returned under the "labels" key.
    return tokenizer(
        batch["en"],
        text_target=batch["xh"],
        max_length=max_length,
        truncation=True,
    )

# Apply preprocess to every split of `dataset`; batched=True passes whole
# batches (lists of sentences) to the function instead of single rows.
tokenized = dataset.map(preprocess, batched=True)
print("✅ Tokenization complete.")

Map:   0%|          | 0/72519 [00:00<?, ? examples/s]

Map:   0%|          | 0/8058 [00:00<?, ? examples/s]

✅ Tokenization complete.


In [None]:
import evaluate
import numpy as np

# Load the BLEU metric once at module level; compute_bleu reuses this object.
bleu = evaluate.load("bleu")

def compute_bleu(eval_preds):
    """Decode predicted and reference token ids and score them with BLEU.

    Args:
        eval_preds: pair ``(predictions, labels)`` of token-id arrays. The
            predictions may arrive as a tuple, in which case only its first
            element is used.

    Returns:
        ``{"bleu": <score>}`` where the score is the ``"bleu"`` field of the
        metric result.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored/padded positions and cannot be decoded, so swap it
    # for the pad token. This is needed for the predictions as well as the
    # labels: padded prediction arrays can contain -100 too.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    # The metric expects a list of references per prediction.
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["bleu"]}

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

In [None]:
import torch
# Release cached, currently unused GPU memory before the training cell runs.
torch.cuda.empty_cache()

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

# Hyper-parameters for fine-tuning; evaluation (loss + BLEU via generation)
# runs once at the end of every epoch.
# NOTE(review): this run prompted interactively for a Weights & Biases API
# key; pass report_to="none" here if training should run unattended.
training_args = Seq2SeqTrainingArguments(
    output_dir="./nllb_xhosa_model",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=5,
    save_total_limit=2,           # keep only the two most recent checkpoints
    predict_with_generate=True,   # metrics are computed on generated text
    logging_strategy="steps",
    logging_steps=10,
    generation_max_length=128,
    fp16=True
)

# Pads inputs and labels per batch; labels are padded with -100 so padded
# positions do not contribute to the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    # `tokenizer=` is deprecated on the trainer (it emitted a warning in this
    # notebook's output); `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_bleu
)

trainer.train()

  trainer = Seq2SeqTrainer(
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mministercmanga[0m ([33mministercmanga-university-of-zululand[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Bleu
1,0.398,0.358242,0.166059
2,0.314,0.340984,0.181146
3,0.2954,0.333229,0.190053
4,0.2848,0.330289,0.193578
5,0.2693,0.330525,0.195806




TrainOutput(global_step=45325, training_loss=0.3641288733074695, metrics={'train_runtime': 33095.253, 'train_samples_per_second': 10.956, 'train_steps_per_second': 1.37, 'total_flos': 9.822266165035008e+16, 'train_loss': 0.3641288733074695, 'epoch': 5.0})

In [None]:
# Run a final evaluation pass with the trainer and report the BLEU value
# produced by compute_bleu (exposed under the "eval_bleu" key).
results = trainer.evaluate()
print("✅ BLEU Score:", results["eval_bleu"])

✅ BLEU Score: 0.19580627388487581


In [None]:
# Write the fine-tuned weights and the tokenizer files into the same folder,
# model first, then tokenizer.
save_dir = "nllb-xhosa-finetuned"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
print("✅ Model saved successfully.")

✅ Model saved successfully.


In [None]:
text = "My family lives in the Eastern Cape province."

# Encode the sentence as PyTorch tensors and move them to the model's device.
encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
inputs = encoded.to(model.device)

# Beam search with 5 beams, at most 128 tokens, and the first generated token
# forced to the target-language id.
generation_options = {
    "forced_bos_token_id": forced_bos_token_id,
    "max_length": 128,
    "num_beams": 5,
}
translated_tokens = model.generate(**inputs, **generation_options)

# Decode the first returned sequence, dropping special tokens.
best_sequence = translated_tokens[0]
translated = tokenizer.decode(best_sequence, skip_special_tokens=True)
print("✅ Translated isiXhosa:", translated)

✅ Translated isiXhosa: usapho lwam luhlala kwiphondo lasempuma koloni .
