## mBART Pretraining with Hybrid Gaussian Random Embeddings for Low-Resource IGT

### Predefined function

In [None]:

# Remove any preinstalled copies of NumPy and the metric libraries before reinstalling them.
!pip uninstall -y numpy evaluate unbabel-comet rouge_score


# Pin NumPy to 1.23.5.
# NOTE(review): the recorded output shows pip reporting a dependency conflict for this pin
# (treescope requires numpy>=1.25.2) — confirm the pin is still required.
!pip install numpy==1.23.5


# Metric libraries used by evaluate_mt_model (evaluate, COMET, ROUGE).
# NOTE(review): the recorded output for this command ends in a pip traceback — confirm it completes.
!pip install evaluate unbabel-comet rouge_score

Found existing installation: numpy 1.23.5
Uninstalling numpy-1.23.5:
  Successfully uninstalled numpy-1.23.5
Found existing installation: evaluate 0.4.3
Uninstalling evaluate-0.4.3:
  Successfully uninstalled evaluate-0.4.3
Found existing installation: unbabel-comet 2.2.5
Uninstalling unbabel-comet-2.2.5:
  Successfully uninstalled unbabel-comet-2.2.5
Found existing installation: rouge_score 0.1.2
Uninstalling rouge_score-0.1.2:
  Successfully uninstalled rouge_score-0.1.2
Collecting numpy==1.23.5
  Using cached numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Using cached numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.23.5 which is incom

Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/commands/install.py", line 377, in run
    requirement_set = resolver.resolve(
                      ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/resolution/resolvelib/resolver.py", line 95, in resolve
    result = self._result = resolver.resolve(
                            ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_vendor/resolvelib/resolvers.py", line 546, in resolve
    state = resolution.resolve

In [None]:
# NOTE(review): --force-reinstall also reinstalls dependencies; the recorded output shows
# numpy 2.2.4 being collected here, which would replace the numpy==1.23.5 pin from the
# previous cell — confirm which NumPy version is actually intended.
!pip install --force-reinstall evaluate unbabel-comet

Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting unbabel-comet
  Using cached unbabel_comet-2.2.5-py3-none-any.whl.metadata (19 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting numpy>=1.17 (from evaluate)
  Using cached numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting dill (from evaluate)
  Using cached dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from evaluate)
  Using cached pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting requests>=2.19.0 (from evaluate)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.62.1 (from evaluate)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from evaluate)
  Using cached xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (

In [None]:
import numpy as np
from evaluate import load
from comet import download_model, load_from_checkpoint

def evaluate_mt_model(trainer, tokenizer, tokenized_test, raw_sources):
    """Generate translations for a test set and score them with several MT metrics.

    Runs ``trainer.predict`` on ``tokenized_test``, decodes predictions and
    labels with ``tokenizer``, then reports BLEU, chrF++, ROUGE-L, exact match
    and COMET (``Unbabel/wmt22-comet-da``). A summary is printed and the same
    numbers are returned.

    Args:
        trainer: Object whose ``predict(dataset)`` returns an output with
            ``predictions`` and ``label_ids`` token-id arrays.
        tokenizer: Tokenizer providing ``batch_decode`` and ``pad_token_id``.
        tokenized_test: Dataset passed unchanged to ``trainer.predict``.
        raw_sources: Source sentences, aligned by index with ``tokenized_test``;
            used as the ``src`` field for COMET. Extra trailing entries are ignored.

    Returns:
        dict with keys ``"BLEU"``, ``"ChrF++"``, ``"ROUGE-L"``,
        ``"Exact Match (%)"`` and ``"COMET"``.

    Raises:
        ValueError: If there are no predictions to score, or if ``raw_sources``
            has fewer entries than there are predictions.
    """
    # Local import: torch is only needed here to decide whether COMET can use a GPU.
    import torch

    def _decode(token_ids):
        # Some models return a tuple (generated ids, extra outputs); keep the ids.
        if isinstance(token_ids, tuple):
            token_ids = token_ids[0]
        token_ids = np.asarray(token_ids)
        # -100 is the ignore/padding sentinel used when batches of different
        # lengths are concatenated; it is not a valid vocabulary id, so map it
        # to the pad token before decoding.
        token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        return tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    print("🔍 Running model predictions...")
    preds = trainer.predict(tokenized_test)
    decoded_preds = _decode(preds.predictions)
    decoded_labels = _decode(preds.label_ids)

    # Fail early with a clear message instead of a ZeroDivisionError /
    # IndexError after the expensive metric and COMET steps.
    if not decoded_preds:
        raise ValueError("No predictions to evaluate: the test set is empty.")
    if len(raw_sources) < len(decoded_preds):
        raise ValueError(
            f"raw_sources has {len(raw_sources)} entries but there are "
            f"{len(decoded_preds)} predictions; they must be aligned."
        )

    print("📏 Computing BLEU, ChrF++, ROUGE, and Exact Match...")
    bleu = load("sacrebleu")
    chrf = load("chrf")
    rouge = load("rouge")

    # One reference per prediction, in the list-of-lists form the sacreBLEU-based metrics expect.
    single_refs = [[ref] for ref in decoded_labels]

    bleu_score = bleu.compute(predictions=decoded_preds,
                              references=single_refs)["score"]
    # word_order=2 is what makes this chrF++; the default (0) is plain chrF.
    chrf_score = chrf.compute(predictions=decoded_preds,
                              references=single_refs,
                              word_order=2)["score"]
    rouge_score = rouge.compute(predictions=decoded_preds,
                                references=decoded_labels)["rougeL"]

    exact_matches = [pred.strip() == label.strip() for pred, label in zip(decoded_preds, decoded_labels)]
    exact_match_score = sum(exact_matches) / len(exact_matches) * 100

    print("🧠 Computing COMET score...")
    model_path = download_model("Unbabel/wmt22-comet-da")
    comet_model = load_from_checkpoint(model_path)

    comet_data = [
        {"src": src, "mt": mt, "ref": ref}
        for src, mt, ref in zip(raw_sources, decoded_preds, decoded_labels)
    ]
    # Fall back to CPU when no CUDA device is present instead of failing.
    comet_gpus = 1 if torch.cuda.is_available() else 0
    comet_score = comet_model.predict(comet_data, batch_size=8, gpus=comet_gpus)
    comet_mean = float(np.mean(comet_score.scores))

    print("\n✅ Evaluation Summary:")
    print(f"BLEU:        {bleu_score:.2f}")
    print(f"ChrF++:      {chrf_score:.2f}")
    print(f"ROUGE-L:     {rouge_score:.2f}")
    print(f"Exact Match: {exact_match_score:.2f}%")
    print(f"COMET:       {comet_mean:.4f}")

    return {
        "BLEU": bleu_score,
        "ChrF++": chrf_score,
        "ROUGE-L": rouge_score,
        "Exact Match (%)": exact_match_score,
        "COMET": comet_mean
    }

### Install and Import Required Libraries

In [None]:
!pip install transformers datasets sentencepiece accelerate

import json

import numpy as np
import torch
from datasets import load_dataset
from transformers import (
    MBart50Tokenizer,
    MBartForConditionalGeneration,
    MBartTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)



### Load Interlinear Glossed Text (IGT) Data
* Spanish, Gloss, and Kekchi triples

In [None]:
from datasets import load_dataset

# Read the JSONL corpus as a single split, then hold out 10% as a test set.
# The fixed seed keeps the train/test partition identical across runs.
dataset = load_dataset(
    "json",
    data_files="/content/mbart_igt_pretrain_v2.jsonl",
    split="train",
).train_test_split(test_size=0.1, seed=42)

train_data, test_data = dataset["train"], dataset["test"]

In [None]:
# Peek at the first five training records; the slice is displayed as a dict of column lists.
train_data[:5]

{'input': ['[SRC] jesús dijo que el tabaco no es bueno para las personas [GLOSS] jesús jesus chi chi may chi chi us chi chi chi',
  '[SRC] cuanto más compasivo fiel y desinteresado sea nuestro servicio y sacrificio por él más podremos comenzar a comprender la compasión y gracia expiatoria de jesucristo por nosotros [GLOSS] jo chi a chi chi desinteresado chi chi chi chi chi chi chi chi chi chi chi chi chi compasión chi chi of chi jesucristo chi chi',
  '[SRC] aprenda cómo enviar sus himnos o canciones originales para que se consideren incluirlas en el nuevo himnario y canciones para los niños [GLOSS] jo chi laa chi chi chi jesus chi chi chi chi lee incluirlas chi chi chi himnario chi jesus chi chi chi',
  '[SRC] y habiendo creído que era la tierra de zarahemla ellos se volvieron a la tierra de nefi llegando a los confines del país no muchos días antes de la venida de ammón [GLOSS] chi chi we chi chi chi chi chi chi chi chi a chi chi chi chi nefi laa chi chi jo del chi chi chi chi chi ch

### Load mBART-50 Model and Tokenizer
* Plan: modify its output embeddings (via the shared embedding matrix `model.model.shared`) in the next step.

In [None]:
model_name = "facebook/mbart-large-50"
# The mBART-50 checkpoint is published with the MBart50Tokenizer class; loading it
# through MBartTokenizer triggers a class-mismatch warning and places the language
# code differently from what the checkpoint expects.
# The [SRC] side of the data is Spanish, so tag inputs with es_XX rather than the
# default en_XX. Kekchi has no mBART-50 language code, so no target language is set.
tokenizer = MBart50Tokenizer.from_pretrained(model_name, src_lang="es_XX")
model = MBartForConditionalGeneration.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'MBart50Tokenizer'. 
The class this function is called from is 'MBartTokenizer'.


### Hybrid Embedding Injection (γ * pretrained + (1 - γ) * random)
* For each token, mix its pretrained vector $e$ with a random Gaussian vector $r$ (first rescaled to unit length): $\tilde{e} = \gamma\, e + (1 - \gamma)\, r$, then normalize $\tilde{e}$ to unit length.

* Use a fixed γ value (e.g., 0.9 as used in the paper)

In [None]:
def inject_hybrid_embeddings(model, gamma=0.9, generator=None, chunk_size=8192):
    """Blend every shared-embedding row with unit-norm Gaussian noise, in place.

    Each row ``e`` of ``model.model.shared.weight`` is overwritten with
    ``normalize(gamma * e + (1 - gamma) * r)``, where ``r`` is a freshly
    sampled standard-normal vector rescaled to unit L2 norm and ``normalize``
    rescales the blend to unit L2 norm.

    Args:
        model: Object exposing a 2-D embedding matrix at
            ``model.model.shared.weight``.
        gamma: Weight on the pretrained vector, in ``[0, 1]``; the noise gets
            ``1 - gamma``.
        generator: Optional ``torch.Generator`` for reproducible noise. It must
            live on the same device as the embedding matrix. When ``None`` the
            global torch RNG is used.
        chunk_size: Number of rows processed at once; bounds the size of the
            temporary noise tensor.

    Returns:
        None. The embedding matrix is modified in place.

    Raises:
        ValueError: If ``gamma`` is outside ``[0, 1]`` or ``chunk_size < 1``.
    """
    if not 0.0 <= gamma <= 1.0:
        raise ValueError(f"gamma must be in [0, 1], got {gamma}")
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    embedding = model.model.shared.weight.data
    vocab_size = embedding.shape[0]

    # Do the arithmetic in at least float32 so half-precision weights are not
    # normalised with half-precision rounding; results are cast back on copy.
    compute_dtype = torch.promote_types(embedding.dtype, torch.float32)
    # Lower bound on norms so an all-zero row yields zeros instead of NaNs.
    eps = torch.finfo(compute_dtype).eps

    for start in range(0, vocab_size, chunk_size):
        rows = embedding[start:start + chunk_size]  # view: writes go to the model
        # Sample on the embedding's own device to avoid CPU/GPU mismatches.
        noise = torch.randn(
            rows.shape, generator=generator, dtype=compute_dtype, device=rows.device
        )
        noise /= noise.norm(dim=1, keepdim=True).clamp_min(eps)

        blended = gamma * rows.to(compute_dtype) + (1 - gamma) * noise
        blended /= blended.norm(dim=1, keepdim=True).clamp_min(eps)
        rows.copy_(blended)

# Seed the global torch RNG so the Gaussian perturbation is the same on every fresh run.
# Note: the injection mutates the embeddings in place, so re-running this cell
# without reloading the model perturbs them a second time.
torch.manual_seed(42)
inject_hybrid_embeddings(model, gamma=0.9)

* This perturbs every token embedding (rare and frequent tokens alike) while preserving some semantic structure from the pretrained model.

### Preprocess IGT Dataset (Tokenize Inputs and Targets)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of IGT records into model inputs and labels.

    Args:
        examples: Batch mapping with an ``"input"`` list of source strings and
            a ``"target"`` list of target strings.

    Returns:
        The tokenizer output for the prompted inputs, with an added
        ``"labels"`` entry holding the target token ids. Inputs and labels are
        both truncated/padded to 128 tokens.
    """
    # NOTE(review): labels are padded with the tokenizer's pad id rather than
    # -100, so padding positions are presumably included in the training loss —
    # confirm whether that is intended.
    tokenize_kwargs = dict(max_length=128, truncation=True, padding="max_length")

    prompt = "translate Spanish to Kekchi: "
    prompted_sources = [prompt + source for source in examples["input"]]
    target_texts = examples["target"]

    encoded = tokenizer(prompted_sources, **tokenize_kwargs)
    encoded["labels"] = tokenizer(target_texts, **tokenize_kwargs)["input_ids"]
    return encoded

# Apply the same batched preprocessing to the train split, then the test split.
tokenized_train, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_data, test_data)
)

### Train on All Data

In [None]:
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

In [None]:
training_args = Seq2SeqTrainingArguments(
    # Checkpointing: save every 500 steps, keep only the latest checkpoint.
    output_dir="./results",
    save_steps=500,
    save_total_limit=1,
    # Optimisation: 16 sequences x 2 accumulation steps = 32 sequences per
    # optimizer step per device, for a fixed budget of 3000 steps.
    max_steps=3000,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    fp16=True,
    # Evaluation and logging cadence.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    # Use generate() for evaluation/prediction so outputs can be decoded as text.
    predict_with_generate=True,
    # Disable external experiment trackers.
    report_to="none",
)



In [None]:
# Validate on at most the first 1,000 held-out examples to keep periodic
# evaluation cheap; min() avoids an out-of-range selection on smaller test splits.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test.select(range(min(1000, len(tokenized_test)))),
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
)

  trainer = Seq2SeqTrainer(


In [None]:
# Run fine-tuning. Training is capped by max_steps=3000; in the recorded run
# this stops at roughly 0.65 of an epoch.
trainer.train()

Step,Training Loss,Validation Loss
500,0.5503,0.626546
1000,0.3995,0.374769
1500,0.3667,0.333332
2000,0.329,0.307667
2500,0.3168,0.292332
3000,0.302,0.285617




TrainOutput(global_step=3000, training_loss=0.44192085456848146, metrics={'train_runtime': 1093.3379, 'train_samples_per_second': 87.805, 'train_steps_per_second': 2.744, 'total_flos': 2.6005557215232e+16, 'train_loss': 0.44192085456848146, 'epoch': 0.6468305304010349})

### Evaluation Score

In [None]:
# Install rouge_score before evaluation; the earlier force-reinstall cell did not include it.
# Presumably required by the "rouge" metric loaded in evaluate_mt_model — TODO confirm.
!pip install rouge_score

Collecting rouge_score
  Using cached rouge_score-0.1.2-py3-none-any.whl
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
# Score at most the first 1,000 test examples; clamping to the split size keeps
# the selection valid on smaller test sets and keeps both arguments aligned.
n_eval = min(1000, len(tokenized_test))

# Recover the plain source sentence from each "[SRC] ... [GLOSS] ..." input;
# COMET needs it as the `src` field.
raw_sources = [
    x.split("[GLOSS]")[0].replace("[SRC]", "").strip()
    for x in test_data["input"]
]


evaluate_mt_model(trainer, tokenizer, tokenized_test.select(range(n_eval)), raw_sources[:n_eval])

🔍 Running model predictions...


📏 Computing BLEU, ChrF++, ROUGE, and Exact Match...
🧠 Computing COMET score...


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/9.69k [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.40k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_


✅ Evaluation Summary:
BLEU:        38.77
ChrF++:      62.79
ROUGE-L:     0.61
Exact Match: 2.10%
COMET:       0.7383


{'BLEU': 38.770361124163784,
 'ChrF++': 62.78786486217276,
 'ROUGE-L': 0.6099308128000474,
 'Exact Match (%)': 2.1,
 'COMET': 0.7382668780088425}