In [None]:
!pip install transformers datasets tokenizers accelerate

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (f

## Original dataset: aligned Spanish–Kekchi sentences

In [None]:
from google.colab import files

# Ask the user to upload the corpus through Colab's file-upload helper.
# The expected file is aligned_sentences_clean.txt.
uploaded = files.upload()  # aligned_sentences_clean.txt

Saving aligned_sentences_clean.txt to aligned_sentences_clean.txt


In [None]:
# Path of the uploaded corpus file inside the Colab runtime.
file_path = "/content/aligned_sentences_clean.txt"

## Parse the alternating lines into pairs

In [None]:
def parse_aligned_pairs(lines):
    """Extract (Spanish, Kekchi) sentence pairs from labelled lines.

    A pair is a line starting with "Spanish:" immediately followed by a line
    starting with "Kekchi:". Any line that does not open such a pair is
    skipped on its own, so one stray or missing line cannot shift the
    alignment of every pair that follows it.

    Args:
        lines: Non-empty, already stripped lines of the corpus file.

    Returns:
        A tuple ``(spanish, kekchi)`` of two equally long lists of strings.
    """
    spanish_prefix, kekchi_prefix = "Spanish:", "Kekchi:"
    spanish, kekchi = [], []

    idx = 0
    # `idx + 1 < len(lines)` also protects against a dangling last line.
    while idx + 1 < len(lines):
        first, second = lines[idx], lines[idx + 1]
        if first.startswith(spanish_prefix) and second.startswith(kekchi_prefix):
            # Slice off only the leading label; the sentence text itself may
            # legitimately contain the words "Spanish:" or "Kekchi:".
            spanish.append(first[len(spanish_prefix):].strip())
            kekchi.append(second[len(kekchi_prefix):].strip())
            idx += 2
        else:
            idx += 1  # resynchronise on the next line
    return spanish, kekchi


with open(file_path, "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if line.strip()]

spanish_sentences, kekchi_sentences = parse_aligned_pairs(lines)

print(f"Loaded {len(spanish_sentences)} sentence pairs.")

Loaded 164903 sentence pairs.


## Convert into JSONL format (for HuggingFace)

In [None]:
import json

# One JSON object per line: {"source": <Spanish>, "target": <Kekchi>}.
# ensure_ascii=False keeps accented characters readable in the file.
with open("spanish_kekchi.jsonl", "w", encoding="utf-8") as jsonl_out:
    jsonl_out.writelines(
        json.dumps({"source": es_text, "target": kek_text}, ensure_ascii=False) + "\n"
        for es_text, kek_text in zip(spanish_sentences, kekchi_sentences)
    )

##  Load the JSONL into a HuggingFace Dataset

In [None]:
from datasets import load_dataset

# Load every JSONL record as a single split, then carve out 10% for testing.
raw_dataset = load_dataset("json", data_files="spanish_kekchi.jsonl", split="train")

# A fixed seed keeps the train/test partition identical across kernel restarts,
# so metrics from different runs are computed on the same held-out sentences.
dataset = raw_dataset.train_test_split(test_size=0.1, seed=42)

train_data = dataset["train"]
test_data = dataset["test"]

Generating train split: 0 examples [00:00, ? examples/s]

## Load Pretrained Tokenizer & Model (e.g., mbart-large-50)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "facebook/mbart-large-50"

# The mBART-50 tokenizer prepends a language-code token to every sequence and
# falls back to "en_XX" when no source language is given. The inputs here are
# Spanish, so declare that explicitly.
# NOTE(review): Kekchi does not appear to have an mBART-50 language code, so
# no tgt_lang is set; targets are tokenized with the same source-side code.
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="es_XX")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

## Tokenize the Dataset

In [None]:
def preprocess_function(examples, max_length=128,
                        prefix="translate Spanish to Kekchi: "):
    """Tokenize a batch of source/target sentence pairs for seq2seq training.

    Args:
        examples: Batch mapping with "source" and "target" lists of strings
            (the layout `Dataset.map(..., batched=True)` passes in).
        max_length: Sequences are truncated and padded to this many tokens.
        prefix: Text prepended to every source sentence.

    Returns:
        The tokenizer output for the prefixed sources, with the target token
        ids stored under "labels".
    """
    inputs = [prefix + ex for ex in examples["source"]]
    targets = examples["target"]

    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=max_length, truncation=True, padding="max_length")

    # NOTE(review): labels are padded with the tokenizer's pad id, not -100, so
    # padding positions presumably count towards the loss. Masking them here
    # requires the decoding cells to map -100 back to the pad id first.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_test = test_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/148412 [00:00<?, ? examples/s]

Map:   0%|          | 0/16491 [00:00<?, ? examples/s]

##  Define Training Arguments

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # --- where artefacts go ---
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",
    # --- schedule and batch sizes ---
    num_train_epochs=3,               # three passes over the training set
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True,                        # mixed precision (needs a GPU)
    # --- logging, evaluation and checkpoints ---
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # --- evaluation uses generate() so predictions are decodable ---
    predict_with_generate=True,
)



## Start Training with Seq2SeqTrainer

In [None]:
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq

# Build the batch collator first so the trainer call stays compact.
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=seq2seq_collator,
)

trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

### Train on a small subset of the data

In [None]:
# Keep only the leading rows of each split for a quick experiment.
# NOTE: this rebinds both names, so the full tokenized splits are no longer
# reachable through `tokenized_train` / `tokenized_test` after this cell.
tokenized_train, tokenized_test = (
    tokenized_train.select(range(500)),  # Only 500 training examples
    tokenized_test.select(range(100)),   # Only 100 for testing
)

In [None]:
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq

# Gather the trainer's arguments in one mapping, then unpack them.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_test,
    "tokenizer": tokenizer,
    "data_collator": DataCollatorForSeq2Seq(tokenizer, model=model),
}
trainer = Seq2SeqTrainer(**trainer_kwargs)

trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,No log,0.355927
2,No log,0.390928
3,No log,0.460051
4,No log,0.527683
5,No log,0.567761
6,No log,0.592546
7,No log,0.607654
8,0.093700,0.610027
9,0.093700,0.616495
10,0.093700,0.617938




TrainOutput(global_step=630, training_loss=0.07495230985066248, metrics={'train_runtime': 427.2932, 'train_samples_per_second': 11.702, 'train_steps_per_second': 1.474, 'total_flos': 1354456104960000.0, 'train_loss': 0.07495230985066248, 'epoch': 10.0})

### Evaluate Model on the Test Set

In [None]:
# Evaluate on the trainer's eval_dataset and print the returned metrics dict.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.6179380416870117, 'eval_runtime': 1.1722, 'eval_samples_per_second': 85.312, 'eval_steps_per_second': 11.091, 'epoch': 10.0}


###  Compute BLEU Score on Predictions

In [None]:
!pip install evaluate sacrebleu --quiet

from evaluate import load
import numpy as np

bleu = load("sacrebleu")

# Get model predictions
preds = trainer.predict(tokenized_test)

# -100 is the ignore/padding index used for labels and padded outputs; it is
# not a real token id and cannot be decoded, so map it to the pad token first.
pred_ids = np.where(preds.predictions != -100, preds.predictions, tokenizer.pad_token_id)
label_ids = np.where(preds.label_ids != -100, preds.label_ids, tokenizer.pad_token_id)

decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

# sacreBLEU expects one list of references per prediction
bleu_score = bleu.compute(predictions=decoded_preds,
                          references=[[ref] for ref in decoded_labels])

print(f"\n🌍 BLEU score: {bleu_score['score']:.2f}")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]


🌍 BLEU score: 22.26


### COMET Score

In [None]:
!pip install -q unbabel-comet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.4/101.4 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m106.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.0/823.0 kB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m529.7/529.7 kB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency res

In [None]:
from comet import download_model, load_from_checkpoint


# Download the "Unbabel/wmt22-comet-da" checkpoint and load it from the
# path that download_model returns.
model_path = download_model("Unbabel/wmt22-comet-da")
comet_model = load_from_checkpoint(model_path)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/9.69k [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.40k [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


### Comet Inputs

In [None]:
# COMET takes one record per sentence: source text, machine translation, reference.
comet_data = []
for row_idx, hypothesis in enumerate(decoded_preds):
    comet_data.append({
        "src": tokenized_test[row_idx]["source"],
        "mt": hypothesis,
        "ref": decoded_labels[row_idx],
    })

### Comet scoring

In [None]:
import numpy as np

# Score every record on one GPU, then report the mean of the per-sentence scores.
comet_score = comet_model.predict(
    comet_data,
    batch_size=8,
    gpus=1,
)
mean_score = np.asarray(comet_score.scores).mean()
print(f"⚡ COMET score: {mean_score:.4f}")


INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 13/13 [00:00<00:00, 14.33it/s]


⚡ COMET score: 0.6425


### ChrF++

In [None]:
chrf = load("chrf")
# The metric defaults to word_order=0, which is plain chrF (character n-grams
# only). chrF++ also counts word uni- and bigrams, selected by word_order=2.
chrf_score = chrf.compute(predictions=decoded_preds, references=decoded_labels,
                          word_order=2)
print(f"ChrF++ score: {chrf_score['score']:.2f}")

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

ChrF++ score: 43.13


### ROUGE-L

In [None]:
!pip install rouge_score --quiet


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone


In [None]:
from evaluate import load

# Load the ROUGE metric and score the decoded predictions against the
# decoded references; only the "rougeL" entry of the result is printed.
rouge = load("rouge")
rouge_score = rouge.compute(predictions=decoded_preds, references=decoded_labels)
print(f"ROUGE-L score: {rouge_score['rougeL']:.2f}")

ROUGE-L score: 0.44


### Exact Match

In [None]:
# Compare predictions and references after trimming surrounding whitespace.
stripped_preds = [text.strip() for text in decoded_preds]
stripped_labels = [text.strip() for text in decoded_labels]
exact_matches = [hyp == ref for hyp, ref in zip(stripped_preds, stripped_labels)]

# Share of identical pairs, expressed as a percentage.
exact_match_score = exact_matches.count(True) / len(exact_matches) * 100
print(f"Exact Match: {exact_match_score:.2f}%")

Exact Match: 2.00%


## Function for all the scores

In [None]:
def evaluate_mt_model(trainer, tokenizer, tokenized_test,
                      comet_model_name="Unbabel/wmt22-comet-da",
                      comet_batch_size=8):
    """Generate translations for a test set and report five MT metrics.

    Args:
        trainer: Trainer whose `predict` returns generated token ids in
            `.predictions` and reference ids in `.label_ids`.
        tokenizer: Tokenizer used to decode both id arrays back to text.
        tokenized_test: Tokenized dataset to score; it must still carry the
            raw "source" text column, which COMET needs.
        comet_model_name: COMET checkpoint to download and load.
        comet_batch_size: Batch size passed to the COMET model.

    Returns:
        Dict with keys "BLEU", "ChrF++", "ROUGE-L", "Exact Match (%)" and
        "COMET".
    """
    from evaluate import load
    from comet import download_model, load_from_checkpoint
    import numpy as np
    import torch

    def _decode(token_ids):
        # Some models return a tuple; the ids are its first element.
        if isinstance(token_ids, tuple):
            token_ids = token_ids[0]
        # -100 is the ignore/padding index and is not a decodable token id.
        token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        return tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    print("🔍 Running model predictions...")
    preds = trainer.predict(tokenized_test)
    decoded_preds = _decode(preds.predictions)
    decoded_labels = _decode(preds.label_ids)

    print("📏 Computing BLEU, ChrF++, ROUGE, and Exact Match...")
    bleu = load("sacrebleu")
    chrf = load("chrf")
    rouge = load("rouge")

    bleu_score = bleu.compute(predictions=decoded_preds,
                              references=[[ref] for ref in decoded_labels])["score"]
    # word_order=2 is what makes this chrF++; the default (0) is plain chrF.
    chrf_score = chrf.compute(predictions=decoded_preds,
                              references=decoded_labels,
                              word_order=2)["score"]
    rouge_score = rouge.compute(predictions=decoded_preds,
                                references=decoded_labels)["rougeL"]

    exact_matches = [pred.strip() == label.strip() for pred, label in zip(decoded_preds, decoded_labels)]
    # Guard the division so an empty evaluation set reports 0 instead of raising.
    exact_match_score = sum(exact_matches) / len(exact_matches) * 100 if exact_matches else 0.0

    print("🧠 Computing COMET score...")
    model_path = download_model(comet_model_name)
    comet_model = load_from_checkpoint(model_path)

    # Read the source column once instead of materialising every full row.
    sources = tokenized_test["source"]
    comet_data = [
        {"src": src, "mt": mt, "ref": ref}
        for src, mt, ref in zip(sources, decoded_preds, decoded_labels)
    ]
    # Use one GPU when available, otherwise let COMET run on CPU.
    comet_score = comet_model.predict(comet_data, batch_size=comet_batch_size,
                                      gpus=1 if torch.cuda.is_available() else 0)
    comet_mean = np.mean(comet_score.scores)

    print("\n✅ Evaluation Summary:")
    print(f"BLEU:        {bleu_score:.2f}")
    print(f"ChrF++:      {chrf_score:.2f}")
    print(f"ROUGE-L:     {rouge_score:.2f}")
    print(f"Exact Match: {exact_match_score:.2f}%")
    print(f"COMET:       {comet_mean:.4f}")

    return {
        "BLEU": bleu_score,
        "ChrF++": chrf_score,
        "ROUGE-L": rouge_score,
        "Exact Match (%)": exact_match_score,
        "COMET": comet_mean
    }


## Faster training on the full dataset (fixed step budget)

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # --- where artefacts go ---
    output_dir="./results",
    report_to="none",
    # --- effective batch = 16 per device x 2 accumulation steps ---
    per_device_train_batch_size=16,         # try 8 or 32 as well
    gradient_accumulation_steps=2,
    fp16=True,                              # mixed precision for speed
    # --- fixed budget of optimizer updates rather than whole epochs ---
    max_steps=3000,
    # --- log every 100 steps; evaluate and checkpoint every 500 ---
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=1,
    # --- evaluation uses generate() so predictions are decodable ---
    predict_with_generate=True,
)



In [None]:
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq

# `tokenized_train` / `tokenized_test` were rebound to 500- and 100-row subsets
# in the "subset of data" section. On a fresh top-to-bottom run they would
# train on 500 rows and make `.select(range(1000))` raise. Re-tokenize from
# the untouched `train_data` / `test_data` splits instead.
full_tokenized_train = train_data.map(preprocess_function, batched=True)
eval_subset = (
    test_data
    .select(range(min(1000, len(test_data))))
    .map(preprocess_function, batched=True)
)

# NOTE(review): `model` is the object already fine-tuned by the earlier
# trainers in this notebook, so this run continues from those weights.
# Reload the checkpoint first if a clean start is intended.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=full_tokenized_train,
    eval_dataset=eval_subset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
)

  trainer = Seq2SeqTrainer(


In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

Step,Training Loss,Validation Loss
500,0.2611,0.332958
1000,0.3036,0.29853
1500,0.2844,0.278589
2000,0.2731,0.267525
2500,0.2689,0.258573
3000,0.2669,0.254527


TrainOutput(global_step=3000, training_loss=0.2755169010162353, metrics={'train_runtime': 1128.7229, 'train_samples_per_second': 85.052, 'train_steps_per_second': 2.658, 'total_flos': 2.6005557215232e+16, 'train_loss': 0.2755169010162353, 'epoch': 0.6468305304010349})

## Evaluate

In [None]:
# Score the first 1000 held-out sentences. They are re-tokenized from
# `test_data` because `tokenized_test` was cut down to 100 rows earlier, so
# `.select(range(1000))` on it fails when the notebook is run top to bottom.
metrics_eval_set = (
    test_data
    .select(range(min(1000, len(test_data))))
    .map(preprocess_function, batched=True)
)
evaluate_mt_model(trainer, tokenizer, metrics_eval_set)

🔍 Running model predictions...


📏 Computing BLEU, ChrF++, ROUGE, and Exact Match...
🧠 Computing COMET score...


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:p


✅ Evaluation Summary:
BLEU:        25.43
ChrF++:      46.95
ROUGE-L:     0.41
Exact Match: 0.50%
COMET:       0.6366


{'BLEU': 25.432607338788245,
 'ChrF++': 46.948690911816676,
 'ROUGE-L': np.float64(0.4084662529765539),
 'Exact Match (%)': 0.5,
 'COMET': np.float64(0.6366222840696574)}