## IBM Model 2

### Install NLTK and required packages

In [None]:
!pip install nltk --quiet
import nltk
nltk.download('punkt')
nltk.download('perluniprops')

from nltk.translate import IBMModel2
from nltk.translate.api import AlignedSent
import os
import pickle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Unzipping misc/perluniprops.zip.


### Load aligned sentence pairs

In [None]:

file_path = "/content/aligned_sentences_clean.txt"

if not os.path.exists(file_path):
    raise FileNotFoundError(f" File not found: {file_path}")

spanish_sentences_clean = []
kekchi_sentences_clean = []

with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()
    for i in range(0, len(lines), 3):
        if i + 1 < len(lines):
            sp = lines[i].replace("Spanish:", "").strip()
            kq = lines[i + 1].replace("Kekchi:", "").strip()
            spanish_sentences_clean.append(sp.split())
            kekchi_sentences_clean.append(kq.split())

print(f"Loaded {len(spanish_sentences_clean)} aligned sentence pairs.")


Loaded 164903 aligned sentence pairs.


### Format for IBMModel2 (Kekchi = source, Spanish = target)

In [None]:
aligned_sentences = [AlignedSent(sp, kq) for sp, kq in zip(spanish_sentences_clean, kekchi_sentences_clean)]

In [None]:
aligned_sentences[:5]

[AlignedSent(['a', 'continuación', 'le', 'pregunté', 'y', 'está', 'usted', 'dispuesto', 'a', 'pagar', 'un', 'precio', 'tan', 'alto', 'por', 'el', 'evangelio'], ['laain', 'xinpatz', 're', 'ma', 'wan', 'aachool', 'chixtojbal', 'xnimal', 'li', 'tzaq', 'ain', 'choq', 're', 'li', 'evangelio'], Alignment([])),
 AlignedSent(['el', 'libro', 'de', 'mormón', 'doctrina', 'del', 'evangelio', 'manual', 'para', 'el', 'maestro'], ['el', 'libro', 'de', 'mormón', 'doctrina', 'del', 'evangelio', 'manual', 'para', 'el', 'maestro'], Alignment([])),
 AlignedSent(['el', 'libro', 'de', 'mormón', 'guía', 'de', 'estudio', 'para', 'el', 'miembro', 'de', 'la', 'clase'], ['el', 'libro', 'de', 'mormón', 'guía', 'de', 'estudio', 'para', 'el', 'miembro', 'de', 'la', 'clase'], Alignment([])),
 AlignedSent(['sí', 'abriré', 'el', 'corazón', 'de', 'los', 'del', 'pueblo', 'y', 'te', 'recibirán'], ['relik', 'chi', 'yaal', 'laain', 'tinte', 'xchooleb', 'li', 'tenamit', 'ut', 'eb', 'aan', 'tatexkul'], Alignment([])),
 Align

### Train IBM Model 2

In [None]:
print("Training IBM Model 2 for 20 iterations...")
ibm2 = IBMModel2(aligned_sentences, iterations=20)
print("IBM Model 2 training complete!")

Training IBM Model 2 for 20 iterations...
IBM Model 2 training complete!


In [None]:
ibm2

<nltk.translate.ibm2.IBMModel2 at 0x7d154d94f8d0>

### Save the model for future use

In [None]:
!pip install dill

Collecting dill
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Downloading dill-0.3.9-py3-none-any.whl (119 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/119.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m112.6/119.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.4/119.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dill
Successfully installed dill-0.3.9


In [None]:
import dill

model_path = "/content/ibm_model2_trained.pkl"
with open(model_path, "wb") as f:
    dill.dump(ibm2, f)

print(f"Saved trained model to '{model_path}'")

NameError: name 'ibm2' is not defined

In [None]:
with open(model_path, "rb") as f:
    ibm2 = dill.load(f)

EOFError: Ran out of input

## Use IBM Model 2 to Extract Gloss Triples

In [None]:
!ls -lh /content/ibm_model2_trained.pkl

-rw-r--r-- 1 root root 0 Apr  2 17:07 /content/ibm_model2_trained.pkl


### Aligned Sentences

In [None]:
def load_aligned_sentences(file_path):
    spanish = []
    kekchi = []

    with open(file_path, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f if line.strip()]

    for i in range(0, len(lines) - 1, 2):  # Stop before last line if odd
        if lines[i].startswith("Spanish:") and lines[i+1].startswith("Kekchi:"):
            sp = lines[i].replace("Spanish: ", "").strip()
            ke = lines[i+1].replace("Kekchi: ", "").strip()
            spanish.append(sp.split())
            kekchi.append(ke.split())
        else:
            print(f"Skipping unpaired or malformed lines at {i}: {lines[i:i+2]}")

    return list(zip(spanish, kekchi))

### Build Gloss Dictionary

 * IBM Model 2 to extract common glosses (alignment-based translations):

In [None]:
from collections import defaultdict

def build_gloss_dict(ibm2, sentence_pairs, min_count=3):
    from collections import defaultdict
    gloss_dict = defaultdict(list)

    for spanish, kekchi in sentence_pairs:
        aligned = AlignedSent(kekchi, spanish)
        ibm2.align(aligned)
        for src_idx, tgt_idx in aligned.alignment:
            if src_idx is None or tgt_idx is None:
                continue
            if tgt_idx < len(spanish) and src_idx < len(kekchi):
                gloss_dict[spanish[tgt_idx]].append(kekchi[src_idx])

    cleaned_gloss = {}
    for word, matches in gloss_dict.items():
        if len(matches) >= min_count:
            cleaned_gloss[word] = max(set(matches), key=matches.count)

    return cleaned_gloss

### Construct IGT Gloss Triples and Format for Pretraining

In [None]:
import json

def make_igt_jsonl(ibm2, sentence_pairs, gloss_dict, output_path):
    with open(output_path, "w", encoding="utf-8") as f_out:
        for spanish, kekchi in sentence_pairs:
            aligned = AlignedSent(kekchi, spanish)
            ibm2.align(aligned)

            gloss_line = [gloss_dict.get(word, word) for word in spanish]

            input_text = "[SRC] " + " ".join(spanish) + " [GLOSS] " + " ".join(gloss_line)
            target_text = " ".join(kekchi)

            json_obj = {
                "input": input_text,
                "target": target_text
            }
            f_out.write(json.dumps(json_obj, ensure_ascii=False) + "\n")

### Load

In [None]:
sentence_pairs = load_aligned_sentences("/content/aligned_sentences_clean.txt")

In [None]:
sentence_pairs[:5]

[(['a',
   'continuación',
   'le',
   'pregunté',
   'y',
   'está',
   'usted',
   'dispuesto',
   'a',
   'pagar',
   'un',
   'precio',
   'tan',
   'alto',
   'por',
   'el',
   'evangelio'],
  ['laain',
   'xinpatz',
   're',
   'ma',
   'wan',
   'aachool',
   'chixtojbal',
   'xnimal',
   'li',
   'tzaq',
   'ain',
   'choq',
   're',
   'li',
   'evangelio']),
 (['el',
   'libro',
   'de',
   'mormón',
   'doctrina',
   'del',
   'evangelio',
   'manual',
   'para',
   'el',
   'maestro'],
  ['el',
   'libro',
   'de',
   'mormón',
   'doctrina',
   'del',
   'evangelio',
   'manual',
   'para',
   'el',
   'maestro']),
 (['el',
   'libro',
   'de',
   'mormón',
   'guía',
   'de',
   'estudio',
   'para',
   'el',
   'miembro',
   'de',
   'la',
   'clase'],
  ['el',
   'libro',
   'de',
   'mormón',
   'guía',
   'de',
   'estudio',
   'para',
   'el',
   'miembro',
   'de',
   'la',
   'clase']),
 (['sí',
   'abriré',
   'el',
   'corazón',
   'de',
   'los',
   'del',
   '

### Build Gloss Dict

In [None]:
gloss_dict = build_gloss_dict(ibm2, sentence_pairs, min_count=3)

In [None]:
list(gloss_dict.items())[:10]

[('evangelio', 'evangelio'),
 ('pregunté', 'ma'),
 ('doctrina', 'doctrina'),
 ('para', 'chi'),
 ('del', 'del'),
 ('manual', 'manual'),
 ('el', 'chi'),
 ('libro', 'mormon'),
 ('maestro', 'maestro'),
 ('mormón', 'mormon')]

### Write JSONL

In [None]:
make_igt_jsonl(ibm2, sentence_pairs, gloss_dict, "mbart_igt_pretrain_v2.jsonl")

### Output

In [None]:
!head mbart_igt_pretrain.jsonl

{"input": "[SRC] a continuación le pregunté y está usted dispuesto a pagar un precio tan alto por el evangelio [GLOSS] chi jo jesus ma chi chi laa chi chi chi chi precio chi chi chi chi evangelio", "target": "laain xinpatz re ma wan aachool chixtojbal xnimal li tzaq ain choq re li evangelio"}
{"input": "[SRC] el libro de mormón doctrina del evangelio manual para el maestro [GLOSS] chi mormon chi mormon doctrina del evangelio manual chi chi maestro", "target": "el libro de mormón doctrina del evangelio manual para el maestro"}
{"input": "[SRC] el libro de mormón guía de estudio para el miembro de la clase [GLOSS] chi mormon chi mormon guía chi chi chi chi miembro chi chi chi", "target": "el libro de mormón guía de estudio para el miembro de la clase"}
{"input": "[SRC] sí abriré el corazón de los del pueblo y te recibirán [GLOSS] chi chi chi chi chi chi del chi chi chi chi", "target": "relik chi yaal laain tinte xchooleb li tenamit ut eb aan tatexkul"}
{"input": "[SRC] manual 2 administr

## Train on Gloss Dictionary

### Function for all evaluation

In [None]:
!pip install evaluate
!pip install unbabel-comet
!pip install numpy

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.5.0-py3-none-any.whl (49

Collecting unbabel-comet
  Downloading unbabel_comet-2.2.5-py3-none-any.whl.metadata (19 kB)
Collecting entmax<2.0,>=1.1 (from unbabel-comet)
  Downloading entmax-1.3-py3-none-any.whl.metadata (348 bytes)
Collecting jsonargparse==3.13.1 (from unbabel-comet)
  Downloading jsonargparse-3.13.1-py3-none-any.whl.metadata (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy<2.0.0,>=1.20.0 (from unbabel-comet)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<5.0.0,>=4.24.4 (from unbabel-comet)
  Downloading protobuf-4.25.6-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting pytorch-lightning<3.0.0,>=2.0.0 (from unbabel-comet)
  Downloading pytorch_lightning-2.5.1-py3-none-any.whl.metadata (2

In [None]:
!pip uninstall -y numpy
!pip install numpy --upgrade --force-reinstall

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy
  Downloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m110.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
unbabel-comet 2.2.5 requires numpy<2.0.0,>=1.20.0, but you have numpy 2.2.4 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.4 which is incompatible.
numba 0.60.0 r

In [None]:
!pip install numpy==1.23.5

Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m99.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.4
    Uninstalling numpy-2.2.4:
      Successfully uninstalled numpy-2.2.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.23.5 which is incompatible.
albucore 0.0.23 requires numpy>=1.24.4, but you have numpy 1.23.5 which is incompatible.
pymc 5.21.1 requires numpy>=1.25.0, but you have numpy 1.23.5 which is incompatible.
blosc2 3.2.1 requires nu

In [None]:
!pip install --force-reinstall evaluate unbabel-comet

Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting unbabel-comet
  Using cached unbabel_comet-2.2.5-py3-none-any.whl.metadata (19 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting numpy>=1.17 (from evaluate)
  Using cached numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting dill (from evaluate)
  Using cached dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from evaluate)
  Downloading pandas-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting requests>=2.19.0 (from evaluate)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.62.1 (from evaluate)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K    

In [None]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24986 sha256=5f7297fd5322b60118f12039d98d787a74cb92e9407851225b60371c1cb10513
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import numpy as np
from evaluate import load
from comet import download_model, load_from_checkpoint

def evaluate_mt_model(trainer, tokenizer, tokenized_test, raw_sources):
    print("🔍 Running model predictions...")
    preds = trainer.predict(tokenized_test)
    decoded_preds = tokenizer.batch_decode(preds.predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(preds.label_ids, skip_special_tokens=True)

    print("📏 Computing BLEU, ChrF++, ROUGE, and Exact Match...")
    bleu = load("sacrebleu")
    chrf = load("chrf")
    rouge = load("rouge")

    bleu_score = bleu.compute(predictions=decoded_preds,
                              references=[[ref] for ref in decoded_labels])["score"]
    chrf_score = chrf.compute(predictions=decoded_preds,
                              references=decoded_labels)["score"]
    rouge_score = rouge.compute(predictions=decoded_preds,
                                references=decoded_labels)["rougeL"]

    exact_matches = [pred.strip() == label.strip() for pred, label in zip(decoded_preds, decoded_labels)]
    exact_match_score = sum(exact_matches) / len(exact_matches) * 100

    print("🧠 Computing COMET score...")
    model_path = download_model("Unbabel/wmt22-comet-da")
    comet_model = load_from_checkpoint(model_path)

    comet_data = [
        {"src": raw_sources[i], "mt": decoded_preds[i], "ref": decoded_labels[i]}
        for i in range(len(decoded_preds))
    ]
    comet_score = comet_model.predict(comet_data, batch_size=8, gpus=1)
    comet_mean = np.mean(comet_score.scores)

    print("\n✅ Evaluation Summary:")
    print(f"BLEU:        {bleu_score:.2f}")
    print(f"ChrF++:      {chrf_score:.2f}")
    print(f"ROUGE-L:     {rouge_score:.2f}")
    print(f"Exact Match: {exact_match_score:.2f}%")
    print(f"COMET:       {comet_mean:.4f}")

    return {
        "BLEU": bleu_score,
        "ChrF++": chrf_score,
        "ROUGE-L": rouge_score,
        "Exact Match (%)": exact_match_score,
        "COMET": comet_mean
    }

### Load the JSONL into a HuggingFace Dataset

In [None]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="/content/mbart_igt_pretrain_v2.jsonl", split="train")
dataset = dataset.train_test_split(test_size=0.1)

train_data = dataset["train"]
test_data = dataset["test"]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
print(test_data.features)

{'input': Value(dtype='string', id=None), 'target': Value(dtype='string', id=None)}


In [None]:
train_data[:5]

{'input': ['[SRC] luego la hermana kristin m yee segunda consejera de la presidencia general de la sociedad de socorro ofrecerá la última oración [GLOSS] chi chi hermana kristin m yee segunda jo chi chi chi chi chi chi sociedad chi socorro elder chi jo chi',
  '[SRC] a salvar lo que se había perdido por michael t malm [GLOSS] chi dios chi chi chi chi chi chi michael t malm',
  '[SRC] por qué es bueno saber cocinar y desarrollar la autosuficiencia [GLOSS] chi ma chi us chi ha chi chi chi autosuficiencia',
  '[SRC] jesucristo también tiene la solución a este problema [GLOSS] jesucristo chi chi chi be chi chi laa',
  '[SRC] qué más aprendemos sobre maría [GLOSS] ma chi aprendemos chi maria'],
 'target': ['chirix aan taatijoq li hermana kristin m yee xkab aj tenq aj kehol naleb sa li jolomil awabejil re li komonil re tenqank',
  'xkolbal li karu sachbil xbaan laj michael t malm',
  'kaut naq aajel ru xnawbal kuubank ut xchaabilobresinkil li ilokib sa junesal',
  'rikin li jesukristo wan aj

In [None]:
test_data[:5]

{'input': ['[SRC] qué puede enseñarnos ella acerca de aceptar la voluntad de dios en cuanto a nosotros [GLOSS] ma chi chi chi chi chi chi chi chi chi dios chi jo chi chi',
  '[SRC] rudy es amable y compasivo y le dijo al gerente que estaba bien que tan solo repararan el daño y trajeran el mismo piano pero el gerente insistió en darnos uno nuevo [GLOSS] rudy chi mas chi a chi jesus jesus chi kay chi chi us chi chi chi repararan chi we chi trajeran chi chi piano chi chi kay elder chi jo chi chi',
  '[SRC] por ejemplo qué significa para ti la expresión disciplina y amonestación del señor [GLOSS] chi jo ma chi chi laa chi derechos disciplina chi chi del chi',
  '[SRC] no sabéis que hablo la verdad [GLOSS] chi ma chi at chi chi',
  '[SRC] nos impresionaron el contenido de su mensaje sobre evitar el pecado y la forma magistral en que utilizó el comportamiento común de los animales para enseñar una importante lección espiritual [GLOSS] chi impresionaron chi chi chi laa mensaje chi chi chi chi

### Load Pretrained Tokenizer & Model (e.g., mbart-large-50)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "facebook/mbart-large-50"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

### Tokenize the Dataset

In [None]:
def preprocess_function(examples):
    inputs = ["translate Spanish to Kekchi: " + x for x in examples["input"]]
    targets = examples["target"]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_test = test_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/148412 [00:00<?, ? examples/s]

Map:   0%|          | 0/16491 [00:00<?, ? examples/s]

### ALL data

In [None]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,
    max_steps=3000,
    eval_steps=500,
    save_steps=500,
    save_total_limit=1,
    logging_steps=100,
    evaluation_strategy="steps",
    predict_with_generate=True,
    fp16=True,
    report_to="none"
)



In [None]:
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test.select(range(1000)),
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model)
)

  trainer = Seq2SeqTrainer(


In [None]:
trainer.train()

Step,Training Loss,Validation Loss
500,0.562,0.502834
1000,0.4178,0.389405
1500,0.3648,0.346551
2000,0.3459,0.335576
2500,0.3312,0.306289
3000,0.3394,0.32371




TrainOutput(global_step=3000, training_loss=0.5480665804545085, metrics={'train_runtime': 1109.0464, 'train_samples_per_second': 86.561, 'train_steps_per_second': 2.705, 'total_flos': 2.6005557215232e+16, 'train_loss': 0.5480665804545085, 'epoch': 0.6468305304010349})

### Evaluation Score

In [None]:
raw_sources = [
    x.split("[GLOSS]")[0].replace("[SRC]", "").strip()
    for x in test_data["input"]
]


evaluate_mt_model(trainer, tokenizer, tokenized_test.select(range(1000)), raw_sources[:1000])

🔍 Running model predictions...


📏 Computing BLEU, ChrF++, ROUGE, and Exact Match...
🧠 Computing COMET score...


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/2760a223ac957f30acfb18c8aa649b01cf1d75f2/checkpoints/model.ckpt`
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:p


✅ Evaluation Summary:
BLEU:        35.31
ChrF++:      60.67
ROUGE-L:     0.59
Exact Match: 2.70%
COMET:       0.7304


{'BLEU': 35.314076533764045,
 'ChrF++': 60.6727602053105,
 'ROUGE-L': 0.59301523340475,
 'Exact Match (%)': 2.7,
 'COMET': 0.7303610388338566}