<a href="https://colab.research.google.com/github/lucia-galiero/TICO-19_NMT_LLM/blob/main/eval_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MT System Evaluation

Implemented metrics:

- BLEU score
- chrF3 (https://www.aclweb.org/anthology/W15-3049.pdf)
- COMET (https://github.com/Unbabel/COMET)

BLEU and chrF3 are computed using SACREBLEU (Post 2018)

Scores refer to corpus-level metrics


Results - baseline systems:

```
       System         |     BLEU     |   chrF3   |   COMET  
----------------------|--------------|-----------|-----------
ModernMT (base)       |     50.50    |   73.17   |   0.903
ModernMT (adapted)    |     50.77    |   73.85   |   0.911
------------------------------------------------------------
LLaMa 3.2 90b (base)  |     47.60    |   71.14   |   0.899
LLaMa 3.2 90b (2-S)   |     48.60    |   72.39   |   0.907


```
[As of 30th Jan 2025]



Installing and importing libraries

In [None]:
!pip install sacrebleu
!pip install unbabel-comet

from comet import download_model, load_from_checkpoint
model_path = download_model("Unbabel/wmt22-comet-da")
model = load_from_checkpoint(model_path)


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.5.1
Collecting unbabel-comet
  Downloadin

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/3.38k [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/9.69k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

hparams.yaml:   0%|          | 0.00/567 [00:00<?, ?B/s]

model.ckpt:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/f49d328952c3470eff6bb6f545d62bfdb6e66304/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [None]:
# Make Google Drive available inside the Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
from sacrebleu import sentence_bleu, corpus_bleu, corpus_chrf, sentence_chrf
import pandas as pd
from comet import download_model, load_from_checkpoint
import re

Defining files

In [None]:
# Input corpora: English source and Italian reference, one segment per line.
test_set = 'test_en.txt'
reference = 'test_it.txt'

# System outputs (hypotheses), one segment per line.
nmt_base = 'baseline_mmt.txt'
nmt_adp = 'adapted_mmt.txt'
llm_base = 'baseline_llama3_2_90b.txt'
llm_adp = 'MAN_GROQ_2-s_llama_3_2_90B.txt'

# Destination of the per-segment CSV report.
report = 'baseline_rep.csv'

## BLEU and chrF3



In [None]:

def _read_lines(path):
    """Return the content of the UTF-8 text file `path` as a list of lines."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read().splitlines()


# Reference translations and the four system outputs, aligned line by line.
ref = _read_lines(reference)
base_MMT = _read_lines(nmt_base)
adap_MMT = _read_lines(nmt_adp)
base_LLM = _read_lines(llm_base)
adap_LLM = _read_lines(llm_adp)

# Corpus-level BLEU and chrF3 (chrF with beta=3) for every system against
# the single reference.
for tag, system_lines in (
    ("NMT_b", base_MMT),
    ("NMT_a", adap_MMT),
    ("LLM_b", base_LLM),
    ("LLM_a", adap_LLM),
):
    print(f"BLEU {tag}: ", corpus_bleu(system_lines, [ref]).score)
    print(f"chrF3  {tag}: ", corpus_chrf(system_lines, [ref], beta=3).score)

BLEU NMT_b:  50.50687938590064
chrF3  NMT_b:  73.16738402926075
BLEU NMT_a:  50.76907856115449
chrF3  NMT_a:  73.85479484694643
BLEU LLM_b:  47.60073478241307
chrF3  LLM_b:  71.14497579270626
BLEU LLM_a:  48.601634420377856
chrF3  LLM_a:  72.39204333099953


## COMET

In [None]:
def _read_stripped_lines(path):
    """Return the lines of the UTF-8 text file `path`, each stripped of
    leading/trailing whitespace (including the newline)."""
    # Explicit encoding so the result does not depend on the platform default
    # and matches how the BLEU/chrF3 cell reads the same files.
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f]


# Source and reference segments, aligned line by line.
# The paths come from the "Defining files" cell so each file is named once.
srcs = _read_stripped_lines(test_set)
refs = _read_stripped_lines(reference)
if len(srcs) != len(refs):
    raise ValueError(
        f"Source has {len(srcs)} segments but reference has {len(refs)}"
    )

# Display name of each evaluated system -> file holding its translations.
system_files = {
    "MMT base": nmt_base,
    "MMT adapted": nmt_adp,
    "LLaMA 3.2 90B base": llm_base,
    "LLaMA 3.2 90B 2-shot": llm_adp,
}

# For every system, one {"src", "mt", "ref"} record per segment; this is the
# structure passed to `model.predict` in the next cell.
systems_data = {}

for system_name, file_name in system_files.items():
    hyps = _read_stripped_lines(file_name)
    # Fail loudly on misaligned files instead of an IndexError (too few
    # hypotheses) or silently dropped segments (too many).
    if len(hyps) != len(srcs):
        raise ValueError(
            f"{file_name!r} has {len(hyps)} segments, expected {len(srcs)}"
        )
    systems_data[system_name] = [
        {"src": src, "mt": mt, "ref": ref_text}
        for src, mt, ref_text in zip(srcs, hyps, refs)
    ]


In [None]:
# COMET predictions keyed by system name.
model_outputs = {}

# Score one system at a time; results already computed stay available in
# `model_outputs` if a later system fails or the run is interrupted.
for system_name in systems_data:
    system_data = systems_data[system_name]
    model_output = model.predict(system_data, batch_size=8, gpus=1)
    model_outputs[system_name] = model_output


INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 13/13 [03:34<00:00, 16.53s/it]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 13/13 [03:26<00:00, 15.90s/it]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 13/13 [03:28<00:00, 16.08s/it]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: Fal

In [None]:
# Sanity check: show the first three COMET input records of one system.
systems_data["MMT base"][0:3]

In [None]:
# Report, for every system, the segment-level COMET scores followed by the
# system-level score (three decimals) and a separator line.
separator = "-" * 50
for name, prediction in model_outputs.items():
    print(f"Results for {name}:")
    print(prediction['scores'])
    print(f"System score: {prediction['system_score']:.3f}")
    print(separator)

Results for MMT base:
[0.6509407758712769, 0.9253664612770081, 0.9338033199310303, 0.8771232962608337, 0.901055097579956, 0.7829745411872864, 0.8443666696548462, 0.9193286299705505, 0.9517034888267517, 0.87870854139328, 0.9108638763427734, 0.8949227929115295, 0.8860388398170471, 0.9678471684455872, 0.8225812315940857, 0.9178102016448975, 0.9431930780410767, 0.8903849720954895, 0.9240874648094177, 0.8665159344673157, 0.9930422902107239, 0.864596962928772, 0.9329361319541931, 0.9194691777229309, 0.9471927285194397, 0.845785915851593, 0.9178642630577087, 0.6264358162879944, 0.9470693469047546, 0.895145833492279, 0.9382466673851013, 0.8624492287635803, 0.8855338096618652, 0.9130049347877502, 0.9501826763153076, 0.8938094973564148, 0.944483757019043, 0.8913909196853638, 0.926358163356781, 0.8598359227180481, 0.9413009285926819, 0.8555892109870911, 0.929564893245697, 0.8802431225776672, 0.9571325182914734, 0.9166709780693054, 0.894931972026825, 0.9303080439567566, 0.8357870578765869, 0.95846

In [None]:

# Per-segment report: source, reference and system outputs next to their
# sentence-level BLEU, chrF3 and COMET scores, written to `report` as CSV.
columns = [
    "source", "reference", "MMT (baseline)", "MMT (adapted)", "LLM (base)",
    "BLEU (MMT base)", "BLEU (MMT adapted)", "BLEU (LLM base)",
    "chrF3 (MMT base)", "chrF3 (MMT adapted)", "chrF3 (LLM base)",
    "COMET (MMT base)", "COMET (MMT adapted)", "COMET (LLM base)",
]


def _sentence_bleu_score(hyp, ref_sent):
    """Sentence-level BLEU (exponential smoothing), rounded to 3 decimals."""
    return round(sentence_bleu(hyp, [ref_sent], smooth_method='exp').score, 3)


def _sentence_chrf3_score(hyp, ref_sent):
    """Sentence-level chrF3 (beta=3), rounded to 3 decimals."""
    # beta is passed explicitly for every system: the columns are labelled
    # chrF3, and all systems must be scored with the same beta to be comparable.
    return round(sentence_chrf(hyp, [ref_sent], beta=3).score, 3)


# Segment-level COMET scores from the prediction cell.
comet_mmt_base = model_outputs["MMT base"]["scores"]
comet_mmt_adapted = model_outputs["MMT adapted"]["scores"]
comet_llm_base = model_outputs["LLaMA 3.2 90B base"]["scores"]

# The source segments come from `srcs` (built in the COMET cell); the COMET
# scores were computed on exactly these segments in this order.
n_segments = len(srcs)

# All inputs must be aligned one-to-one; otherwise rows would mix scores that
# belong to different segments.
for label, items in (
    ("reference", ref),
    ("MMT (baseline)", base_MMT),
    ("MMT (adapted)", adap_MMT),
    ("LLM (base)", base_LLM),
    ("COMET (MMT base)", comet_mmt_base),
    ("COMET (MMT adapted)", comet_mmt_adapted),
    ("COMET (LLM base)", comet_llm_base),
):
    if len(items) != n_segments:
        raise ValueError(
            f"{label!r} has {len(items)} entries, expected {n_segments}"
        )

# Collect the rows first and build the DataFrame once; growing a frame row by
# row with `.loc` is slow.
rows = []
for i in range(n_segments):
    # Values are listed in the same order as `columns` (base before adapted).
    rows.append([
        srcs[i], ref[i], base_MMT[i], adap_MMT[i], base_LLM[i],
        _sentence_bleu_score(base_MMT[i], ref[i]),
        _sentence_bleu_score(adap_MMT[i], ref[i]),
        _sentence_bleu_score(base_LLM[i], ref[i]),
        _sentence_chrf3_score(base_MMT[i], ref[i]),
        _sentence_chrf3_score(adap_MMT[i], ref[i]),
        _sentence_chrf3_score(base_LLM[i], ref[i]),
        round(comet_mmt_base[i], 3),
        round(comet_mmt_adapted[i], 3),
        round(comet_llm_base[i], 3),
    ])

dataframe = pd.DataFrame(rows, columns=columns)

dataframe.to_csv(report, index=False)
