# MT System Evaluation

Implemented metrics:

- BLEU score
- chrF3 (https://www.aclweb.org/anthology/W15-3049.pdf)
- COMET (https://github.com/Unbabel/COMET)

BLEU and chrF3 are computed with sacreBLEU (Post, 2018).

Scores refer to corpus-level metrics


Results - baseline systems:

```
       System         |     BLEU     |   chrF3   |   COMET  
----------------------|--------------|-----------|-----------
ModernMT (base)       |     50.51    |   73.17   |   0.903
LLaMA 3.2 90B (base)  |     47.60    |   71.14   |   0.899
----------------------|--------------|-----------|-----------
Difference            |     2.91     |   2.02    |   0.004

```
[As of 22nd Jan 2025]



Installing and importing libraries

In [None]:
!pip install sacrebleu
!pip install unbabel-comet

model_path = download_model("Unbabel/wmt22-comet-da")
model = load_from_checkpoint(model_path)


In [None]:
from sacrebleu import sentence_bleu, corpus_bleu, corpus_chrf, sentence_chrf
import pandas as pd
from comet import download_model, load_from_checkpoint
import re

Defining files

In [71]:
# Locations of the input files and of the CSV report written at the end.
test_set = "test_en.txt"                    # source-side test set
reference = "test_it.txt"                   # reference translations
nmt_base = "baseline_mmt.txt"               # ModernMT baseline output
llm_base = "baseline_llama3_2_90b.txt"      # LLaMA 3.2 90B baseline output
# Placeholders for systems that are not defined yet: nmt_adp, llm_adp
report = "baseline_rep.csv"                 # per-segment report

## BLEU and chrF3



In [68]:
def _read_lines(path):
    """Return the lines of the UTF-8 text file at ``path`` without line terminators."""
    with open(path, "r", encoding="utf-8") as handle:
        return handle.read().splitlines()


# Load the source, the reference and both system outputs, one segment per line.
# The path variables (test_set, reference, nmt_base, llm_base) are only read
# here and never rebound, so this cell can be re-run safely.
source = _read_lines(test_set)
ref = _read_lines(reference)
base_MMT = _read_lines(nmt_base)
base_LLM = _read_lines(llm_base)

# Corpus-level scores; beta=3 selects chrF3.
print("BLEU NMT_b: ", corpus_bleu(base_MMT, [ref]).score)
print("chrF3  NMT_b: ", corpus_chrf(base_MMT, [ref], beta=3).score)
print("BLEU LLM_b: ", corpus_bleu(base_LLM, [ref]).score)
print("chrF3  LLM_b: ", corpus_chrf(base_LLM, [ref], beta=3).score)


BLEU NMT_b:  50.50687938590064
chrF3  NMT_b:  73.16738402926075
BLEU LLM_b:  47.60073478241307
chrF3  LLM_b:  71.14497579270626


## COMET

In [69]:
def _read_stripped_lines(path):
    """Return the lines of the UTF-8 text file at ``path``, each stripped of surrounding whitespace."""
    # An explicit encoding keeps decoding independent of the platform default.
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f]


srcs = _read_stripped_lines("test_en.txt")
refs = _read_stripped_lines("test_it.txt")
if len(srcs) != len(refs):
    raise ValueError(
        f"Source has {len(srcs)} lines but reference has {len(refs)}; "
        "the files must be aligned line by line."
    )

# Maps a system name to its list of COMET inputs (one dict per segment).
systems_data = {}

# System name -> file holding that system's translations.
# Add further systems here as extra "name": "file" entries.
system_files = {
    "MMT base": "baseline_mmt.txt",
    "LLaMA 3.2 90B base": "baseline_llama3_2_90b.txt",
}

for system_name, file_name in system_files.items():
    hyps = _read_stripped_lines(file_name)
    # Segments are paired by position, so a differing line count means the
    # hypotheses are misaligned with the source/reference.
    if len(hyps) != len(srcs):
        raise ValueError(
            f"{file_name} has {len(hyps)} lines but the source has {len(srcs)}."
        )
    # Build the src/mt/ref records passed to the COMET model for this system.
    systems_data[system_name] = [
        {"src": src_line, "mt": hyp_line, "ref": ref_line}
        for src_line, hyp_line, ref_line in zip(srcs, hyps, refs)
    ]


In [70]:
# Maps each system name to the prediction output the COMET model returns for it.
model_outputs = {}

# Score every system's segments with the loaded COMET model.
for system_name, system_data in systems_data.items():
    model_outputs[system_name] = model.predict(system_data, batch_size=8, gpus=1)


INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 13/13 [03:45<00:00, 17.31s/it]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
Predicting DataLoader 0: 100%|██████████| 13/13 [03:42<00:00, 17.10s/it]


In [None]:
# Preview: the first three COMET input records built for the "MMT base" system.
systems_data["MMT base"][0:3]

In [72]:
# Report, for every system, its segment-level scores followed by the
# system-level score, with a dashed rule closing each section.
for system_name, model_output in model_outputs.items():
    section = [
        f"Results for {system_name}:",
        str(model_output['scores']),  # individual scores
        f"System score: {model_output['system_score']:.3f}",
        "-" * 50,  # separator for readability
    ]
    print("\n".join(section))

Results for MMT base:
[0.6509407758712769, 0.9253664612770081, 0.9338033199310303, 0.8771232962608337, 0.901055097579956, 0.7829745411872864, 0.8443666696548462, 0.9193286299705505, 0.9517034888267517, 0.87870854139328, 0.9108638763427734, 0.8949227929115295, 0.8849826455116272, 0.9678471684455872, 0.8225812315940857, 0.9178102016448975, 0.9431930780410767, 0.8903849720954895, 0.9240874648094177, 0.8671470284461975, 0.9930422902107239, 0.864596962928772, 0.9329361319541931, 0.9194691777229309, 0.9471927285194397, 0.845785915851593, 0.9178642630577087, 0.6264358162879944, 0.9470693469047546, 0.895145833492279, 0.9382466673851013, 0.8624492287635803, 0.8855338096618652, 0.9130049347877502, 0.9501826763153076, 0.8938094973564148, 0.944483757019043, 0.8913909196853638, 0.926358163356781, 0.8598359227180481, 0.9413009285926819, 0.8555892109870911, 0.929564893245697, 0.8802431225776672, 0.9571325182914734, 0.9166709780693054, 0.894931972026825, 0.9303080439567566, 0.8357870578765869, 0.95846

In [73]:
# Column layout of the per-segment report: the four texts, then for each metric
# (BLEU, chrF3, COMET) the MMT score, the LLM score and their difference.
columns = [
    "source", "reference", "MMT (baseline)", "LLM (base)",
    "BLEU (MMT base)", "BLEU (LLM base)", "BLEU (difference base MMT/LLM)",
    "chrF3 (MMT base)", "chrF3 (LLM base)", "chrF3 (difference base MMT/LLM)",
    "COMET (MMT base)", "COMET (LLM base)", "COMET (difference base MMT/LLM)"
]

# Every list below is indexed by segment position, so each must have exactly
# one entry per source line; otherwise the rows would be misaligned.
segment_lists = {
    "reference": ref,
    "MMT output": base_MMT,
    "LLM output": base_LLM,
    "MMT COMET scores": model_outputs["MMT base"]["scores"],
    "LLM COMET scores": model_outputs["LLaMA 3.2 90B base"]["scores"],
}
for label, values in segment_lists.items():
    if len(values) != len(source):
        raise ValueError(
            f"{label} has {len(values)} entries but the source has {len(source)}."
        )

# Collect the rows in a plain list and build the DataFrame once at the end;
# this avoids growing the frame one row at a time.
rows = []
for i in range(len(source)):

    # Sentence-level BLEU with exponential smoothing, rounded to 3 decimals.
    BLEU_mmt_b = round(sentence_bleu(base_MMT[i], [ref[i]], smooth_method='exp').score, 3)
    BLEU_llm_b = round(sentence_bleu(base_LLM[i], [ref[i]], smooth_method='exp').score, 3)
    diff_BLEU_b = round(BLEU_mmt_b - BLEU_llm_b, 3)

    # beta=3 must be passed for BOTH systems: without it sentence_chrf falls
    # back to its default beta and the two columns would not be comparable.
    chrF3_mmt_b = round(sentence_chrf(base_MMT[i], [ref[i]], beta=3).score, 3)
    chrF3_llm_b = round(sentence_chrf(base_LLM[i], [ref[i]], beta=3).score, 3)
    diff_chrF3_b = round(chrF3_mmt_b - chrF3_llm_b, 3)

    # Segment-level COMET scores come from the earlier prediction step.
    comet_mmt_b = round(model_outputs["MMT base"]["scores"][i], 3)
    comet_llm_b = round(model_outputs["LLaMA 3.2 90B base"]["scores"][i], 3)
    diff_comet_b = round(comet_mmt_b - comet_llm_b, 3)

    rows.append([
        source[i], ref[i], base_MMT[i], base_LLM[i],
        BLEU_mmt_b, BLEU_llm_b, diff_BLEU_b,
        chrF3_mmt_b, chrF3_llm_b, diff_chrF3_b,
        comet_mmt_b, comet_llm_b, diff_comet_b
    ])

dataframe = pd.DataFrame(rows, columns=columns)

# Write the per-segment report without the index column.
dataframe.to_csv(report, index=False)
