# Organize all of the benchmarks:

In [None]:
pip install unbabel-comet

In [28]:
# from sacrebleu.metrics import BLEU, CHRF 
# def evaluate_translation_bleu(input_file, translated_file, reference_file):
#     # Function to read a file and extract non-blank lines
#     def read_file(file_path):
#         with open(file_path, 'r', encoding='utf-8') as file:
#             lines = [line.strip() for line in file if line.strip()]
#         return lines

#     # Read the files
#     input_lines = read_file(input_file)
#     translated_lines = read_file(translated_file)
#     reference_lines = [read_file(reference_file)]  # Note the list wrapping for multiple references support
#     #print(translated_lines[1]) # TODO remove
#     #print(reference_lines[0][1]) # TODO remove

#     # Initialize the BLEU object
#     bleu = BLEU()
#     chrf = CHRF() 

#     # Compute the BLEU score
#     score = bleu.corpus_score(translated_lines, reference_lines)
#     score2 = chrf.corpus_score(translated_lines, reference_lines)

#     # Print and return the BLEU score and its detailed breakdown
#     #print(f"Bleu Score: {score.score}")
#     #print(f"CHRF Score: {score2.score}")
#     #print(f"Full report 1: {score}")
#     #print(f"Full report 2: {score2}")
#     return (score.score, score2.score) 
from sacrebleu.metrics import BLEU, CHRF

def evaluate_translation_bleu(input_file, translated_file, reference_file, bleu_tokenize=None):
    """Score a translation file against a reference with corpus-level BLEU and chrF.

    Blank lines are dropped from every file before scoring. Scores are computed
    for the whole corpus and for each third of it; when the number of lines is
    not divisible by three, the last third absorbs the remainder.

    Args:
        input_file: Path to the source text, one segment per line. It is not
            used for scoring; only its line count is compared with the other
            two files.
        translated_file: Path to the system output, one segment per line.
        reference_file: Path to the reference translation, one segment per line.
        bleu_tokenize: Optional tokenizer name forwarded as
            ``BLEU(tokenize=...)``. ``None`` (default) builds ``BLEU()`` exactly
            as before. For Japanese targets a Japanese-aware tokenizer (for
            example ``'ja-mecab'`` or ``'char'``) is usually required for BLEU
            to be meaningful; see the sacrebleu documentation for the
            available names.

    Returns:
        dict mapping ``"whole_dataset"``, ``"first_third"``, ``"second_third"``
        and ``"third_third"`` to a ``(bleu_score, chrf_score)`` tuple.

    Warns:
        UserWarning: if the three files do not contain the same number of
            non-blank lines, because hypotheses and references are paired by
            position and would then be misaligned.
    """
    import warnings

    def read_file(file_path):
        # Keep only non-blank lines, stripped of surrounding whitespace.
        with open(file_path, 'r', encoding='utf-8') as file:
            return [line.strip() for line in file if line.strip()]

    def split_into_thirds(lst):
        # Integer division: any remainder ends up in the final slice.
        third = len(lst) // 3
        return lst[:third], lst[third:2 * third], lst[2 * third:]

    input_lines = read_file(input_file)
    translated_lines = read_file(translated_file)
    reference_lines = read_file(reference_file)

    # Blank lines are filtered per file, so one empty hypothesis shifts every
    # later pair. Surface that instead of silently scoring misaligned data.
    if not (len(input_lines) == len(translated_lines) == len(reference_lines)):
        warnings.warn(
            "Line counts differ after dropping blank lines "
            f"(input={len(input_lines)}, translated={len(translated_lines)}, "
            f"reference={len(reference_lines)}); segments may be misaligned.",
            stacklevel=2,
        )

    # Only pass `tokenize` when requested so the default construction is unchanged.
    bleu = BLEU() if bleu_tokenize is None else BLEU(tokenize=bleu_tokenize)
    chrf = CHRF()

    def compute_scores(translated, reference):
        # corpus_score takes a list of reference streams; there is a single one here.
        bleu_score = bleu.corpus_score(translated, [reference]).score
        chrf_score = chrf.corpus_score(translated, [reference]).score
        return bleu_score, chrf_score

    translated_thirds = split_into_thirds(translated_lines)
    reference_thirds = split_into_thirds(reference_lines)

    scores = {
        "whole_dataset": compute_scores(translated_lines, reference_lines),
        "first_third": compute_scores(translated_thirds[0], reference_thirds[0]),
        "second_third": compute_scores(translated_thirds[1], reference_thirds[1]),
        "third_third": compute_scores(translated_thirds[2], reference_thirds[2]),
    }

    return scores

# Example usage
# scores = evaluate_translation_bleu('input.txt', 'translated.txt', 'reference.txt')
# print(scores)

In [35]:
# from comet import download_model, load_from_checkpoint
# import torch

# def evaluate_translation_comet(input_file, translated_file, reference_file):
#     # Function to read a file and extract non-blank lines
#     def read_file(file_path):
#         with open(file_path, 'r', encoding='utf-8') as file:
#             lines = [line.strip() for line in file if line.strip()]
#         return lines

#     # Read the files
#     input_lines = read_file(input_file)
#     translated_lines = read_file(translated_file)
#     reference_lines = read_file(reference_file)  # No need to wrap in a list for COMET

#     # Initialize the COMET model
#     model_path = download_model("Unbabel/wmt22-comet-da")
#     comet_model = load_from_checkpoint(model_path)

#     # Prepare data for COMET
#     data = [{"src": src, "mt": mt, "ref": ref} for src, mt, ref in zip(input_lines, translated_lines, reference_lines)]

#     # Compute COMET scores
#     #print("DATA BEING TESTED:", data) 
#     model_output = comet_model.predict(data, gpus=0)

#     # Print COMET scores
#     print(f"COMET output:", model_output) 
#     return model_output
from comet import download_model, load_from_checkpoint
import torch

def evaluate_translation_comet(input_file, translated_file, reference_file, comet_model=None, gpus=0):
    """Score a translation file with COMET and average the segment scores.

    Blank lines are dropped from every file, the remaining lines are paired by
    position into ``{"src", "mt", "ref"}`` records, and the model's per-segment
    scores are averaged over the whole corpus and over each third of it (the
    last third absorbs any remainder).

    Args:
        input_file: Path to the source text, one segment per line.
        translated_file: Path to the system output, one segment per line.
        reference_file: Path to the reference translation, one segment per line.
        comet_model: Optional object exposing ``predict(data, gpus=...)`` whose
            result supports ``['scores']``. When ``None`` (default) the
            ``Unbabel/wmt22-comet-da`` checkpoint is downloaded and loaded once,
            then reused by later calls in the same process.
        gpus: Forwarded to ``comet_model.predict``; defaults to ``0`` as before.

    Returns:
        dict mapping ``"whole_dataset"``, ``"first_third"``, ``"second_third"``
        and ``"third_third"`` to the mean segment score (``0`` for an empty
        slice).

    Warns:
        UserWarning: if the three files do not contain the same number of
            non-blank lines; ``zip`` would otherwise silently truncate to the
            shortest file and pair misaligned segments.
    """
    import warnings

    def read_file(file_path):
        # Keep only non-blank lines, stripped of surrounding whitespace.
        with open(file_path, 'r', encoding='utf-8') as file:
            return [line.strip() for line in file if line.strip()]

    def split_into_thirds(lst):
        # Integer division: any remainder ends up in the final slice.
        third = len(lst) // 3
        return lst[:third], lst[third:2 * third], lst[2 * third:]

    def calculate_average(lst):
        return sum(lst) / len(lst) if lst else 0

    input_lines = read_file(input_file)
    translated_lines = read_file(translated_file)
    reference_lines = read_file(reference_file)

    if not (len(input_lines) == len(translated_lines) == len(reference_lines)):
        warnings.warn(
            "Line counts differ after dropping blank lines "
            f"(input={len(input_lines)}, translated={len(translated_lines)}, "
            f"reference={len(reference_lines)}); segments may be misaligned.",
            stacklevel=2,
        )

    if comet_model is None:
        # Loading the checkpoint dominates the runtime, so keep the loaded
        # model on the function object and reuse it across calls.
        model_name = "Unbabel/wmt22-comet-da"
        cache = getattr(evaluate_translation_comet, '_model_cache', None)
        if cache is None:
            cache = {}
            evaluate_translation_comet._model_cache = cache
        if model_name not in cache:
            cache[model_name] = load_from_checkpoint(download_model(model_name))
        comet_model = cache[model_name]

    # One record per aligned (source, hypothesis, reference) triple.
    data = [
        {"src": src, "mt": mt, "ref": ref}
        for src, mt, ref in zip(input_lines, translated_lines, reference_lines)
    ]

    model_output = comet_model.predict(data, gpus=gpus)
    scores = model_output['scores']

    first_third, second_third, third_third = split_into_thirds(scores)

    average_scores = {
        "whole_dataset": calculate_average(scores),
        "first_third": calculate_average(first_third),
        "second_third": calculate_average(second_third),
        "third_third": calculate_average(third_third),
    }

    return average_scores


## DeepL Evaluation 

### Generate Benchmarks

In [39]:
# DeepL, Japanese -> English: BLEU/chrF first, then COMET.
deepL_jpen_chrf_bleu = evaluate_translation_bleu(
    input_file='model_outputs/test/jp_to_en/in.txt',
    translated_file='model_outputs/test/jp_to_en/deepL/out.txt',
    reference_file='model_outputs/test/jp_to_en/out.txt',
)
deepL_jpen_comet = evaluate_translation_comet(
    input_file='model_outputs/test/jp_to_en/in.txt',
    translated_file='model_outputs/test/jp_to_en/deepL/out.txt',
    reference_file='model_outputs/test/jp_to_en/out.txt',
)

# DeepL, English -> Japanese.
# NOTE(review): BLEU is computed with sacrebleu's default tokenizer here; for
# Japanese output that presumably explains the near-zero EN-JP BLEU values.
# Confirm against the sacrebleu tokenizer documentation.
deepL_enjp_chrf_bleu = evaluate_translation_bleu(
    input_file='model_outputs/test/en_to_jp/in.txt',
    translated_file='model_outputs/test/en_to_jp/deepL/out.txt',
    reference_file='model_outputs/test/en_to_jp/out.txt',
)
deepL_enjp_comet = evaluate_translation_comet(
    input_file='model_outputs/test/en_to_jp/in.txt',
    translated_file='model_outputs/test/en_to_jp/deepL/out.txt',
    reference_file='model_outputs/test/en_to_jp/out.txt',
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
Predicting DataLoader 0: 100%|██| 57/57

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
Predicting DataLoader 0: 100%|██| 57/57

### Make Table

In [43]:
from tabulate import tabulate

def generate_translation_quality_table(enjp_bleu_chrf, jpen_bleu_chrf, enjp_comet, jpen_comet, missing_value=0):
    """Render BLEU, chrF and COMET scores for both directions as a grid table.

    Args:
        enjp_bleu_chrf: dict mapping segment name to a ``(bleu, chrf)`` pair for
            English -> Japanese, or ``None`` when that direction was not scored.
        jpen_bleu_chrf: Same for Japanese -> English.
        enjp_comet: dict mapping segment name to a COMET score for
            English -> Japanese, or ``None``.
        jpen_comet: Same for Japanese -> English.
        missing_value: Cell content used for every metric whose dict is
            ``None``. Defaults to ``0`` (the previous behaviour); pass e.g.
            ``"N/A"`` so unavailable metrics are not mistaken for a real score
            of zero.

    Returns:
        The table as a string produced by ``tabulate`` with ``tablefmt="grid"``,
        one row per segment.

    Raises:
        KeyError: if a provided (non-``None``) dict lacks one of the segments.
    """
    headers = ["Segment", "EN-JP BLEU", "EN-JP CHRF", "JP-EN BLEU", "JP-EN CHRF", "EN-JP COMET", "JP-EN COMET"]

    # Row labels come from the first dict that was actually supplied, so a
    # missing EN-JP direction does not force the hard-coded segment names.
    default_segments = ["whole_dataset", "first_third", "second_third", "third_third"]
    provided = [
        scores
        for scores in (enjp_bleu_chrf, jpen_bleu_chrf, enjp_comet, jpen_comet)
        if scores is not None
    ]
    segments = list(provided[0]) if provided else default_segments

    def bleu_chrf_for(scores, segment):
        # A missing direction contributes the placeholder to both columns.
        if scores is None:
            return missing_value, missing_value
        return scores[segment]

    def comet_for(scores, segment):
        return missing_value if scores is None else scores[segment]

    rows = []
    for segment in segments:
        enjp_bleu, enjp_chrf = bleu_chrf_for(enjp_bleu_chrf, segment)
        jpen_bleu, jpen_chrf = bleu_chrf_for(jpen_bleu_chrf, segment)
        rows.append([
            segment,
            enjp_bleu,
            enjp_chrf,
            jpen_bleu,
            jpen_chrf,
            comet_for(enjp_comet, segment),
            comet_for(jpen_comet, segment),
        ])

    return tabulate(rows, headers=headers, tablefmt="grid")

In [44]:
# Summary table for DeepL, both translation directions.
print(
    generate_translation_quality_table(
        enjp_bleu_chrf=deepL_enjp_chrf_bleu,
        jpen_bleu_chrf=deepL_jpen_chrf_bleu,
        enjp_comet=deepL_enjp_comet,
        jpen_comet=deepL_jpen_comet,
    )
)

+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| Segment       |   EN-JP BLEU |   EN-JP CHRF |   JP-EN BLEU |   JP-EN CHRF |   EN-JP COMET |   JP-EN COMET |
| whole_dataset |  0.000101534 |      35.0454 |      17.4314 |      48.864  |      0.657306 |      0.772182 |
+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| first_third   |  2.38586e-05 |      32.9298 |      19.5677 |      44.3947 |      0.668454 |      0.758157 |
+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| second_third  |  0.0154445   |      32.9994 |      14.0065 |      42.6844 |      0.688727 |      0.748618 |
+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| third_third   |  1.91163e-08 |      37.4417 |      17.6079 |      55.4434 |      0.614737 |      0.809771 |
+---------

## ALMA-R Evaluation 

### Generate Benchmarks 

In [45]:
# ALMA-R, Japanese -> English only.
# NOTE: EN-JP values are N/A for the 7B-parameter variant of ALMA-R, which outputs German.
almar_jpen_chrf_bleu = evaluate_translation_bleu(
    input_file='model_outputs/test/jp_to_en/in.txt',
    translated_file='model_outputs/test/jp_to_en/ALMA-R/out3.txt',
    reference_file='model_outputs/test/jp_to_en/out.txt',
)
almar_jpen_comet = evaluate_translation_comet(
    input_file='model_outputs/test/jp_to_en/in.txt',
    translated_file='model_outputs/test/jp_to_en/ALMA-R/out3.txt',
    reference_file='model_outputs/test/jp_to_en/out.txt',
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
Predicting DataLoader 0: 100%|██| 57/57

### Make Table 

In [47]:
# Summary table for ALMA-R; the EN-JP columns are filled with the function's placeholder.
print(
    generate_translation_quality_table(
        enjp_bleu_chrf=None,
        jpen_bleu_chrf=almar_jpen_chrf_bleu,
        enjp_comet=None,
        jpen_comet=almar_jpen_comet,
    )
)

+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| Segment       |   EN-JP BLEU |   EN-JP CHRF |   JP-EN BLEU |   JP-EN CHRF |   EN-JP COMET |   JP-EN COMET |
| whole_dataset |            0 |            0 |      9.4352  |      39.0808 |             0 |      0.741773 |
+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| first_third   |            0 |            0 |      9.40995 |      32.6018 |             0 |      0.731449 |
+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| second_third  |            0 |            0 |      9.04974 |      39.2609 |             0 |      0.720376 |
+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| third_third   |            0 |            0 |      9.69305 |      43.7714 |             0 |      0.773494 |
+---------

## M4T Model Evaluations 

### M4Tv1

In [48]:
# M4T v1, Japanese -> English.
m4tV1_jpen_chrf_bleu = evaluate_translation_bleu(
    input_file='model_outputs/test/jp_to_en/in.txt',
    translated_file='model_outputs/test/jp_to_en/m4tv1/out.txt',
    reference_file='model_outputs/test/jp_to_en/out.txt',
)
m4tV1_jpen_comet = evaluate_translation_comet(
    input_file='model_outputs/test/jp_to_en/in.txt',
    translated_file='model_outputs/test/jp_to_en/m4tv1/out.txt',
    reference_file='model_outputs/test/jp_to_en/out.txt',
)

# M4T v1, English -> Japanese.
m4tV1_enjp_chrf_bleu = evaluate_translation_bleu(
    input_file='model_outputs/test/en_to_jp/in.txt',
    translated_file='model_outputs/test/en_to_jp/m4tv1/out.txt',
    reference_file='model_outputs/test/en_to_jp/out.txt',
)
m4tV1_enjp_comet = evaluate_translation_comet(
    input_file='model_outputs/test/en_to_jp/in.txt',
    translated_file='model_outputs/test/en_to_jp/m4tv1/out.txt',
    reference_file='model_outputs/test/en_to_jp/out.txt',
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
Predicting DataLoader 0: 100%|██| 57/57

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
Predicting DataLoader 0: 100%|██| 57/57

In [50]:
# Summary table for M4T v1, both translation directions.
print(
    generate_translation_quality_table(
        enjp_bleu_chrf=m4tV1_enjp_chrf_bleu,
        jpen_bleu_chrf=m4tV1_jpen_chrf_bleu,
        enjp_comet=m4tV1_enjp_comet,
        jpen_comet=m4tV1_jpen_comet,
    )
)

+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| Segment       |   EN-JP BLEU |   EN-JP CHRF |   JP-EN BLEU |   JP-EN CHRF |   EN-JP COMET |   JP-EN COMET |
| whole_dataset |   0.117905   |      20.8832 |     11.4358  |      40.4847 |      0.629256 |      0.696393 |
+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| first_third   |   0.0798192  |      13.003  |      8.1234  |      29.5013 |      0.62253  |      0.620249 |
+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| second_third  |   0.515763   |      22.0577 |      9.04282 |      34.6015 |      0.659661 |      0.672046 |
+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| third_third   |   0.00140051 |      25.4382 |     15.527   |      51.6501 |      0.605578 |      0.796883 |
+---------

### M4Tv2 

In [52]:
# M4T v2, Japanese -> English.
m4tV2_jpen_chrf_bleu = evaluate_translation_bleu(
    input_file='model_outputs/test/jp_to_en/in.txt',
    translated_file='model_outputs/test/jp_to_en/m4tv2/out.txt',
    reference_file='model_outputs/test/jp_to_en/out.txt',
)
m4tV2_jpen_comet = evaluate_translation_comet(
    input_file='model_outputs/test/jp_to_en/in.txt',
    translated_file='model_outputs/test/jp_to_en/m4tv2/out.txt',
    reference_file='model_outputs/test/jp_to_en/out.txt',
)

# M4T v2, English -> Japanese.
m4tV2_enjp_chrf_bleu = evaluate_translation_bleu(
    input_file='model_outputs/test/en_to_jp/in.txt',
    translated_file='model_outputs/test/en_to_jp/m4tv2/out.txt',
    reference_file='model_outputs/test/en_to_jp/out.txt',
)
m4tV2_enjp_comet = evaluate_translation_comet(
    input_file='model_outputs/test/en_to_jp/in.txt',
    translated_file='model_outputs/test/en_to_jp/m4tv2/out.txt',
    reference_file='model_outputs/test/en_to_jp/out.txt',
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
Predicting DataLoader 0: 100%|██| 57/57

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
Predicting DataLoader 0: 100%|██| 57/57

In [53]:
# Summary table for M4T v2, both translation directions.
print(
    generate_translation_quality_table(
        enjp_bleu_chrf=m4tV2_enjp_chrf_bleu,
        jpen_bleu_chrf=m4tV2_jpen_chrf_bleu,
        enjp_comet=m4tV2_enjp_comet,
        jpen_comet=m4tV2_jpen_comet,
    )
)

+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| Segment       |   EN-JP BLEU |   EN-JP CHRF |   JP-EN BLEU |   JP-EN CHRF |   EN-JP COMET |   JP-EN COMET |
| whole_dataset |   0.141062   |      19.4128 |      8.20461 |      35.622  |      0.616959 |      0.643702 |
+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| first_third   |   0.205474   |      11.9155 |      5.63931 |      25.3349 |      0.606557 |      0.562447 |
+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| second_third  |   0.521246   |      19.9775 |      5.32697 |      29.2089 |      0.64763  |      0.611855 |
+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| third_third   |   0.00333794 |      24.025  |     11.9595  |      46.6148 |      0.59669  |      0.756806 |
+---------

## Google Translate Evaluations 

In [55]:
# Google Translate, Japanese -> English.
gTranslate_jpen_chrf_bleu = evaluate_translation_bleu(
    input_file='model_outputs/test/jp_to_en/in.txt',
    translated_file='model_outputs/test/jp_to_en/google_translate/out.txt',
    reference_file='model_outputs/test/jp_to_en/out.txt',
)
gTranslate_jpen_comet = evaluate_translation_comet(
    input_file='model_outputs/test/jp_to_en/in.txt',
    translated_file='model_outputs/test/jp_to_en/google_translate/out.txt',
    reference_file='model_outputs/test/jp_to_en/out.txt',
)

# Google Translate, English -> Japanese.
gTranslate_enjp_chrf_bleu = evaluate_translation_bleu(
    input_file='model_outputs/test/en_to_jp/in.txt',
    translated_file='model_outputs/test/en_to_jp/google_translate/out.txt',
    reference_file='model_outputs/test/en_to_jp/out.txt',
)
gTranslate_enjp_comet = evaluate_translation_comet(
    input_file='model_outputs/test/en_to_jp/in.txt',
    translated_file='model_outputs/test/en_to_jp/google_translate/out.txt',
    reference_file='model_outputs/test/en_to_jp/out.txt',
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
Predicting DataLoader 0: 100%|██| 57/57

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.4. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/Users/thomaspett/Desktop/projects/MT_senior_thesis_repo/env/lib/python3.10/site-packages/pytorch_lightning/trainer/setup.py:187: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
Predicting DataLoader 0: 100%|██| 57/57

In [56]:
# Summary table for Google Translate, both translation directions.
print(
    generate_translation_quality_table(
        enjp_bleu_chrf=gTranslate_enjp_chrf_bleu,
        jpen_bleu_chrf=gTranslate_jpen_chrf_bleu,
        enjp_comet=gTranslate_enjp_comet,
        jpen_comet=gTranslate_jpen_comet,
    )
)

+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| Segment       |   EN-JP BLEU |   EN-JP CHRF |   JP-EN BLEU |   JP-EN CHRF |   EN-JP COMET |   JP-EN COMET |
| whole_dataset |  0.0258947   |      34.031  |      19.6339 |      51.888  |      0.65223  |      0.793666 |
+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| first_third   |  0.736565    |      26.5313 |      23.5871 |      49.3255 |      0.63421  |      0.784536 |
+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| second_third  |  0.00402613  |      34.6235 |      17.0647 |      47.6336 |      0.695957 |      0.780795 |
+---------------+--------------+--------------+--------------+--------------+---------------+---------------+
| third_third   |  9.03664e-06 |      38.7212 |      17.1729 |      56.1235 |      0.626522 |      0.815668 |
+---------