In [1]:
import json
import evaluate

from typing import Any, Dict, List, Tuple
from tqdm import tqdm

import pandas as pd

In [2]:
# Metric objects loaded through the `evaluate` library; `compute_metrics`
# uses `bleu`, `sbleu` and `meteor`.
# NOTE(review): `rouge` is loaded but not referenced by any cell visible in
# this notebook — confirm whether it is still needed.
bleu = evaluate.load("bleu")
sbleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

[nltk_data] Downloading package wordnet to /home/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/user/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/user/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
def format_distractors(distractors: List[str]) -> str:
    """Serialise distractors into a single string.

    Every item is wrapped in double quotes and the quoted items are joined
    with '; ', e.g. ["a", "b"] -> '"a"; "b"'. An empty list yields ''.
    """
    return '; '.join(f'"{distractor}"' for distractor in distractors)

In [4]:
def compute_metrics(output: List[str], label_batch: List[str]) -> dict:
    """Score predicted strings against reference strings.

    Args:
        output: Predicted strings, one per example.
        label_batch: Reference strings, paired with ``output`` by position
            (exactly one reference per prediction).

    Returns:
        A dict with the keys "bleu1", "bleu2", "bleu3", "bleu4", "sbleu" and
        "meteor", in that order. "sbleu" is the "score" field of the
        sacrebleu metric; in the outputs recorded in this notebook it is on a
        0-100 scale, while the other values lie in 0-1.
    """
    # bleu and sacrebleu take a list of references per prediction; build the
    # single-reference wrapping once instead of once per metric call.
    references = [[label] for label in label_batch]

    # BLEU-1 .. BLEU-4: identical call, only the maximum n-gram order differs.
    metric_dict = {
        f"bleu{order}": bleu.compute(
            predictions=output, references=references, max_order=order
        )["bleu"]
        for order in range(1, 5)
    }
    metric_dict["sbleu"] = sbleu.compute(predictions=output, references=references)["score"]
    # meteor is given the flat list of reference strings, as in the original call.
    metric_dict["meteor"] = meteor.compute(predictions=output, references=label_batch)["meteor"]
    return metric_dict

In [5]:
def load_json(path: str) -> Any:
    """Open the file at ``path`` as UTF-8 text and return its parsed JSON content."""
    with open(path, mode='r', encoding="utf8") as json_file:
        return json.load(json_file)

In [14]:
# Prediction files of the three model outputs compared in this notebook
# (variables are named after their files). Items of each list are later read
# through their "predicted_distractors" key and paired by position with
# `egedataset`.
bdg = load_json("bartdg_output_ege_translated/bdg.json")
bdg_pm = load_json("bartdg_output_ege_translated/bdg_pm.json")
bdg_anpm = load_json("bartdg_output_ege_translated/bdg_anpm.json")

In [15]:
# Reference dataset. Its items are read in this notebook through the keys
# "distractors", "reading_text", "question" and "right_answer".
egedataset = load_json("EgeEvalDataset.json")

In [8]:
# Per-item scores for the `bdg` predictions: every dataset item is scored on
# its own (one prediction string vs. one reference string), then the
# distribution of the scores is summarised.

# zip() stops at the shorter input, so a length mismatch would silently drop
# items from the evaluation; fail loudly instead.
assert len(egedataset) == len(bdg), "egedataset and bdg must contain the same number of items"

metric_rows = []
for item_orig, item_pred in zip(egedataset, bdg):
    metric_rows.append(
        compute_metrics(
            [format_distractors(item_pred["predicted_distractors"])],
            [format_distractors(item_orig["distractors"])]
        )
    )
# One row per dataset item, one column per metric.
metrics = pd.DataFrame(metric_rows)
metrics.describe()

Unnamed: 0,bleu1,bleu2,bleu3,bleu4,sbleu,meteor
count,55.0,55.0,55.0,55.0,55.0,55.0
mean,0.265726,0.195731,0.148246,0.109301,10.930132,0.27766
std,0.088162,0.069352,0.055105,0.042735,4.273479,0.078614
min,0.042763,0.030949,0.023821,0.016084,1.60843,0.092369
25%,0.198042,0.14487,0.110259,0.076802,7.680191,0.22431
50%,0.299906,0.207165,0.154185,0.111706,11.170589,0.288807
75%,0.330723,0.2399,0.185083,0.138334,13.833399,0.333384
max,0.460888,0.348264,0.284309,0.224967,22.496733,0.443554


In [9]:
# Score all `bdg` predictions against all references in a single call
# (as opposed to scoring each item separately and summarising afterwards).
compute_metrics(
    [format_distractors(prediction["predicted_distractors"]) for prediction in bdg],
    [format_distractors(reference["distractors"]) for reference in egedataset],
)

{'bleu1': 0.2666272581836141,
 'bleu2': 0.19439065212712237,
 'bleu3': 0.14641594540354982,
 'bleu4': 0.10775226564025675,
 'sbleu': 10.775226564025676,
 'meteor': 0.2776595130987213}

In [10]:
# Per-item scores for the `bdg_pm` predictions: every dataset item is scored
# on its own (one prediction string vs. one reference string), then the
# distribution of the scores is summarised.

# zip() stops at the shorter input, so a length mismatch would silently drop
# items from the evaluation; fail loudly instead.
assert len(egedataset) == len(bdg_pm), "egedataset and bdg_pm must contain the same number of items"

metric_rows = []
for item_orig, item_pred in zip(egedataset, bdg_pm):
    metric_rows.append(
        compute_metrics(
            [format_distractors(item_pred["predicted_distractors"])],
            [format_distractors(item_orig["distractors"])]
        )
    )
# One row per dataset item, one column per metric.
metrics = pd.DataFrame(metric_rows)
metrics.describe()

Unnamed: 0,bleu1,bleu2,bleu3,bleu4,sbleu,meteor
count,55.0,55.0,55.0,55.0,55.0,55.0
mean,0.274673,0.196903,0.147538,0.108327,10.832707,0.284725
std,0.076482,0.060212,0.047969,0.037002,3.700188,0.073071
min,0.071407,0.054616,0.041623,0.029311,2.931054,0.08283
25%,0.233764,0.167415,0.120651,0.085065,8.50647,0.240794
50%,0.27451,0.196414,0.145232,0.106261,10.626057,0.28416
75%,0.324953,0.232272,0.176291,0.131544,13.154409,0.330493
max,0.404244,0.325409,0.254005,0.202335,20.233517,0.427524


In [11]:
# Score all `bdg_pm` predictions against all references in a single call
# (as opposed to scoring each item separately and summarising afterwards).
compute_metrics(
    [format_distractors(prediction["predicted_distractors"]) for prediction in bdg_pm],
    [format_distractors(reference["distractors"]) for reference in egedataset],
)

{'bleu1': 0.28522058256513055,
 'bleu2': 0.202466972819592,
 'bleu3': 0.15066064499303958,
 'bleu4': 0.11024783482444073,
 'sbleu': 11.024783482444072,
 'meteor': 0.28472548140884046}

In [17]:
# Per-item scores for the `bdg_anpm` predictions: every dataset item is scored
# on its own (one prediction string vs. one reference string), then the
# distribution of the scores is summarised.

# zip() stops at the shorter input, so a length mismatch would silently drop
# items from the evaluation; fail loudly instead.
assert len(egedataset) == len(bdg_anpm), "egedataset and bdg_anpm must contain the same number of items"

metric_rows = []
for item_orig, item_pred in zip(egedataset, bdg_anpm):
    metric_rows.append(
        compute_metrics(
            [format_distractors(item_pred["predicted_distractors"])],
            [format_distractors(item_orig["distractors"])]
        )
    )
# One row per dataset item, one column per metric.
metrics = pd.DataFrame(metric_rows)
metrics.describe()

Unnamed: 0,bleu1,bleu2,bleu3,bleu4,sbleu,meteor
count,55.0,55.0,55.0,55.0,55.0,55.0
mean,0.266584,0.192738,0.143872,0.105455,10.545531,0.277794
std,0.087894,0.064485,0.050316,0.039484,3.948356,0.073711
min,0.077869,0.057911,0.045427,0.035759,3.57595,0.092857
25%,0.204116,0.147822,0.107723,0.076196,7.619609,0.224193
50%,0.271603,0.193462,0.144494,0.107501,10.750139,0.2784
75%,0.336708,0.230504,0.174354,0.126655,12.665536,0.329722
max,0.437277,0.336015,0.266129,0.210285,21.028489,0.456933


In [16]:
# Score all `bdg_anpm` predictions against all references in a single call
# (as opposed to scoring each item separately and summarising afterwards).
compute_metrics(
    [format_distractors(prediction["predicted_distractors"]) for prediction in bdg_anpm],
    [format_distractors(reference["distractors"]) for reference in egedataset],
)

{'bleu1': 0.2739325756628323,
 'bleu2': 0.19706366903228534,
 'bleu3': 0.14638702529794567,
 'bleu4': 0.1070538228616483,
 'sbleu': 10.705382286164834,
 'meteor': 0.27779385962026554}

In [20]:
def _build_output_frame(
    dataset: List[Dict[str, Any]],
    predictions: List[Dict[str, Any]],
) -> pd.DataFrame:
    """Pair each dataset item with its prediction and tabulate them side by side.

    Args:
        dataset: Reference items; each is read through the keys
            "reading_text", "question", "right_answer" and "distractors".
        predictions: Model outputs paired with ``dataset`` by position; each
            is read through the key "predicted_distractors".

    Returns:
        A DataFrame with the columns "article", "question", "right_answer",
        "distractors" (formatted references) and "output" (formatted
        predictions), one row per item.
    """
    # zip() stops at the shorter input; a mismatch would silently drop rows.
    assert len(dataset) == len(predictions), "dataset and predictions must contain the same number of items"
    rows = [
        {
            "article": item_orig["reading_text"],
            "question": item_orig["question"],
            "right_answer": item_orig["right_answer"],
            "distractors": format_distractors(item_orig["distractors"]),
            "output": format_distractors(item_pred["predicted_distractors"])
        } for item_orig, item_pred in zip(dataset, predictions)
    ]
    return pd.DataFrame(rows)


# One spreadsheet per prediction set; the DataFrames keep their original
# notebook-level names.
df_new_bdg = _build_output_frame(egedataset, bdg)
df_new_bdg.to_excel("../output_analysis/OrganizedOutputData/BartDG-EGE/BartDGOutputEGE.xlsx", engine="openpyxl")

df_new_bdg_pm = _build_output_frame(egedataset, bdg_pm)
df_new_bdg_pm.to_excel("../output_analysis/OrganizedOutputData/BartDG-EGE/BartDG_PM_OutputEGE.xlsx", engine="openpyxl")

df_new_bdg_anpm = _build_output_frame(egedataset, bdg_anpm)
df_new_bdg_anpm.to_excel("../output_analysis/OrganizedOutputData/BartDG-EGE/BartDG_ANPM_OutputEGE.xlsx", engine="openpyxl")