In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from scipy.stats import bootstrap
from collections import Counter, defaultdict
import copy
import seaborn as sns
import matplotlib.patches as mpatches
import ast
# Location of the project root, relative to the directory the notebook runs in.
project_root_dir = "../"

import os
# Absolute form of the project root.
work_dir = os.path.abspath(project_root_dir)

import sys
# Extend the module search path so the `src` package imported below can be resolved.
sys.path.append(project_root_dir)


import logging
# Only show warnings and errors from these third-party loggers.
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('wandb').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)


# Project-local helpers (require the sys.path entry added above).
from src.utils import get_linearization_class
from src.datamodules import IEGenericOutputDataset
from src.metrics import TSF1, TSPrecision, TSRecall
import src.utils.evaluation_helpers as evaluation_helpers

import pandas as pd
import numpy as np
from IPython.display import display, HTML
from pprint import pprint

# Linearization helper used throughout the notebook to convert between the
# textual triplet-set format and lists of triplets.
lc = get_linearization_class("fully_expanded")

In [2]:
%%html
<style>
.dataframe td {
    white-space: nowrap;
}
</style>

In [3]:
# ~~~ Parameters to set ~~~

# Column order of the metrics in the generated result tables.
ORDERED_METRICS = ['P', 'R', 'F1']

# Number of bootstrap resamples used when computing each metric.
bootstrap_n = 150
# (ground-truth column, prediction column) pairs that get evaluated.
# target_col_prediction_col_pairs = [("gt_majority", "lin_triplet_set_rebel"), ("gt_majority", "lin_triplet_set_synthie"), ("gt_any", "lin_triplet_set_rebel"), ("gt_any", "lin_triplet_set_synthie")]
target_col_prediction_col_pairs = [("gt_majority", "lin_triplet_set_rebel"), ("gt_majority", "lin_triplet_set_synthie"), ("gt_majority", "lin_triplet_set_synthie_base"), ("gt_majority", "lin_triplet_set_genie_base")]
# target_col_prediction_col_pairs = [("gt_majority", "lin_triplet_set_synthie_base"), ("gt_majority", "lin_triplet_set_genie_base")]

# Row label (LaTeX markup) -> DataFrame column holding that system's triplet sets.
# The insertion order is the row order of the result tables.
model_name2col = {
    "GenIE{\\footnotesize~{T5-base}}": "lin_triplet_set_genie_base", 
    "SynthIE{\\footnotesize~{T5-base}}": "lin_triplet_set_synthie_base", 
    "SynthIE{\\footnotesize~{T5-large}}": "lin_triplet_set_synthie", 
    "REBEL{\\footnotesize~{Gold}}":"lin_triplet_set_rebel"
}

# Ground-truth columns reported in the final tables.
ground_truth_cols_to_present = ["gt_majority"]
# When True, datapoints whose merged `gt_majority` is empty are dropped before evaluation.
discard_datapoints_with_no_triplets_in_gt = True

In [4]:
def process_df(df, suffix=""):
    """Replace annotation index lists with linearized triplet text, in place.

    For every row, the triplet set in ``lin_triplet_set{suffix}`` is split into a
    list of triplets. The columns ``gt_majority{suffix}``, ``first_worker{suffix}``,
    ``second_worker{suffix}`` and ``third_worker{suffix}`` are expected to hold the
    string representation of a list of indices into that triplet list; each is
    overwritten with the linearized text of the selected triplets. A new column
    ``gt_any{suffix}`` is added with the triplets selected by at least one worker.

    Args:
        df: DataFrame to modify in place.
        suffix: Suffix appended to every column name that is read or written.

    Returns:
        None. ``df`` is mutated.
    """

    def _parse_indices(raw):
        # Cast to int *before* sorting so indices stored as strings are ordered
        # numerically rather than lexicographically.
        return sorted(int(idx) for idx in ast.literal_eval(raw))

    def _select_as_text(triplets, indices):
        return lc.triplet_list_to_text([triplets[idx] for idx in indices])[0]

    gt_majoritys = []
    first_workers = []
    second_workers = []
    third_workers = []
    gt_any_worker = []

    rows = zip(
        df[f"lin_triplet_set{suffix}"],
        df[f"gt_majority{suffix}"],
        df[f"first_worker{suffix}"],
        df[f"second_worker{suffix}"],
        df[f"third_worker{suffix}"],
    )
    for triplet_set, gt_majority, first_worker, second_worker, third_worker in rows:
        triplets = lc.text_to_triplet_list(triplet_set, return_set=False)

        majority_idx = _parse_indices(gt_majority)
        first_idx = _parse_indices(first_worker)
        second_idx = _parse_indices(second_worker)
        third_idx = _parse_indices(third_worker)
        # Union (not concatenation): a triplet picked by several workers must
        # appear only once in the "any worker" ground truth.
        any_idx = sorted(set(first_idx) | set(second_idx) | set(third_idx))

        gt_majoritys.append(_select_as_text(triplets, majority_idx))
        first_workers.append(_select_as_text(triplets, first_idx))
        second_workers.append(_select_as_text(triplets, second_idx))
        third_workers.append(_select_as_text(triplets, third_idx))
        gt_any_worker.append(_select_as_text(triplets, any_idx))

    df[f"gt_majority{suffix}"] = gt_majoritys
    df[f"gt_any{suffix}"] = gt_any_worker
    df[f"first_worker{suffix}"] = first_workers
    df[f"second_worker{suffix}"] = second_workers
    df[f"third_worker{suffix}"] = third_workers

def processed_merged_df(df, suffixes):
    """Process the per-source annotations and add merged ground-truth columns, in place.

    First runs ``process_df`` once per suffix. Then, for each row, the triplets
    found in ``gt_majority{suffix}`` over all suffixes are united into a new
    ``gt_majority`` column, and likewise ``gt_any{suffix}`` into ``gt_any``.

    Args:
        df: Merged DataFrame holding the suffixed annotation columns; mutated in place.
        suffixes: Column suffixes of the annotated sources (e.g. ``["_rebel", "_synthie"]``).

    Returns:
        None. ``df`` is mutated.
    """
    for suffix in suffixes:
        process_df(df, suffix=suffix)

    def _union_as_text(texts):
        merged = set()
        for text in texts:
            merged.update(lc.text_to_triplet_list(text, verbose=False))
        # Sort before linearizing: set iteration order is not stable across
        # interpreter runs, which would make the produced text irreproducible.
        return lc.triplet_list_to_text(sorted(merged))[0]

    for base_col in ("gt_majority", "gt_any"):
        per_suffix_values = [df[f"{base_col}{suffix}"].tolist() for suffix in suffixes]
        df[base_col] = [
            _union_as_text(values[row_idx] for values in per_suffix_values)
            for row_idx in range(len(df))
        ]

def filter_df(df, ids_to_keep=None, ids_to_drop=None):
    """Return the rows of ``df`` selected by their ``id``, with a fresh 0..n-1 index.

    Exactly one of ``ids_to_keep`` / ``ids_to_drop`` must be given.

    Args:
        df: DataFrame with an ``id`` column.
        ids_to_keep: Collection of ids; only rows with these ids are returned.
        ids_to_drop: Collection of ids; rows with these ids are removed.

    Returns:
        A new DataFrame; ``df`` itself is not modified.

    Raises:
        AssertionError: If neither or both of the id collections are provided.
    """
    # Explicit XOR. The previous `a or b and not (a and b)` parsed as
    # `a or (b and ...)`, which silently accepted both arguments at once.
    assert (ids_to_keep is None) != (ids_to_drop is None), \
        "Provide exactly one of `ids_to_keep` or `ids_to_drop`."

    if ids_to_keep is not None:
        mask = df.id.isin(ids_to_keep)
    else:
        mask = ~df.id.isin(ids_to_drop)

    return df[mask].reset_index(drop=True)

In [5]:
# Column suffixes of the two annotated sources.
suffixes = ["_rebel", "_synthie"]

def get_gt_majority_cols(suffixes=suffixes):
    """List the id/input columns, the merged majority column and one majority column per suffix."""
    per_suffix = (f"gt_majority{sfx}" for sfx in suffixes)
    return ["id", "input", "gt_majority", *per_suffix]

def get_all_annotations(suffixes, show_majority, show_majority_per_suffix):
    """Build the list of columns needed to inspect the individual worker annotations.

    Args:
        suffixes: Column suffixes, one per annotated source.
        show_majority: Include the merged ``gt_majority`` column.
        show_majority_per_suffix: Include ``gt_majority{suffix}`` ahead of each
            source's worker columns.

    Returns:
        Column names: id/input first, then one group of columns per suffix.
    """
    worker_cols = ("first_worker", "second_worker", "third_worker")

    selected = ["id", "input"]
    if show_majority:
        selected.append('gt_majority')

    for sfx in suffixes:
        if show_majority_per_suffix:
            selected.append(f"gt_majority{sfx}")
        selected.extend(f"{worker}{sfx}" for worker in worker_cols)

    return selected

def wrap_df_text_results(df, cols_to_show=['input', 'lin_triplet_set_rebel', 'lin_triplet_set_synthie'], input_max_col=70):
    """Render selected columns of ``df`` as an HTML table with one triplet per line.

    The ``input`` column (when shown) is wrapped to ``input_max_col`` characters.
    In every other shown column except ``id``, each ``[e]`` / ``[et]`` marker is
    replaced by ``[e]`` followed by a line break. Columns whose values are
    sequences of strings get the same treatment per element, and the elements are
    separated by an additional line break. The caller's ``df`` is not modified.

    Args:
        df: DataFrame to display.
        cols_to_show: Columns to include in the rendered table.
        input_max_col: Maximum line width used when wrapping the ``input`` column.

    Returns:
        None. The table is displayed through IPython's ``display``.
    """
    # Local import: the function needs `re`, which the notebook never imports.
    import re

    # Raw string so the backslashes reach the regex engine untouched.
    marker_pattern = r"\[e\]|\[et\]"
    # Backslash followed by "n"; turned into <br> once the HTML has been generated.
    marker_replacement = "[e]\\n"

    # Copy the selection so the assignments below never touch (or warn about) `df`.
    tdf = df[cols_to_show].copy()
    if 'input' in tdf.columns:
        tdf['input'] = tdf['input'].str.wrap(input_max_col)

    for col in tdf.columns:
        # Nothing to rewrite for ids, and no first value to inspect in an empty frame.
        if col == "id" or tdf.empty:
            continue
        # Positional access: the index need not contain the label 0.
        if isinstance(tdf[col].iloc[0], str):
            tdf[col] = tdf[col].str.replace(marker_pattern, marker_replacement, regex=True)
        else:
            tdf[col] = tdf[col].apply(
                lambda parts: "\\n".join(re.sub(marker_pattern, marker_replacement, p) for p in parts)
            )

    display(HTML(tdf.to_html().replace("\\n", "<br>")))

def show_df(df, cols_to_show=['input', 'lin_triplet_set_rebel', 'lin_triplet_set_synthie'], input_max_col=70):
    """Display ``df`` via ``wrap_df_text_results`` with pandas' display limits lifted."""
    # Option/value pairs that are active only while the table is rendered.
    unrestricted_display = (
        'display.max_rows', None,
        'display.max_columns', None,
        "display.expand_frame_repr", False,
        'display.max_colwidth', None,
        'display.width', None,
    )
    with pd.option_context(*unrestricted_display):
        wrap_df_text_results(df, cols_to_show, input_max_col)

In [6]:
def get_output_dataset(df, target_col, prediction_col):
    """Wrap one (target, prediction) column pair of ``df`` in an ``IEGenericOutputDataset``.

    Args:
        df: DataFrame with an ``id`` column plus the two given columns.
        target_col: Column holding the linearized target triplet sets.
        prediction_col: Column holding the linearized predicted triplet sets.

    Returns:
        The dataset built from one ``{"id", "target", "prediction"}`` dict per row.
    """
    row_values = zip(
        df["id"].to_list(),
        df[target_col].to_list(),
        df[prediction_col].to_list(),
    )
    records = [
        {"id": datapoint_id, "target": target_text, "prediction": predicted_text}
        for datapoint_id, target_text, predicted_text in row_values
    ]

    return IEGenericOutputDataset(data=records, linearization_class_id="fully_expanded", seed=123)

def get_df_from_wandb_run(wandb_run_path, ids_to_keep, suffix):
    """Load the predictions of a wandb run into a DataFrame, restricted to given ids.

    Args:
        wandb_run_path: Run identifier passed to ``evaluation_helpers.prepare_data_for_experiment``.
        ids_to_keep: Container of datapoint ids; only these datapoints are returned.
        suffix: Suffix of the column that receives the run's predictions.

    Returns:
        DataFrame with the columns ``id``, ``input``, ``lin_triplet_set_rebel``
        (the datapoint's ``target``) and ``lin_triplet_set{suffix}`` (the first
        entry of the datapoint's ``prediction``).
    """
    _, hydra_config, experiment_dir = evaluation_helpers.prepare_data_for_experiment(
        wandb_run_path, work_dir=project_root_dir, log_func=print
    )
    print("Loading the output dataset corresponding to run:", hydra_config['run_name'])

    predictions = IEGenericOutputDataset(
        data_dir=os.path.join(experiment_dir, "predictions"),
        linearization_class_id=hydra_config["datamodule"].get("linearization_class_id"),
        seed=123,
    )

    # One tuple per retained datapoint, in dataset order.
    rows = [
        (dp['id'], dp['input'], dp['target'], dp['prediction'][0])
        for dp in predictions.data
        if dp['id'] in ids_to_keep
    ]

    column_names = ['id', 'input', 'lin_triplet_set_rebel', f'lin_triplet_set{suffix}']
    return pd.DataFrame(rows, columns=column_names)


def get_metric_name_from_metric(metric, macro):
    """Return ``metric.name``, with ``"_macro"`` appended when ``macro`` is truthy."""
    if macro:
        return metric.name + "_macro"
    return metric.name


def get_metric_name(metric_name, macro):
    """Return ``metric_name``, with ``"_macro"`` appended when ``macro`` is truthy."""
    if macro:
        return metric_name + "_macro"
    return metric_name

def compute_metric(metric_name, output_dataset, seed=None, macro=False):
    """Compute one triplet-set metric on ``output_dataset``.

    Args:
        metric_name: ``name`` of one of ``TSPrecision``, ``TSRecall`` or ``TSF1``.
        output_dataset: Dataset to score. With a ``seed`` its ``data`` attribute
            is temporarily replaced by a bootstrap resample.
        seed: ``None`` scores the dataset as is; otherwise the seed handed to
            ``output_dataset.get_bootstrapped_data``.
        macro: When True, the bucket metadata from
            ``evaluation_helpers.get_macro_metrics_computation_metadata`` is
            passed to the metric.

    Returns:
        The second element of what ``metric.compute_from_dataset`` returns.

    Raises:
        AssertionError: If ``metric_name`` is not one of the supported metrics.
    """
    metric_classes = {cls.name: cls for cls in (TSPrecision, TSRecall, TSF1)}
    assert metric_name in metric_classes
    metric = metric_classes[metric_name]()

    def _score():
        macro_metadata_dict = None
        if macro:
            macro_metadata_dict = evaluation_helpers.get_macro_metrics_computation_metadata(
                output_dataset, consider_prediction_triplets=True
            )
        return metric.compute_from_dataset(output_dataset, bucket_metadata_dict=macro_metadata_dict)[1]

    if seed is None:
        return _score()

    original_data = output_dataset.data
    output_dataset.data = output_dataset.get_bootstrapped_data(seed=seed)
    try:
        return _score()
    finally:
        # Always put the real data back, even if scoring the resample fails;
        # otherwise later calls would silently run on the resample.
        output_dataset.data = original_data

def compute_metrics(output_dataset, macro=False, scores=None, bootstrap_n=None, metric_names=None, base_seed=100):
    """Compute several metrics on ``output_dataset`` and store them in ``scores``.

    Args:
        output_dataset: Dataset handed to ``compute_metric``.
        macro: Forwarded to ``compute_metric``; also selects the ``"_macro"`` key suffix.
        scores: Dict to add the results to. A new dict is created when omitted,
            so results never leak between unrelated calls.
        bootstrap_n: ``None`` stores a single score per metric. Otherwise the
            metric is computed on ``bootstrap_n`` bootstrap resamples and the
            result of ``evaluation_helpers.get_std_based_ci`` over those scores
            is stored.
        metric_names: Names of the metrics to compute; defaults to the names of
            ``TSPrecision``, ``TSRecall`` and ``TSF1``.
        base_seed: Seed of the first bootstrap resample; resample ``i`` uses
            ``base_seed + i``. Every metric sees the same sequence of seeds.

    Returns:
        ``scores`` (the dict that was passed in, or the newly created one).
    """
    # None sentinels instead of mutable defaults evaluated at definition time.
    if scores is None:
        scores = {}
    if metric_names is None:
        metric_names = [TSPrecision.name, TSRecall.name, TSF1.name]

    for metric_name in metric_names:
        key = get_metric_name(metric_name, macro)

        if bootstrap_n is None:
            scores[key] = compute_metric(metric_name, output_dataset, macro=macro)
            continue

        bootstrap_scores = [
            compute_metric(metric_name, output_dataset=output_dataset, seed=base_seed + i, macro=macro)
            for i in range(bootstrap_n)
        ]
        scores[key] = evaluation_helpers.get_std_based_ci(bootstrap_scores)

    return scores

# Task 2

### Massaging the Data

In [7]:
# Load the raw MTurk results of task 2.
df = pd.read_csv("human_eval_data/mturk_task2_results.csv")
# Drop the first column (presumably a saved index -- TODO confirm against the CSV).
df = df.iloc[:, 1:]

# get a single row per datapoint
# Rows whose `dataset` value is "rebel"; the remaining columns are renamed by position.
df_rebel_data = df[df.dataset == "rebel"]
df_rebel_data = df_rebel_data.drop(columns=["dataset"])
df_rebel_data = df_rebel_data.drop(columns=["triplets_formatted"])
df_rebel_data.columns = ["id", "lin_triplet_set", "input", "gt_majority", "num_workers", "first_worker", "second_worker", "third_worker"]

# Rows whose `dataset` value is "model", treated as the SynthIE side from here on;
# same positional renaming as above.
df_synthie_data = df[df.dataset == "model"]
df_synthie_data = df_synthie_data.drop(columns=["dataset"])
df_synthie_data = df_synthie_data.drop(columns=["triplets_formatted"])
df_synthie_data.columns = ["id", "lin_triplet_set", "input", "gt_majority", "num_workers", "first_worker", "second_worker", "third_worker"]

# One row per datapoint: the remaining columns get a `_rebel` / `_synthie` suffix.
df_merged = pd.merge(df_rebel_data, df_synthie_data, on=["id", "input", "num_workers"], how="inner", suffixes=("_rebel", "_synthie"))
df_merged['input'] = df_merged['input'].apply(lambda x: x.replace('\\"', '"')).apply(lambda x: x.replace("\\'", "'")) # unescape quotes
# Convert the annotation index lists to triplet text and add the merged
# `gt_majority` / `gt_any` columns (mutates df_merged in place).
processed_merged_df(df_merged, ["_rebel", "_synthie"])
if discard_datapoints_with_no_triplets_in_gt:
    # Keep only datapoints whose merged majority ground truth is non-empty.
    df_merged = df_merged[df_merged['gt_majority'] != u""].reset_index(drop=True)
# Ids of the retained datapoints; used below to select the other models' predictions.
datapoint_ids = set(df_merged.id.to_list())

In [8]:
# include GenIE base data
genie_base_wandb_run_path = 'epfl-dlab/SynthIE/2h4ibqlg'
genie_base_suffix = "_genie_base"

# Predictions of the run for the retained datapoints, stored in `lin_triplet_set_genie_base`.
genie_base_df = get_df_from_wandb_run(genie_base_wandb_run_path, datapoint_ids, genie_base_suffix)
# The join keys include `lin_triplet_set_rebel`, so a row only survives when the
# run's target text equals the one already in df_merged.
df_merged = pd.merge(df_merged, genie_base_df, on=["id", "input", "lin_triplet_set_rebel"], how="inner", suffixes=("", ""))
# Every retained datapoint must have found its match.
assert len(df_merged) == len(datapoint_ids)

Experiment directory already exists: ../logs/inference/runs/inf_full_fe_rebel_ms_base_medium_lr_datamodule-rebel_world-genie_t5_tokenizeable_split-test_constraint-R-max-T5_lp-0.8/2023-02-02_05-56-12
Loading the existing results
Loading the output dataset corresponding to run: inf_full_fe_rebel_ms_base_medium_lr_datamodule-rebel_world-genie_t5_tokenizeable_split-test_constraint-R-max-T5_lp-0.8
INFO:src.datamodules.ie_generic:[Output DS] Loaded the predictions for 114953 datapoints from ../logs/inference/runs/inf_full_fe_rebel_ms_base_medium_lr_datamodule-rebel_world-genie_t5_tokenizeable_split-test_constraint-R-max-T5_lp-0.8/2023-02-02_05-56-12/predictions
INFO:src.datamodules.ie_generic:[Output DS] Dataset statistics: {'num_datapoints': 114953, 'num_triplets': {'P': 251898, 'T': 257129}, 'num_unique_entities': {'P': 137455, 'T': 151997}, 'num_unique_relations': {'P': 389, 'T': 580}}


In [9]:
# include SynthIE-base data
synthie_base_wandb_run_path = 'epfl-dlab/SynthIE/2mlj6x38'
synthie_base_suffix = "_synthie_base"

# Predictions of the run for the retained datapoints, stored in `lin_triplet_set_synthie_base`.
synthie_base_df = get_df_from_wandb_run(synthie_base_wandb_run_path, datapoint_ids, synthie_base_suffix)
# The join keys include `lin_triplet_set_rebel`, so a row only survives when the
# run's target text equals the one already in df_merged.
df_merged = df_merged.merge(synthie_base_df, on=["id", "input", "lin_triplet_set_rebel"], how="inner", suffixes=("", ""))
# Every retained datapoint must have found its match.
assert len(df_merged) == len(datapoint_ids)

Experiment directory already exists: ../logs/inference/runs/inf_full_fe_fully_synthetic_ms_base_medium_lr_datamodule-rebel_world-genie_t5_tokenizeable_split-test_constraint-R-max-T5_lp-0.8/2023-02-02_22-09-02
Loading the existing results
Loading the output dataset corresponding to run: inf_full_fe_fully_synthetic_ms_base_medium_lr_datamodule-rebel_world-genie_t5_tokenizeable_split-test_constraint-R-max-T5_lp-0.8
INFO:src.datamodules.ie_generic:[Output DS] Loaded the predictions for 114953 datapoints from ../logs/inference/runs/inf_full_fe_fully_synthetic_ms_base_medium_lr_datamodule-rebel_world-genie_t5_tokenizeable_split-test_constraint-R-max-T5_lp-0.8/2023-02-02_22-09-02/predictions
INFO:src.datamodules.ie_generic:[Output DS] Dataset statistics: {'num_datapoints': 114953, 'num_triplets': {'P': 375052, 'T': 257129}, 'num_unique_entities': {'P': 158537, 'T': 151997}, 'num_unique_relations': {'P': 836, 'T': 580}}


### Computing Performance

In [10]:
# (target column, prediction column) -> {metric name: score}; read later by get_latex_table.
scores = {}
for target_col, prediction_col in target_col_prediction_col_pairs:
    curr_scores = {}
    
    output_dataset = get_output_dataset(df_merged, target_col, prediction_col)

    # The first call adds the plain metric keys, the second one the "_macro" keys,
    # both into the same per-pair dict.
    curr_scores = compute_metrics(output_dataset, scores=curr_scores, bootstrap_n=bootstrap_n)
    curr_scores = compute_metrics(output_dataset, macro=True, scores=curr_scores, bootstrap_n=bootstrap_n)
    
    scores[(target_col, prediction_col)] = curr_scores

# pprint(scores)

INFO:src.datamodules.ie_generic:[Output DS] Got 49 datapoints from the `data` passed to the constructor
INFO:src.datamodules.ie_generic:[Output DS] Dataset statistics: {'num_datapoints': 49, 'num_triplets': {'P': 125, 'T': 150}, 'num_unique_entities': {'P': 147, 'T': 169}, 'num_unique_relations': {'P': 39, 'T': 55}}
INFO:src.datamodules.ie_generic:[Output DS] Got 49 datapoints from the `data` passed to the constructor
INFO:src.datamodules.ie_generic:[Output DS] Dataset statistics: {'num_datapoints': 49, 'num_triplets': {'P': 165, 'T': 150}, 'num_unique_entities': {'P': 187, 'T': 169}, 'num_unique_relations': {'P': 72, 'T': 55}}
INFO:src.datamodules.ie_generic:[Output DS] Got 49 datapoints from the `data` passed to the constructor
INFO:src.datamodules.ie_generic:[Output DS] Dataset statistics: {'num_datapoints': 49, 'num_triplets': {'P': 173, 'T': 150}, 'num_unique_entities': {'P': 192, 'T': 169}, 'num_unique_relations': {'P': 72, 'T': 55}}
INFO:src.datamodules.ie_generic:[Output DS] Go

### Presentation

In [11]:
def _process_scores(scores):
    processed_scores = []

    for s in scores:
        if isinstance(s, float):
            processed_scores.append(f"{s*100:.2f}")
        else:
            error = (s[1] - s[0] + s[2] - s[1]) / 2
            processed_scores.append(f"{s[0]*100:.2f} {{\\scriptsize± {error*100:.2f}}}")

    return processed_scores

def get_latex_table(model_name2col, ground_truth_cols, macro):
    """Assemble, display and return the table of formatted P/R/F1 scores.

    Scores are read from the notebook-level ``scores`` dict, keyed by
    ``(ground-truth column, prediction column)``.

    Args:
        model_name2col: Row label -> prediction column; iteration order is the row order.
        ground_truth_cols: Ground-truth columns to report, one group of rows each.
        macro: Selects the ``"_macro"`` variants of the metric keys.

    Returns:
        DataFrame with the columns listed in ``ORDERED_METRICS``. It is indexed by
        ``(gt_col, model_name)``, or, when a single ground-truth column is given,
        by the model name prefixed with a LaTeX hspace command.
    """
    # One entry per table row, grouped by ground-truth column.
    row_keys = [(gt_col, model_name) for gt_col in ground_truth_cols for model_name in model_name2col]

    def _formatted(metric_cls):
        score_key = get_metric_name_from_metric(metric_cls, macro)
        raw_values = [scores[(gt_col, model_name2col[model_name])][score_key] for gt_col, model_name in row_keys]
        return _process_scores(raw_values)

    table = pd.DataFrame({
        "gt_col": [gt_col for gt_col, _ in row_keys],
        "model_name": [model_name for _, model_name in row_keys],
        "P": _formatted(TSPrecision),
        "R": _formatted(TSRecall),
        "F1": _formatted(TSF1),
    })
    table = table.set_index(["gt_col", "model_name"])[ORDERED_METRICS]

    if len(ground_truth_cols) == 1:
        # With one ground truth the first index level carries no information.
        table.index = table.index.droplevel([0])
        table.index = [f"\\hspace{{4mm}} {label}" for label in table.index]

    display(table)
    return table

# Show each table as a DataFrame and print its LaTeX source:
# first the plain metric keys, then the "_macro" ones.
print("~~~ Micro ~~~")
micro_df = get_latex_table(model_name2col, ground_truth_cols_to_present, macro=False)
print(micro_df.style.to_latex())
print("~~~ Macro ~~~")
macro_df = get_latex_table(model_name2col, ground_truth_cols_to_present, macro=True)
print(macro_df.style.to_latex())


~~~ Micro ~~~


Unnamed: 0,P,R,F1
\hspace{4mm} GenIE{\footnotesize~{T5-base}},44.96 {\scriptsize± 10.62},38.88 {\scriptsize± 8.66},43.09 {\scriptsize± 8.02}
\hspace{4mm} SynthIE{\footnotesize~{T5-base}},35.35 {\scriptsize± 10.26},43.53 {\scriptsize± 9.08},40.13 {\scriptsize± 8.62}
\hspace{4mm} SynthIE{\footnotesize~{T5-large}},41.58 {\scriptsize± 11.08},48.81 {\scriptsize± 8.60},46.35 {\scriptsize± 8.44}
\hspace{4mm} REBEL{\footnotesize~{Gold}},69.88 {\scriptsize± 8.66},57.50 {\scriptsize± 8.00},64.47 {\scriptsize± 6.89}


\begin{tabular}{llll}
 & P & R & F1 \\
\hspace{4mm} GenIE{\footnotesize~{T5-base}} & 44.96 {\scriptsize± 10.62} & 38.88 {\scriptsize± 8.66} & 43.09 {\scriptsize± 8.02} \\
\hspace{4mm} SynthIE{\footnotesize~{T5-base}} & 35.35 {\scriptsize± 10.26} & 43.53 {\scriptsize± 9.08} & 40.13 {\scriptsize± 8.62} \\
\hspace{4mm} SynthIE{\footnotesize~{T5-large}} & 41.58 {\scriptsize± 11.08} & 48.81 {\scriptsize± 8.60} & 46.35 {\scriptsize± 8.44} \\
\hspace{4mm} REBEL{\footnotesize~{Gold}} & 69.88 {\scriptsize± 8.66} & 57.50 {\scriptsize± 8.00} & 64.47 {\scriptsize± 6.89} \\
\end{tabular}

~~~ Macro ~~~


Unnamed: 0,P,R,F1
\hspace{4mm} GenIE{\footnotesize~{T5-base}},29.43 {\scriptsize± 8.40},28.57 {\scriptsize± 7.69},26.74 {\scriptsize± 7.85}
\hspace{4mm} SynthIE{\footnotesize~{T5-base}},33.52 {\scriptsize± 7.25},35.70 {\scriptsize± 7.28},32.88 {\scriptsize± 6.82}
\hspace{4mm} SynthIE{\footnotesize~{T5-large}},40.80 {\scriptsize± 7.37},40.79 {\scriptsize± 6.95},39.19 {\scriptsize± 6.78}
\hspace{4mm} REBEL{\footnotesize~{Gold}},45.52 {\scriptsize± 7.97},44.40 {\scriptsize± 7.70},43.55 {\scriptsize± 7.75}


\begin{tabular}{llll}
 & P & R & F1 \\
\hspace{4mm} GenIE{\footnotesize~{T5-base}} & 29.43 {\scriptsize± 8.40} & 28.57 {\scriptsize± 7.69} & 26.74 {\scriptsize± 7.85} \\
\hspace{4mm} SynthIE{\footnotesize~{T5-base}} & 33.52 {\scriptsize± 7.25} & 35.70 {\scriptsize± 7.28} & 32.88 {\scriptsize± 6.82} \\
\hspace{4mm} SynthIE{\footnotesize~{T5-large}} & 40.80 {\scriptsize± 7.37} & 40.79 {\scriptsize± 6.95} & 39.19 {\scriptsize± 6.78} \\
\hspace{4mm} REBEL{\footnotesize~{Gold}} & 45.52 {\scriptsize± 7.97} & 44.40 {\scriptsize± 7.70} & 43.55 {\scriptsize± 7.75} \\
\end{tabular}

