#### Imports

In [35]:
project_root_dir = ".."

import os
work_dir = os.path.abspath(project_root_dir)

import sys
sys.path.append(project_root_dir)

import wandb
import re

import pandas as pd
from IPython.display import display, HTML
from functools import reduce

import numpy as np

from src.utils import general_helpers
from src.utils import evaluation_helpers
from src.datamodules import IEGenericOutputDataset
from src.utils import get_linearization_class

import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint

sns.set_theme(style="whitegrid")
sns.set_style("ticks")
sns.set_palette("deep")

In [6]:
%%html
<style>
.dataframe td {
    white-space: nowrap;
}
</style>

## The Part to Update

In [7]:
# Helper for getting the desired wandb run names and wandb paths

def get_runs(prefix, lp, constraint, datamodule=None):
    """Collect the wandb runs whose names match the requested inference setting.

    A run is kept when its name starts with `prefix` and contains both
    `constraint-{constraint}` and `lp-{lp}`. When `datamodule` is given, the
    name must additionally contain `datamodule-{datamodule}`.

    Args:
        prefix: Required start of the run name.
        lp: Length-penalty tag value looked up as `lp-{lp}` in the run name.
        constraint: Constraint tag value looked up as `constraint-{constraint}`.
        datamodule: Optional datamodule tag value; `None` disables this filter.

    Returns:
        dict mapping run name -> "<entity>/<project>/<run id>".
    """
    entity, project = "epfl-dlab", "SynthIE"  # set to your entity and project
    required_tags = [f"constraint-{constraint}", f"lp-{lp}"]
    if datamodule is not None:
        required_tags.append(f"datamodule-{datamodule}")

    matching_runs = {}
    for candidate in wandb.Api().runs(entity + "/" + project):
        name = candidate.name
        if not name.startswith(prefix):
            continue
        if all(tag in name for tag in required_tags):
            matching_runs[name] = f"{entity}/{project}/{candidate.id}"

    return matching_runs

# Alternative (wider) selection of run-name prefixes, kept for reference:
# prefixes = ["inf_fe_fully_synthetic_gcp_large_last", "inf_sc_fully_synthetic_gcp", "inf_full_fe_"]
prefixes = ["inf_fe_fully_synthetic_gcp_large_last"]
constraints = ["R-max-T5"]
lps = ["0.8"]
datamodules = ["rebel"]
# For every prefix, print the runs matching any (constraint, lp, datamodule) combination.
for prefix in prefixes:
    print(f"Prefix: {prefix}")
    # Reset per prefix: after the loop this holds only the runs of the LAST prefix.
    run_name2wandb_path = {}
    for constraint in constraints:
        for lp in lps:
            for datamodule in datamodules:
                run_name2wandb_path.update(get_runs(prefix, lp, constraint, datamodule))
    pprint(run_name2wandb_path)
    print()

Prefix: inf_fe_fully_synthetic_gcp_large_last
{'inf_fe_fully_synthetic_gcp_large_last_datamodule-rebel_world-genie_t5_tokenizeable_split-test_small_constraint-R-max-T5_lp-0.8': 'epfl-dlab/SynthIE/1ot3pbs0'}



In [8]:
# Hard-coded run name -> wandb path mapping; this assignment replaces whatever the previous cell computed.
run_name2wandb_path = {'inf_fe_fully_synthetic_gcp_large_last_datamodule-rebel_world-genie_t5_tokenizeable_split-test_small_constraint-R-max-T5_lp-0.8': 'epfl-dlab/SynthIE/1ot3pbs0'}

# Useful for the process_results launcher
# Prints the wandb paths space-separated, in reverse insertion order.
print(" ".join(list(run_name2wandb_path.values())[::-1]))

epfl-dlab/SynthIE/1ot3pbs0


In [9]:
# ~~~ Which experiments to consider? ~~~
# model_id_wandb_run_path_pairs = [("R", "martinj96/SynthIE/izr0vgpw"), ("R+S", "martinj96/SynthIE/2f857wrl"), ("R", "martinj96/SynthIE/3274th6j"), ("R+S", "martinj96/SynthIE/runs/22djwbxk")]
def get_model_id(run_name):
    """Build a short model identifier from the markers found in a run name.

    The identifier is the concatenation, in this order, of:
      * "SynthIE" if "_fully_synthetic_" occurs in the name,
      * "GenIE" if "_rebel_" occurs in the name,
      * "(large)" if "_large_" occurs in the name, otherwise "(base)",
      * "-FE" if "_fe_" occurs in the name,
      * "-SC" if "_sc_" occurs in the name.
    """
    family_markers = (("_fully_synthetic_", "SynthIE"), ("_rebel_", "GenIE"))
    linearization_markers = (("_fe_", "-FE"), ("_sc_", "-SC"))

    parts = [label for marker, label in family_markers if marker in run_name]
    parts.append("(large)" if "_large_" in run_name else "(base)")
    parts.extend(label for marker, label in linearization_markers if marker in run_name)

    return "".join(parts)

# (model id, wandb run path) pairs for the selected runs, in reverse insertion order of run_name2wandb_path
model_id_wandb_run_path_pairs = list(reversed([(get_model_id(name), path) for name, path in run_name2wandb_path.items()]))
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ~~~ What to report? ~~~~
PRESENT_QUANTITATIVE_RESULTS = False
PRESENT_MACRO_RESULTS = True  # only consulted when PRESENT_QUANTITATIVE_RESULTS is True

PRESENT_QUALITATIVE_RESULTS = True
SHOW_INPUT_ONLY = True  # show only the `id` and `input` columns of the predictions table
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ~~~ Quantitative results ~~~
metrics = ["triplet_set_precision", "triplet_set_recall", "triplet_set_f1"]

n_bootstrap_samples = 10  # Set to None to present only the corpus level metric (without confidence intervals)
confidence_level = 0.95
confidence_interval_type = "std"
assert confidence_interval_type in ["percentile", "std"]

# Metric identifier -> short label used in tables and plots
metric_name2label = {"triplet_set_precision": "P", "triplet_set_recall": "R", "triplet_set_f1": "F1"}
# Dataset name (as stored in the hydra config) -> display label
dataset_name2dataset_label = {"rebel": "REBEL", "sdg_code_davinci_002": "[SDG] Code Davinci", "sdg_text_davinci_003": "[SDG] Text Davinci"}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# ~~~ Qualitative results ~~~
n_samples = 300  # upper bound on the number of datapoints sampled for the predictions table
n_predictions_to_show = 1
barplot_capsize = 0

# Note that if you change the below flags, the presented predictions will correspond to a processed version of the actual (exact) output
present_triplets_in_the_canonical_linearization = False # the canonical linearization is fully_expanded
keep_duplicate_triplets = True
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~

##### Define Experiment ID

In [10]:
# ~~~ Optional ~~~~
def retrieve_constraint_module_id(hydra_config):
    """Return a short label for the constraint module configured for an experiment.

    Args:
        hydra_config: Mapping with a `['model']['constraint_module']` entry that is
            either `None`, the string `'None'`, or a mapping with an `'identifier'` key.

    Returns:
        "free" when no constraint module is configured; otherwise the display label
        registered for the module identifier, or the identifier itself when no label
        is registered.
    """
    cm_cfg = hydra_config['model']['constraint_module']

    # Identity check for None (rather than `== None`) so the test never goes through a custom __eq__.
    if cm_cfg is None or cm_cfg == 'None':
        return "free"

    identifier = cm_cfg['identifier']
    identifier2label = {'genie': 'R-max'}
    return identifier2label.get(identifier, identifier)

def retrieve_lenght_penalty(hydra_config):
    """Read the generation length penalty from an experiment's hydra config.

    Walks `model -> hparams_overrides -> inference -> hf_generation_params -> length_penalty`.
    """
    node = hydra_config
    for key in ('model', 'hparams_overrides', 'inference', 'hf_generation_params', 'length_penalty'):
        node = node[key]
    return node
# ~~~~~~~~~~~~~~~~~~

# The names of these functions should not change    
def get_exp_id(model_id, hydra_config):
    """Compose the experiment identifier shown in tables, plot legends and column names.

    Format: "<model_id> -- (<constraint label>) lp_<length penalty> [<dataset label>]".
    """
    constraint_label = retrieve_constraint_module_id(hydra_config)
    length_penalty = retrieve_lenght_penalty(hydra_config)
    dataset_label = dataset_name2dataset_label[retrieve_dataset_name(hydra_config)]
    return f"{model_id} -- ({constraint_label}) lp_{length_penalty} [{dataset_label}]"

def retrieve_dataset_name(hydra_config):
    """Return the datamodule name stored under `['datamodule']['name']` in the hydra config."""
    datamodule_cfg = hydra_config['datamodule']
    return datamodule_cfg['name']

# Optional extra columns of the results table: column name -> getter reading the value from an experiment's hydra config
results_df_field2getter = {"LP": retrieve_lenght_penalty, "Constraint": retrieve_constraint_module_id}
# Columns the long results frame is sorted by before plotting (this fixes the order of the bars within a metric)
barplot_hue_ordering = ['Model', 'LP', 'Constraint']
# Index levels the results table is sorted by; note this is the SAME list object as barplot_hue_ordering
results_ordering = barplot_hue_ordering

## Main Part

In [11]:
# Per-experiment state, all keyed by exp_id (see get_exp_id).
# Two runs that produce the same exp_id overwrite each other in every dict below.
wandb_configs = {}
hydra_configs = {}
abs_exp_dirs = {}

output_datasets = {}
results = {}

exp_id2wandb_run_path = {}
exp_id2model_id = {}

for model_id, wandb_run_path in model_id_wandb_run_path_pairs:
    # NOTE(review): `api` and `run` are not used anywhere else in the visible notebook — confirm whether this lookup is still needed
    api = wandb.Api()
    run = api.run(wandb_run_path)
    wandb_config, hydra_config, abs_exp_dir = evaluation_helpers.prepare_data_for_experiment(wandb_run_path, work_dir, print)
    exp_id = get_exp_id(model_id, hydra_config)
    exp_id2wandb_run_path[exp_id] = wandb_run_path
    exp_id2model_id[exp_id] = model_id

    wandb_configs[exp_id], hydra_configs[exp_id], abs_exp_dirs[exp_id] = wandb_config, hydra_config, abs_exp_dir

    # Load predictions data
    linearization_class_id = hydra_configs[exp_id]['datamodule'].get("linearization_class_id", None)
    if linearization_class_id is None:
        # Left for backward compatibility
        print("Linearization class ID not specified. Using the default one `fully_expanded_et`")
        linearization_class_id = "fully_expanded_et"
    # Predictions are read from the "predictions" sub-directory of the experiment directory
    data_dir = os.path.join(abs_exp_dirs[exp_id], "predictions")
    output_datasets[exp_id] = IEGenericOutputDataset(data_dir=data_dir, seed=123, linearization_class_id=linearization_class_id)

    # Load existing results data
    results[exp_id] = evaluation_helpers.Results(abs_exp_dirs[exp_id])

# Experiment ids in loading order
exp_ids = list(results.keys())

Experiment directory already exists: /Users/josifosk/Documents/PhD/SynthIE_main/logs/inference/runs/inf_fe_fully_synthetic_gcp_large_last_datamodule-rebel_world-genie_t5_tokenizeable_split-test_small_constraint-R-max-T5_lp-0.8/2023-01-29_23-09-11
Loading the existing results


In [12]:
def display_results(metrics, metric_labels):
    """Display a table of scores: one row per experiment, one column per (dataset, metric).

    Scores are shown as percentages rounded to two decimals. When
    `n_bootstrap_samples` is not None each cell also carries the confidence
    interval as "score [lower, upper]".

    Relies on notebook-level state defined in earlier cells: `results`,
    `hydra_configs`, `exp_id2wandb_run_path`, `exp_id2model_id`,
    `results_df_field2getter`, `results_ordering`, `dataset_name2dataset_label`,
    `n_bootstrap_samples`, `confidence_level` and `confidence_interval_type`.

    Args:
        metrics: Metric identifiers queried from each experiment's results object.
        metric_labels: Display labels aligned one-to-one with `metrics`; they also
            determine the order of the metric columns in the table.

    Raises:
        ValueError: If `confidence_interval_type` is neither "percentile" nor "std".
    """
    metric_labels = list(metric_labels)
    with_ci = n_bootstrap_samples is not None

    # Retrieve information
    results_dfs = []
    for exp_id in exp_id2wandb_run_path.keys():
        curr_results = results[exp_id]

        if not with_ci:
            y = [curr_results.get_score(metric_id, per_bucket=False) for metric_id in metrics]
            yerr = None
        else:
            y = []
            yerr = []
            for metric_id in metrics:
                if confidence_interval_type == "percentile":
                    lower, mean, upper = curr_results.get_percentile_based_ci(metric_id,
                                                                              confidence_level=confidence_level,
                                                                              n_bootstrap_samples=n_bootstrap_samples,
                                                                              dataset_id=None)
                elif confidence_interval_type == "std":
                    lower, mean, upper = curr_results.get_std_based_ci(metric_id,
                                                                       n_bootstrap_samples=n_bootstrap_samples,
                                                                       dataset_id=None)
                else:
                    raise ValueError(f"Unknown confidence interval type: {confidence_interval_type}")
                y.append(mean)
                # Stored as distances from the mean (not absolute bounds)
                yerr.append((mean-lower, upper-mean))

            # Row 0: distances to the lower bounds, row 1: distances to the upper bounds
            yerr = np.array(yerr).T

        # create a dataframe
        if yerr is not None:
            df = pd.DataFrame({'Metric': metric_labels, 'Score': y, 'ci_score_lower': yerr[0, :], 'ci_score_upper': yerr[1, :]})
        else:
            df = pd.DataFrame({'Metric': metric_labels, 'Score': y})

        # required fields
        df['Exp_ID'] = [exp_id] * df.shape[0]
        df['Dataset'] = [retrieve_dataset_name(hydra_configs[exp_id])] * df.shape[0]
        df['Model'] = [exp_id2model_id[exp_id]] * df.shape[0]

        # optional fields
        if results_df_field2getter is not None:
            for field_name, getter in results_df_field2getter.items():
                df[field_name] = [getter(hydra_configs[exp_id])] * df.shape[0]
        results_dfs.append(df)

    results_df = pd.concat(results_dfs, axis=0)

    # Massage the data: build one wide (experiment x metric) table per dataset
    dataset_names = results_df['Dataset'].unique()
    per_dataset_results_wide = {}

    for dataset_name in dataset_names:
        dataset_long_df = results_df[results_df['Dataset'] == dataset_name].copy()
        score_pct = (dataset_long_df['Score']*100).round(2).astype(str)
        if with_ci:
            # Turn the stored distances back into absolute interval bounds
            lower_pct = ((dataset_long_df['Score']-dataset_long_df['ci_score_lower'])*100).round(2).astype(str)
            upper_pct = ((dataset_long_df['Score'] + dataset_long_df['ci_score_upper'])*100).round(2).astype(str)
            dataset_long_df['Score_with_ci'] = score_pct + " [" + lower_pct + ", " + upper_pct + "]"
            scores_column = 'Score_with_ci'
        else:
            dataset_long_df['Score_without_ci'] = score_pct
            scores_column = 'Score_without_ci'

        pivot_index = ['Model']
        if results_df_field2getter is not None:
            pivot_index.extend(list(results_df_field2getter.keys()))
        # Cells are strings, so experiments sharing the same index values are joined with a space
        dataset_wide_df = dataset_long_df.pivot_table(index=pivot_index, columns='Metric', values=scores_column, aggfunc=lambda x: ' '.join(x))
        # Order the metric columns as requested by the caller
        dataset_wide_df = dataset_wide_df[metric_labels]
        per_dataset_results_wide[dataset_name] = dataset_wide_df

    results_wide_df = pd.concat(per_dataset_results_wide.values(), keys=per_dataset_results_wide.keys(), names=['Dataset'], axis=1)
    results_wide_df = results_wide_df.sort_index(axis=0, level=results_ordering)
    # Only advertise a confidence interval in the header when one is actually shown
    metric_level_name = f'Metric [{int(confidence_level*100)}% CI]' if with_ci else 'Metric'
    results_wide_df.columns = pd.MultiIndex.from_tuples([(dataset_name2dataset_label[dataset_name], metric) for dataset_name, metric in results_wide_df.columns.values], names=['Dataset', metric_level_name])

    display(results_wide_df)

if PRESENT_QUANTITATIVE_RESULTS:
    print("~~~ Micro ~~~")
    metric_labels = [metric_name2label[metric_id] for metric_id in metrics]
    display_results(metrics, metric_labels)

    # Print the header only when the macro table is actually shown
    if PRESENT_MACRO_RESULTS:
        print("~~~ Macro ~~~~")
        # Macro metrics share the labels of their micro counterparts
        macro_metrics = [f"macro_{metric}" for metric in metrics]
        display_results(macro_metrics, metric_labels)

In [13]:
def display_barplots(metrics, metric_labels):
    """Draw one grouped bar plot per dataset: metrics on the x axis, one bar per experiment.

    When `n_bootstrap_samples` is not None, confidence-interval error bars are
    drawn on top of the bars.

    Relies on notebook-level state defined in earlier cells: `results`,
    `hydra_configs`, `exp_id2wandb_run_path`, `exp_id2model_id`,
    `results_df_field2getter`, `barplot_hue_ordering`, `barplot_capsize`,
    `dataset_name2dataset_label`, `n_bootstrap_samples`, `confidence_level`
    and `confidence_interval_type`.

    Args:
        metrics: Metric identifiers queried from each experiment's results object.
        metric_labels: Display labels aligned one-to-one with `metrics`; they also
            determine the left-to-right order of the metric groups.

    Raises:
        ValueError: If `confidence_interval_type` is neither "percentile" nor "std".
    """
    metric_labels = list(metric_labels)

    # Retrieve information
    results_dfs = []
    for exp_id in exp_id2wandb_run_path.keys():
        curr_results = results[exp_id]

        if n_bootstrap_samples is None:
            y = [curr_results.get_score(metric_id, per_bucket=False) for metric_id in metrics]
            yerr = None
        else:
            y = []
            yerr = []
            for metric_id in metrics:
                if confidence_interval_type == "percentile":
                    lower, mean, upper = curr_results.get_percentile_based_ci(metric_id,
                                                                              confidence_level=confidence_level,
                                                                              n_bootstrap_samples=n_bootstrap_samples,
                                                                              dataset_id=None)
                elif confidence_interval_type == "std":
                    lower, mean, upper = curr_results.get_std_based_ci(metric_id,
                                                                       n_bootstrap_samples=n_bootstrap_samples,
                                                                       dataset_id=None)
                else:
                    raise ValueError(f"Unknown confidence interval type: {confidence_interval_type}")
                y.append(mean)
                # Stored as distances from the mean, which is the form `ax.errorbar(yerr=...)` receives below
                yerr.append((mean-lower, upper-mean))

            # Row 0: distances to the lower bounds, row 1: distances to the upper bounds
            yerr = np.array(yerr).T

        # create a dataframe
        if yerr is not None:
            df = pd.DataFrame({'Metric': metric_labels, 'Score': y, 'ci_score_lower': yerr[0, :], 'ci_score_upper': yerr[1, :]})
        else:
            df = pd.DataFrame({'Metric': metric_labels, 'Score': y})

        # required fields
        df['Exp_ID'] = [exp_id] * df.shape[0]
        df['Dataset'] = [retrieve_dataset_name(hydra_configs[exp_id])] * df.shape[0]
        df['Model'] = [exp_id2model_id[exp_id]] * df.shape[0]

        # optional fields
        if results_df_field2getter is not None:
            for field_name, getter in results_df_field2getter.items():
                df[field_name] = [getter(hydra_configs[exp_id])] * df.shape[0]
        results_dfs.append(df)

    results_df = pd.concat(results_dfs, axis=0)

    # Split the long-format results per dataset (the plots need no wide table)
    dataset_names = results_df['Dataset'].unique()
    per_dataset_results_long = {
        dataset_name: results_df[results_df['Dataset'] == dataset_name].copy()
        for dataset_name in dataset_names
    }

    # get a subplot for each dataset
    figsize = (10, 7 * len(dataset_names))
    fig, axes = plt.subplots(len(dataset_names), 1, figsize=figsize, sharex=True, sharey=True)

    # plt.subplots returns a bare Axes (not a sequence) for a single subplot
    if len(dataset_names) == 1:
        axes = [axes]

    for ax, dataset_name in zip(axes, dataset_names):
        dataset_results_long_df = per_dataset_results_long[dataset_name]
        # sort the dataframe according to the columns in barplot_hue_ordering
        if barplot_hue_ordering is not None:
            dataset_results_long_df = dataset_results_long_df.sort_values(by=barplot_hue_ordering)
        dataset_results_long_df = dataset_results_long_df.reset_index(drop=True)

        hue_order = dataset_results_long_df['Exp_ID'].drop_duplicates().values.tolist()
        order = metric_labels
        sns.barplot(ax=ax, data=dataset_results_long_df, x='Metric', y="Score", hue='Exp_ID', edgecolor="black", palette='deep', order=order, hue_order=hue_order)

        ax.set_title(dataset_name2dataset_label[dataset_name])
        ax.set_ylabel("Score")
        ax.set_ylim(0,1)
        # NOTE(review): the first subplot never gets an x label, so a single-dataset figure has none — confirm this is intended
        if ax != axes[0]:
            ax.set_xlabel("Metric")
        else:
            ax.set_xlabel("")

        # remove the box around the legend
        ax.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=1, frameon=False)

        if 'ci_score_lower' in dataset_results_long_df.columns:
            # Error bars are centred on the drawn bars.
            # NOTE(review): assumes ax.patches holds exactly one patch per bar, ordered by hue and then by metric — confirm for the installed seaborn version
            x_coords = [p.get_x() + 0.5 * p.get_width() for p in ax.patches]
            y_coords = [p.get_height() for p in ax.patches]

            # Ordered categoricals make the sort below follow the plotting order instead of alphabetical order
            dataset_results_long_df['Exp_ID'] = pd.Categorical(dataset_results_long_df['Exp_ID'], categories=hue_order, ordered=True)
            dataset_results_long_df['Metric'] = pd.Categorical(dataset_results_long_df['Metric'], categories=order, ordered=True)
            sorted_results_df = dataset_results_long_df.sort_values(by=['Exp_ID', 'Metric'])
            error = np.stack([sorted_results_df['ci_score_lower'].values, sorted_results_df['ci_score_upper'].values])
            ax.errorbar(x=x_coords, y=y_coords, yerr=error, fmt="none", color="black", capsize=barplot_capsize, zorder=10000)

    plt.show()

# Figure for the table above
if PRESENT_QUANTITATIVE_RESULTS:
    print("~~~ Micro ~~~")
    metric_labels = [metric_name2label[metric_id] for metric_id in metrics]
    display_barplots(metrics, metric_labels)

    # Print the header only when the macro figure is actually shown
    if PRESENT_MACRO_RESULTS:
        print("~~~ Macro ~~~~")
        # Macro metrics share the labels of their micro counterparts
        macro_metrics = [f"macro_{metric}" for metric in metrics]
        display_barplots(macro_metrics, metric_labels)

In [14]:
def get_predictions_df(exp_ids, seed):
    """Build a table with the inputs, targets and predictions of a random sample of datapoints.

    The sample is drawn once, from the output dataset of the first experiment
    (at most `n_samples` datapoints, without replacement), and the same datapoint
    ids are then looked up in every experiment. The per-experiment tables are
    inner-joined on (`id`, `input`, `target`).

    Relies on notebook-level state: `output_datasets`, `n_samples`,
    `n_predictions_to_show`, `present_triplets_in_the_canonical_linearization`
    and `keep_duplicate_triplets`.

    Side effect: when predictions are post-processed (canonical linearization
    requested or duplicate triplets dropped) the global `n_predictions_to_show`
    is forced to 1.

    Args:
        exp_ids: Non-empty sequence of experiment ids (keys of `output_datasets`).
        seed: Seed for the random choice of datapoints.

    Returns:
        DataFrame sorted by `id` with columns `id`, `input`, `target` and one
        `prediction_<exp_id>` column per experiment.

    Raises:
        ValueError: If `exp_ids` is empty.
        AssertionError: If the joined table does not contain every sampled id.
    """
    global n_predictions_to_show

    if len(exp_ids) == 0:
        raise ValueError("get_predictions_df requires at least one experiment id")

    # Loop-invariant: decided by the notebook flags, not by the individual experiment
    postprocess_triplets = present_triplets_in_the_canonical_linearization or not keep_duplicate_triplets
    if postprocess_triplets and n_predictions_to_show != 1:
        n_predictions_to_show = 1
        print(f"present_triplets_in_the_canonical_linearization={present_triplets_in_the_canonical_linearization} or keep_duplicate_triplets={keep_duplicate_triplets} supported only when n_predictions_to_show=1. Setting n_predictions_to_show=1")

    random_ids = None
    pred_dfs = []

    for exp_id in exp_ids:
        output_dataset = output_datasets[exp_id]

        # Sample the datapoint ids once, using the first experiment's dataset
        if random_ids is None:
            random_indices = np.random.RandomState(seed).choice(len(output_dataset), min([n_samples, len(output_dataset)]), replace=False)
            random_ids = set(output_dataset.data[i]['id'] for i in random_indices)

        random_sample = [dp for dp in output_dataset.data if dp['id'] in random_ids]

        ids = [sample['id'] for sample in random_sample]
        inputs = [sample['input'] for sample in random_sample]
        targets = [sample['target'] for sample in random_sample]
        # A single string when one prediction is shown, otherwise a list of the top predictions
        predictions = [sample['prediction'][0] if n_predictions_to_show == 1 else sample['prediction'][:n_predictions_to_show] for sample in random_sample]

        if postprocess_triplets:
            # Parse the texts into triplets and linearize them again
            targets_triplets = [output_dataset.get_text_triples(target, return_set=(not keep_duplicate_triplets)) for target in targets]
            predictions_triplets = [output_dataset.get_text_triples(prediction, return_set=(not keep_duplicate_triplets)) for prediction in predictions]
            if present_triplets_in_the_canonical_linearization:
                lc = get_linearization_class("fully_expanded")
            else:
                lc = output_dataset.linearization_class

            # NOTE(review): targets are re-linearized per experiment; if they end up differing between experiments the inner join below drops rows — confirm
            targets = [lc.triplet_list_to_text(triplets)[0] for triplets in targets_triplets]
            predictions = [lc.triplet_list_to_text(triplets)[0] for triplets in predictions_triplets]

        df = pd.DataFrame(list(zip(ids, inputs, targets, predictions)), columns=['id', 'input', 'target', f'prediction_{exp_id}'])
        pred_dfs.append(df)

    predictions_df = reduce(lambda left, right: pd.merge(left, right, on=['id', 'input', 'target'], how='inner'), pred_dfs)
    predictions_df = predictions_df.sort_values(by=['id'])
    assert predictions_df.shape[0] == len(random_ids), "some sampled datapoints are missing from the joined predictions table"
    return predictions_df


def wrap_df_text(df, input_max_col, show_input_only):
    """Render a predictions table as HTML with wrapped inputs and one triplet per line.

    The `input` column is wrapped at `input_max_col` characters. Unless
    `show_input_only` is set, every "[e]" / "[et]" marker in the `target` and
    `prediction*` columns is replaced by "[e]" followed by a line break. The
    input frame is not modified.

    Args:
        df: DataFrame with `id`, `input`, `target` and `prediction*` columns.
            Prediction cells are either strings or sequences of strings.
        input_max_col: Maximum line width used when wrapping the input text.
        show_input_only: If True, display only the `id` and `input` columns.
    """
    # Raw string: the backslashes belong to the regex, not to Python string escapes
    marker_pattern = r"\[e\]|\[et\]"
    # "[e]" followed by the two characters backslash + n (a line-break placeholder)
    marker_replacement = "[e]\\n"

    tdf = df.copy()
    tdf['input'] = tdf['input'].str.wrap(input_max_col)
    if show_input_only:
        tdf = tdf[['id', 'input']]
    else:
        tdf['target'] = tdf['target'].str.replace(marker_pattern, marker_replacement, regex=True)

        for col in tdf.columns:
            if not col.startswith("prediction") or tdf[col].empty:
                continue
            # Positional access: after filtering, the index need not contain the label 0
            if isinstance(tdf[col].iloc[0], str):
                tdf[col] = tdf[col].str.replace(marker_pattern, marker_replacement, regex=True)
            else:
                # Several predictions per datapoint: format each one and stack them
                tdf[col] = tdf[col].apply(lambda x: "\\n".join([re.sub(marker_pattern, marker_replacement, p) for p in x]))

    # Every literal backslash-n sequence in the generated HTML becomes an HTML line break
    display(HTML(tdf.to_html().replace("\\n","<br>")))


if PRESENT_QUALITATIVE_RESULTS:
    seed = 123

    # get the dataset_name for each experiment
    dataset_names = [retrieve_dataset_name(hydra_configs[exp_id]) for exp_id in exp_ids]

    # dict.fromkeys drops duplicates but keeps first-seen order, so the sections (and the
    # `predictions_df` left behind for the following cells) are the same on every run
    for dataset_name in dict.fromkeys(dataset_names):
        print(f"~~~~ {dataset_name2dataset_label[dataset_name]} ~~~~")
        exp_ids_to_visualize = [exp_id for exp_id, exp_dataset_name in zip(exp_ids, dataset_names) if exp_dataset_name  == dataset_name]
        print(exp_ids_to_visualize)
        predictions_df = get_predictions_df(exp_ids_to_visualize, seed=seed)

        with pd.option_context('display.max_rows', None, 'display.max_columns', None, "display.expand_frame_repr", False, 'display.max_colwidth', None, 'display.width', None):        
            wrap_df_text(predictions_df, 50, SHOW_INPUT_ONLY)

~~~~ REBEL ~~~~
['SynthIE(large)-FE -- (genie_t5_tokenizeable) lp_0.8 [REBEL]']


Unnamed: 0,id,input
0,234,"Dwan Hurt (March 29, 1963 – November 25, 2016) was an American basketball coach and dean at Junipero Serra High School, was named 2010 State Coach of the Year by Cal-Hi Sports; Daily Breeze Coach of the Year 2010; and coached his high school alma mater, the Serra Cavaliers, to the 2010 California Interscholastic Federation Division III Boys Basketball State Championship."
1,585,Sidi Semiane is a town and commune in Tipaza Province in northern Algeria.
2,689,"Kinnikinnick Creek is a shallow waterway that is part of the Scioto River watershed, flowing through southern Pickaway and northern Ross Counties in Ohio."
3,790,"Bob Berkowitz is an American journalist, talk show host, and author."
4,866,"The Ruban Jaune ""(English; Yellow Ribbon)"" is a cycling title created in 1936 by Henri Desgrange, awarded to the rider recording the fastest average speed in a professional cycling race or stage longer than 200 km."
5,890,"Mount Lanning () is a mountain, high, located at the south side of Newcomer Glacier, southeast of Mount Warren, in the northern portion of the Sentinel Range, Ellsworth Mountains, Antarctica."
6,981,Bright Ideas is Portastatic's fifth studio album.
7,1206,Santiago Brouard or Santi Brouard (1919 in Lekeitio – 20 November 1984 in Bilbao) was a doctor and Basque politician.
8,1217,"Buchanan Point is a headland north-west of Cape Dundas and south-east of Mackintosh Cove, at the north-eastern end of Laurie Island in the South Orkney Islands of Antarctica."
9,1321,"It is a tributary of the Silk Stream, which is a tributary of the River Brent, which is a tributary of the River Thames."


In [28]:
# Hand-picked datapoint ids to keep in the predictions table.
ids_to_keep = [
    234,
    689,
    890,
    981,
    1699,
    1742,
    2009,
    2033,
    2560,
    2790,
    3486,
    3736,
    4476,
    6235,
    7172,
    8559,
    10141,
    10092,
    10443,
    11863,
    13175,
    17234,
    17823,
    27430,
    30120,
    35568,
    35660,
    38001,
    38596,
    39176,
    36891,
    35684,
    20664,
    14336,
    14223,
    12508,
    9732,
    33357,
    34872,
    28741,
    25635,
    24079,
    20308,
    6996,
    6469,
    5360,
    2186,
    33494,
    15676,
    30642,
    14762,
]

# Rebound to a set: drops any duplicate ids and makes membership tests cheap.
ids_to_keep = set(ids_to_keep)
# Number of distinct ids
print(len(ids_to_keep))

51


In [27]:
# Series.isin always yields a boolean mask (also for an empty frame), so the indexing below selects rows
mask_to_keep = predictions_df.id.isin(ids_to_keep)
# NOTE: overwrites predictions_df with the filtered rows; re-run the cell that builds it to get the full sample back
predictions_df = predictions_df[mask_to_keep]
wrap_df_text(predictions_df, 50, SHOW_INPUT_ONLY)

Unnamed: 0,id,input
0,234,"Dwan Hurt (March 29, 1963 – November 25, 2016) was an American basketball coach and dean at Junipero Serra High School, was named 2010 State Coach of the Year by Cal-Hi Sports; Daily Breeze Coach of the Year 2010; and coached his high school alma mater, the Serra Cavaliers, to the 2010 California Interscholastic Federation Division III Boys Basketball State Championship."
2,689,"Kinnikinnick Creek is a shallow waterway that is part of the Scioto River watershed, flowing through southern Pickaway and northern Ross Counties in Ohio."
5,890,"Mount Lanning () is a mountain, high, located at the south side of Newcomer Glacier, southeast of Mount Warren, in the northern portion of the Sentinel Range, Ellsworth Mountains, Antarctica."
6,981,Bright Ideas is Portastatic's fifth studio album.
12,1699,Dark Scavenger is a point-and-click adventure role-playing game video game published and developed by Canadian indie studio Psydra Games.
13,1742,"Iris Rezende Machado (born 22 December 1933) is a Brazilian politician, member of the Brazilian Democratic Movement (MDB)."
15,2009,"Dennis Janke (born April 13, 1950 in Cleveland, Ohio) is an American comic book artist who was active in the industry from the mid-1980s to the mid-2000s, primarily as an inker."
16,2033,Bastian Kersaudy (born 9 June 1994) is a French badminton player from the Chambly Oise club.
20,2186,"Irmgard Bensusan (born 24 January 1991) is a South African born Paralympic sprinter who now competes for Germany, mainly in T44 classification events."
22,2560,"Cappellari Glacier () is a glacier long in the Hays Mountains, flowing west from the northwest shoulder of Mount Vaughan to enter Amundsen Glacier just north of Mount Dort."


In [37]:
# Export the (filtered) predictions table as a JSON-lines file for the human evaluation.
from pathlib import Path

data_dir = "../data/"
output_file_name = "human_eval_2_dps.jsonl"
path_to_output_dir = os.path.join(data_dir, "human_eval")
path_to_output_file = os.path.join(path_to_output_dir, output_file_name)
# parents=False: only the last directory level is created, so `data_dir` itself must already exist
Path(path_to_output_dir).mkdir(parents=False, exist_ok=True)
# One JSON object per row, one row per line
predictions_df.to_json(orient="records", lines=True, path_or_buf=path_to_output_file)

In [38]:
# Read the exported file back (same path the export cell wrote to)
dps = general_helpers.read_jsonlines(path_to_output_file)