In [48]:
%load_ext autoreload
%autoreload 2
import os
import sys
from collections import defaultdict
from copy import deepcopy
from os.path import join
from typing import List, Tuple

import datasets
import imodelsx.process_results
import joblib
import matplotlib.pyplot as plt
import numpy as np
import openai
import pandas as pd
import seaborn as sns
from tqdm import tqdm

import clin.eval
import clin.llm
import clin.modules.ebm.extract
import clin.modules.ebm.omission
import clin.modules.ebm.prune
import clin.parse
from clin.config import PATH_REPO
from clin.modules import ebm

openai.api_key_path = '/home/chansingh/.OPENAI_KEY'
results_dir = '../results/'

# Load the cleaned EBM interventions table and keep only its first 100 rows.
df = joblib.load(join(PATH_REPO, 'data', 'ebm', 'ebm_interventions_cleaned.pkl')).iloc[:100]

# Row positions of `df` in a fixed pseudo-random order (seeded, shuffled in place).
nums = list(range(len(df)))
np.random.default_rng(seed=13).shuffle(nums)

# `dfe` contains the same rows as `df`, reordered according to `nums`.
dfe = df.iloc[nums]
n = len(dfe)

# Language model handle passed to the extractor and verifiers in later cells.
llm = clin.llm.get_llm('text-davinci-003')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# State shared with the cells below: `r` collects every result, while
# `n_shots` and `extractor` are reused for the run over all documents.
n_shots = 5
extractor = ebm.extract.Extractor()
r = {}

# Run the extractor once, for index 0 only.
i = 0
interventions = extractor(i, df, nums, n_shots, llm)

In [54]:
ov = ebm.omission.OmissionVerifier()
pv = ebm.prune.PruneVerifier()

# Extracted bullet list for every row index of `df`.
r["list_original"] = [
    extractor(idx, df, nums, n_shots, llm) for idx in tqdm(range(len(df)))
]

# (output key, verifier, input key). Order matters: "list_ov_pv" consumes
# the "list_ov" result produced by an earlier pass.
verifier_passes = [
    ("list_ov", ov, "list_original"),
    ("list_pv", pv, "list_original"),
    ("list_ov_pv", pv, "list_ov"),
]
for out_key, verifier, in_key in verifier_passes:
    r[out_key] = [
        verifier(dfe.iloc[idx]["doc"], bullet_list=r[in_key][idx], llm=llm)
        for idx in tqdm(range(n))
    ]

100%|██████████| 100/100 [00:00<00:00, 2762.45it/s]
100%|██████████| 100/100 [00:00<00:00, 10467.44it/s]
100%|██████████| 100/100 [00:00<00:00, 8900.57it/s]
100%|██████████| 100/100 [00:00<00:00, 7037.42it/s]


# Evaluate

In [55]:
def process_ebm_lists(
    pred_list: List[str],
    gt_list: List[str],
    verbose=False,
) -> Tuple[List[str], List[str]]:
    """Normalize predicted and ground-truth strings before they are compared.

    Every entry of both lists is stripped of leading/trailing whitespace and
    lower-cased. List lengths and element order are left unchanged.

    Params
    ------
    pred_list: List[str]
        Predicted strings.
    gt_list: List[str]
        Ground-truth strings.
    verbose: bool
        Currently unused; kept so existing calls that pass it keep working.

    Returns
    -------
    (pred_list, gt_list): Tuple[List[str], List[str]]
        The normalized predictions and the normalized ground truth, in that
        order, so the result can be unpacked straight into a two-argument call.
    """
    pred_list = [pred.strip().lower() for pred in pred_list]
    gt_list = [gt.strip().lower() for gt in gt_list]
    return pred_list, gt_list

# Score every stored prediction list against the ground-truth interventions.
# The keys are snapshotted up front because the loop adds new entries to `r`.
ks_list = [k for k in r.keys() if k.startswith("list_")]
for k in ks_list:
    # One entry per document: predictions vs. ground truth after normalization.
    mets_df = pd.DataFrame(
        [
            clin.eval.calculate_precision_recall_from_lists(
                *process_ebm_lists(r[k][i], dfe.iloc[i]["interventions"]),
                verbose=False,
            )
            for i in range(len(dfe))
        ]
    )
    mets_dict_single = clin.eval.aggregate_precision_recall(mets_df)
    # Store each aggregate under "<metric>___<list key>"; the "___" separator
    # is what the results table splits on.
    for k_met in mets_dict_single.keys():
        r[k_met + "___" + k] = mets_dict_single[k_met]
    # No `"list_" + k` alias is stored: every key here already starts with
    # "list_", so such an alias would add "list_list_*" entries that a re-run
    # of this cell would pick up and score again as duplicate rows.

In [61]:
# Gather the aggregated metric entries ("<metric>___<list key>") from `r`
# into a single-row frame.
metric_values = pd.Series({key: r[key] for key in r.keys() if "___" in key}).round(3)
row_df = pd.DataFrame(metric_values).T
rc = row_df[[col for col in row_df.columns if '___' in col]]

# Split each "<metric>___<list key>" column name into a two-level column index.
rc = rc.rename(columns={col: tuple(col.split('___')) for col in rc.columns})
rc.columns = pd.MultiIndex.from_tuples(rc.columns)

# Long form (one row per metric/list pair), then pivot so that each list key
# is a row and each metric is a column.
long_df = rc.T.reset_index().rename(columns={'level_0': '', 'level_1': 'Verifiers'})
rc = long_df.pivot_table(index='Verifiers', columns='', values=0).round(3)
rc.style.format(precision=3).background_gradient(cmap='gray')

Unnamed: 0_level_0,f1,precision,recall
Verifiers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
list_original,0.458,0.441,0.476
list_ov,0.435,0.388,0.495
list_ov_pv,0.436,0.389,0.495
list_pv,0.458,0.441,0.476
original,0.458,0.441,0.476
ov,0.435,0.388,0.495


### Look at validation data

In [None]:
# Everything after the first 100 rows of the cleaned table.
dfv = joblib.load(join(PATH_REPO, 'data', 'ebm', 'ebm_interventions_cleaned.pkl')).iloc[100:]

# Print each document followed by its interventions as a bullet string,
# with a blank line between examples.
for doc, interventions in zip(dfv['doc'], dfv['interventions']):
    print(doc)
    print(clin.parse.list_to_bullet_str(interventions))
    print()