In [None]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
from datasets import load_dataset, load_metric
from utils.results import *
from data_sets.data_utils import load_hsd_dataset,  get_suite
import numpy as np
import pandas as pd
import config
import collections

In [None]:
def get_diffs(df, ai_rules=False, score="seen"):
    """Subtract reference-method rows from rule-augmented-method rows.

    The frame is indexed by ("model", "method", "score") and sorted with the
    externally defined ``order`` mapping (not defined in this notebook;
    presumably provided by the ``utils.results`` star import). For every
    (method, reference) pair, the rows of ``method`` recorded under ``score``
    have the reference rows subtracted from them.

    The subtraction is positional (``.values``), so both selections must
    contain the same number of rows in the same order after sorting.

    Args:
        df: Frame with "model", "method" and "score" columns; every other
            column is treated as a value to subtract.
        ai_rules: If False, compare the rule-augmented methods against the
            "Task" / "Task+Ex" references. If True, compare
            "Task+Rules(chatGPT)+Ex" against "Task+Rules+Ex".
        score: Score label selecting the rows of the left-hand method.

    Returns:
        The per-pair difference frames stacked row-wise, or an empty
        DataFrame when there is nothing to compare.
    """
    indexed = df.set_index(["model", "method", "score"]).sort_index(key=lambda x: x.map(order))

    if ai_rules:
        comps = [("Task+Rules(chatGPT)+Ex", "Task+Rules+Ex")]
    else:
        comps = [("Task+Rules", "Task"),
                 ("Task+Rules+Ex", "Task+Ex"),
                 ("Task+Rules(chatGPT)+Ex", "Task+Ex"),
                 ("Task+Rules+Rat", "Task"),
                 ("Task+Rules+Ex+Rat", "Task+Ex")]

    methods = indexed.index.get_level_values(1)
    at_score = indexed.index.get_level_values(2) == score

    pair_diffs = []
    for a, b in comps:
        minuend = indexed[(methods == a) & at_score]

        # Use the reference rows recorded under the same score when there are
        # any. Otherwise fall back to every row of the reference method
        # (presumably the baselines, which are stored under their own score
        # label -- TODO confirm). Without the same-score restriction a
        # reference that exists under several scores yields more rows than
        # the left-hand side and the positional subtraction fails.
        is_reference = methods == b
        same_score_reference = is_reference & at_score
        if same_score_reference.any():
            subtrahend = indexed[same_score_reference]
        else:
            subtrahend = indexed[is_reference]

        pair_diffs.append(minuend - subtrahend.values)

    if not pair_diffs:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(pair_diffs, axis=0)

In [None]:
def get_all_diffs(results, return_df=False):
    """Summarise mean differences between evaluation settings.

    All frames in ``results`` are concatenated and passed through
    ``process_df``. Six difference frames are then built and reduced to their
    column-wise mean, rounded to two decimals:

    * "s-b", "f-b", "c-b": ``get_diffs`` with score "seen", "funcOut" and
      "classOut" respectively.
    * "s-f", "s-c", "f-c": seen - funcOut, seen - classOut and
      funcOut - classOut, subtracted positionally after sorting each subset
      by ("model", "method").

    Args:
        results: Mapping whose values are DataFrames.
        return_df: When True, also return the processed frame.

    Returns:
        The summary frame, or ``(summary, processed_frame)`` when
        ``return_df`` is True.
    """
    df = process_df(pd.concat(list(results.values())))

    def _rows_for(label):
        # Rows of one score setting ordered by (model, method), without the
        # last three columns (presumably model/method/score -- TODO confirm).
        subset = df[df.score == label]
        return subset.sort_values(["model", "method"]).iloc[:, :-3]

    labelled = {
        "s-b": get_diffs(df),
        "f-b": get_diffs(df, score="funcOut"),
        "c-b": get_diffs(df, score="classOut"),
    }

    seen_rows = _rows_for("seen")
    func_rows = _rows_for("funcOut")
    class_rows = _rows_for("classOut")
    labelled["s-f"] = seen_rows - func_rows.values
    labelled["s-c"] = seen_rows - class_rows.values
    labelled["f-c"] = func_rows - class_rows.values

    means = [frame.mean().round(2) for frame in labelled.values()]
    diffs_df = pd.concat(means, axis=1)
    # The concatenated columns are numbered 0..5; map them to the short labels.
    diffs_df = diffs_df.rename(columns=dict(enumerate(labelled)))

    if return_df:
        return diffs_df, df
    return diffs_df

In [None]:
def get_sample_diffs(task, result_path, suite_test, path):
    """Build per-sample score differences aligned with predictions and the suite.

    Args:
        task: Task identifier forwarded to ``suite_hits_df`` (the notebook
            calls this with "pi" and "sa").
        result_path: Location passed to ``load_results`` to read predictions.
        suite_test: Test split of the suite; must support
            ``select(indices=...)`` and column access by name.
        path: Location passed to ``suite_hits_df``.

    Returns:
        A tuple ``(sample_diffs, preds_df, suite_test)``:
        ``sample_diffs`` has one row per retained sample, one column per
        compared run and a "functionality" column; ``preds_df`` has one row
        per result key and one column per retained sample; ``suite_test`` is
        the suite restricted to the retained samples.
    """
    preds = load_results(result_path)
    setting_tags = ("seen", "baseline", "funcOut", "classOut")
    preds = {k: np.array(v) for k, v in preds.items() if any(tag in k for tag in setting_tags)}
    preds_df = pd.DataFrame.from_dict(preds, orient="index")

    # Forward the requested task; it was previously hard-coded to "pi", so a
    # call for another task still asked for "pi".
    df, test_idxs = suite_hits_df(task, path, suite_test)

    # Columns without any missing value. The last three entries are skipped
    # below (presumably the model/method/score columns -- TODO confirm); the
    # remaining labels are used as sample positions.
    keep_items = df.columns[~df.isna().any()].tolist()
    # `axis` must be passed by keyword: positional use was removed in pandas 2.0.
    df = df.dropna(axis=1)

    # First index stored for each entry of test_idxs.
    indices = [x[0] for x in list(test_idxs.values())]
    suite_test = suite_test.select(indices=indices)
    preds_df = preds_df[indices]

    kept_samples = keep_items[:-3]
    suite_test = suite_test.select(indices=kept_samples)
    preds_df = preds_df.T.iloc[kept_samples].T

    # Transpose so that each row is a sample and each column a compared run.
    sample_diffs = get_diffs(df).T.reset_index(drop=True)
    sample_diffs["functionality"] = suite_test["functionality"]
    return sample_diffs, preds_df, suite_test

In [None]:
# Collects the summary frame of mean differences produced by each task section.
all_dfs = []

In [None]:
# When True, every task section keeps only the result keys whose first
# "_"-separated token is "chatGPT" and that do not contain "rules".
just_chatGPT = False

### SA

In [None]:
result_path = Path("./results/sa/suite/")

In [None]:
results = load_results(result_path, file_type="csv")

In [None]:
results = {k: v for k, v in results.items() if "seen" in k or "baseline" in k or "funcOut" in k or "classOut" in k}

In [None]:
if just_chatGPT:
    results = {k: v for k, v in results.items() if k.split("_")[0] == "chatGPT" and "rules" not in k}

In [None]:
diffs_df = get_all_diffs(results)

In [None]:
all_dfs.append(diffs_df)

### PI

In [None]:
result_path = Path("./results/pi/suite/")

In [None]:
results = load_results(result_path, file_type="csv")

In [None]:
results = {k: v for k, v in results.items() if "seen" in k or "baseline" in k or "funcOut" in k or "classOut" in k}

In [None]:
if just_chatGPT:
    results = {k: v for k, v in results.items() if k.split("_")[0] == "chatGPT" and "rules" not in k}

In [None]:
diffs_df, scores_df = get_all_diffs(results, return_df=True)

In [None]:
scores_df[scores_df.index.str.startswith("chat")]["Irrelevant preamble with different examples."]

In [None]:
all_dfs.append(diffs_df)

### RC

In [None]:
result_path = Path("./results/rc/suite/")

In [None]:
results = load_results(result_path, file_type="csv")

In [None]:
results = {k: v for k, v in results.items() if "seen" in k or "baseline" in k or "funcOut" in k or "classOut" in k}

In [None]:
if just_chatGPT:
    results = {k: v for k, v in results.items() if k.split("_")[0] == "chatGPT" and "rules" not in k}

In [None]:
diffs_df = get_all_diffs(results)

In [None]:
all_dfs.append(diffs_df)

### HSD

In [None]:
result_path = Path("./results/hsd/suite/")

In [None]:
results = load_results(result_path, hatecheck=True)

In [None]:
results = {k: v for k, v in results.items() if "seen" in k or "baseline" in k or "funcOut" in k or "classOut" in k}

In [None]:
if just_chatGPT:
    results = {k: v for k, v in results.items() if k.split("_")[0] == "chatGPT" and "rules" not in k}

In [None]:
diffs_df = get_all_diffs(results)

In [None]:
all_dfs.append(diffs_df)

### Aggregate

In [None]:
# Column-wise mean of each summary frame, in the order the sections appended them.
for df in all_dfs:
    print(df.mean())

In [None]:
# Stack the per-task summaries into a single frame.
full_df = pd.concat(all_dfs)

In [None]:
# Drop the rows labelled "avg" so they are not counted in the statistics below.
full_df = full_df[~full_df.index.isin(["avg"])]

In [None]:
# Mean of every difference column over the remaining rows.
full_df.mean()

In [None]:
full_df.head()

In [None]:
# Rows ranked by the "s-b" column.
full_df.sort_values("s-b")

In [None]:
# Rows ranked by the "f-b" column.
full_df.sort_values("f-b")

In [None]:
# Rows ranked by the "c-b" column.
full_df.sort_values("c-b")

In [None]:
path = Path("results/pi/suite/")

In [None]:
suite_test = get_suite(config.pi_path)["test"]

In [None]:
result_path = Path("./results/pi/suite/")

In [None]:
sample_diffs_pi, preds_df_pi, suite_test_pi = get_sample_diffs("pi", result_path, suite_test, path)

In [None]:
sample_diffs_pi[sample_diffs_pi.functionality=="Simple coref: he and she"].mean(1).sort_values()

In [None]:
suite_test_pi[8882]

In [None]:
preds_df_pi.T.iloc[8882]["chatGPT_baseline_zero"]

In [None]:
preds_df_pi.T.iloc[8882]["chatGPT_seen_example_with_rules"]

In [None]:
sample_diffs_pi[sample_diffs_pi.functionality=="Irrelevant preamble with different examples."].mean(1).sort_values()

In [None]:
suite_test_pi[1428]

In [None]:
preds_df_pi.T.iloc[1428]["zephyr-7b-beta_baseline_example"]

In [None]:
preds_df_pi.T.iloc[1428]["zephyr-7b-beta_seen_example_with_rules"]

In [None]:
full_df.sort_values("s-f")

In [None]:
path = Path("results/sa/suite/")

In [None]:
suite_test = get_suite(config.sa_path)["test"]

In [None]:
result_path = Path("./results/sa/suite/")

In [None]:
sample_diffs_sa, preds_df_sa, suite_test_sa = get_sample_diffs("sa", result_path, suite_test, path)

In [None]:
sample_diffs_pi[sample_diffs_pi.functionality=="What are things a {noun} should worry about != should not worry about."].mean(1).sort_values()

In [None]:
suite_test_pi[7744]

In [None]:
preds_df_pi.T.iloc[7744]["chatGPT_seen_with_rules"]

In [None]:
preds_df_pi.T.iloc[7744]["chatGPT_funcOut_with_rules"]

In [None]:
sample_diffs_sa[sample_diffs_sa.functionality=="single positive words"].mean(1).sort_values()

In [None]:
suite_test_sa[2]

In [None]:
preds_df_sa.T.iloc[2]["chatGPT_seen_with_rules"]

In [None]:
preds_df_sa.T.iloc[2]["chatGPT_funcOut_with_rules"]

In [None]:
full_df.sort_values("s-c")

In [None]:
sample_diffs_sa[sample_diffs_sa.functionality=="protected: sexual"].mean(1).sort_values()

In [None]:
suite_test_sa[8561]

In [None]:
preds_df_sa.T.iloc[8561]["chatGPT_seen_example_with_rules"]

In [None]:
preds_df_sa.T.iloc[8561]["chatGPT_classOut_example_with_rules"]

In [None]:
sample_diffs_sa[sample_diffs_sa.functionality=="Q & A: yes (neutral)"].mean(1).sort_values()

In [None]:
suite_test_sa[19056]

In [None]:
preds_df_sa.T.iloc[19056]["chatGPT_seen_example_with_rules"]

In [None]:
preds_df_sa.T.iloc[19056]["chatGPT_classOut_example_with_rules"]

In [None]:
full_df.sort_values("f-c")

In [None]:
sample_diffs_sa[sample_diffs_sa.functionality=="neutral words in context"].mean(1).sort_values()

In [None]:
suite_test_sa[2613]

In [None]:
preds_df_sa.T.iloc[2613]["chatGPT_funcOut_with_rules"]

In [None]:
preds_df_sa.T.iloc[2613]["chatGPT_classOut_with_rules"]

In [None]:
sample_diffs_sa[sample_diffs_sa.functionality=="Q & A: yes (neutral)"].mean(1).sort_values()

In [None]:
suite_test_sa[18786]

In [None]:
preds_df_sa.T.iloc[19047]["chatGPT_funcOut_example_with_rules"]

In [None]:
preds_df_sa.T.iloc[19047]["chatGPT_classOut_example_with_rules"]

In [None]:
full_df.corr(method="kendall")