In [None]:
%load_ext autoreload
%autoreload 2

from datasets import load_dataset, load_metric
from data_sets.data_utils import load_hsd_dataset,  get_suite
from utils.results import load_results, get_dataset_scores, get_significance, get_suite_preds, load_hits, metric_max_over_ground_truths, exact_match_score
import json
import pandas as pd
from pathlib import Path
import json
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import hmean
import pickle
import collections
import config
import seaborn as sns

In [None]:
# Display order of the model labels used when sorting tables and plots.
model_order = ["small", "base", "large", "xl", "xxl", "beta", "chatGPT"]

# Base prompt variants; each one is combined with every suffix in `add_order`.
score_order = ["baseline", "Task", "Task+Rules", "Task+Spec"]

# Suffixes appended to a base variant ("" leaves the variant unchanged).
add_order = ["", "+Ex", "(chatGPT)+Ex", "+Rat", "+Ex+Rat"]

# Every variant/suffix combination: variants vary slowest, suffixes fastest.
method_order = [score + add for score in score_order for add in add_order]

# Rank of every model and method label. Both lists are enumerated together,
# so method ranks start right after the last model rank.
order = {x: i for i, x in enumerate(model_order + method_order)}

In [None]:
order

In [None]:
def plot_dataset_results(df, suite=False, values="accuracy"):
    """Plot one metric as grouped bars: one group per model, one bar per method.

    Every index entry of ``df`` is split on ``"_"``. The first token is the
    model name, of which only the last ``"-"``-separated part is kept. The
    second token selects the base label: ``"Task"`` when it contains
    ``"baseline"``, ``"Task+Rules"`` otherwise. The suffixes ``"(chatGPT)"``,
    ``"+Ex"`` and ``"+Rat"`` are appended when the full index entry contains
    ``"from_chatGPT"``, ``"example"`` and ``"rules"`` respectively.

    Args:
        df: Scores with one row per run and a column named ``values``. The
            frame is never modified; labels are added to a copy.
        suite: When true, only rows whose index contains ``"seen"`` or
            ``"baseline"`` are labelled and plotted.
        values: Name of the column to plot.

    Returns:
        ``(fig, labelled_df)``: the matplotlib figure holding the bar chart
        and the (optionally filtered) copy of ``df`` extended with the
        ``model`` and ``method`` columns.

    Models and methods are ordered through the module-level ``order`` mapping.
    """
    # The y-range is derived from every row passed in, i.e. before the
    # optional suite filter below is applied.
    ylim = (.9 * df[values].min(), 1.1 * df[values].max())
    if suite:
        df = df[['seen' in s or "baseline" in s for s in df.index]]
    # Always work on a copy: previously the caller's frame gained the
    # model/method columns whenever suite was False.
    df = df.copy()

    models, methods = [], []
    for run in df.index:
        tokens = run.split("_")
        label = "Task" if "baseline" in tokens[1] else "Task+Rules"
        if "from_chatGPT" in run:
            label += "(chatGPT)"
        if "example" in run:
            label += "+Ex"
        if "rules" in run:
            label += "+Rat"
        models.append(tokens[0].split("-")[-1])
        methods.append(label)
    df['model'] = models
    df['method'] = methods

    n_method = len(np.unique(df.method))
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8,5))
    return_df = df.copy()
    # Wide layout: one row per model, one column per method.
    wide = df.pivot(index=["model"], columns="method", values=values)
    wide = wide.sort_values(by="model", key=lambda x: x.map(order))
    wide = wide[sorted(wide.columns, key=lambda item: order[item])]
    wide.plot.bar(ax=ax, ylim=ylim)
    # Single-row legend centred just above the axes.
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, +1.05),
          fancybox=True, shadow=True, ncol=n_method)
    return fig, return_df

In [None]:
def process_suite_df(df, agg=True):
    """Label suite results in place and optionally pivot them by score type.

    Three columns are written onto ``df`` itself, all derived from its index
    strings (split on ``"_"``):

    * ``model``  - last ``"-"``-separated part of the first token,
    * ``method`` - ``"Task"`` when the second token contains ``"baseline"``,
      else ``"Task+Rules"``, followed by ``"(chatGPT)"``, ``"+Ex"`` and
      ``"+Rat"`` when the index entry contains ``"from_chatGPT"``,
      ``"example"`` and ``"rules"`` respectively,
    * ``score``  - the second token unchanged.

    Args:
        df: Suite results; must provide an ``avg`` column when ``agg`` is true.
        agg: When false, the labelled ``df`` is returned as is.

    Returns:
        The labelled ``df`` itself, or (``agg=True``) a new frame indexed by
        ``(model, method)`` with one column per ``score`` value holding
        ``avg``, its rows ordered through the module-level ``order`` mapping.
    """
    models, methods, scores = [], [], []
    for run in df.index:
        tokens = run.split("_")
        setting = tokens[1]
        label = "Task" if "baseline" in setting else "Task+Rules"
        if "from_chatGPT" in run:
            label += "(chatGPT)"
        if "example" in run:
            label += "+Ex"
        if "rules" in run:
            label += "+Rat"
        models.append(tokens[0].split("-")[-1])
        methods.append(label)
        scores.append(setting)

    # Written onto the caller's frame on purpose: later cells read these columns.
    df['model'] = models
    df['method'] = methods
    df["score"] = scores

    if not agg:
        return df
    pivoted = df.set_index(["model", "method"]).pivot(columns="score", values="avg")
    return pivoted.sort_index(key=lambda level: level.map(order))

In [None]:
def compute_gs(dataset_df, suite_df, dataset_metric, scale=100):
    """Combine dataset and suite scores through their harmonic mean.

    Args:
        dataset_df: Frame with ``model`` and ``method`` columns plus the
            ``dataset_metric`` column.
        suite_df: Raw suite results; labelled and pivoted with
            ``process_suite_df`` (which also adds columns to ``suite_df``).
        dataset_metric: Name of the dataset score column.
        scale: Factor applied to the dataset score before averaging.

    Returns:
        A frame indexed by ``(model, method)`` with the columns ``seen``,
        ``funcOut`` and ``classOut``. For the ``"Task"`` and ``"Task+Ex"``
        rows all three columns hold the harmonic mean taken against the
        suite's ``baseline`` column. Rows are ordered through the
        module-level ``order`` mapping.
    """
    by_rank = lambda labels: labels.map(order)

    dataset_scores = (
        dataset_df
        .sort_values(by=["model", "method"], key=by_rank)
        .set_index(["model", "method"])
    )
    merged = pd.concat([dataset_scores, process_suite_df(suite_df)], axis=1)
    scaled_metric = merged[dataset_metric] * scale

    gs = pd.DataFrame()
    gs.index = merged.index
    # Harmonic mean of the scaled dataset score with each suite column.
    for column in ("baseline", "seen", "funcOut", "classOut"):
        gs[column] = hmean([scaled_metric, merged[column]])

    # The rule-free methods reuse their baseline value in every suite column.
    for method in ("Task", "Task+Ex"):
        rows = (slice(None), method)
        for column in ("seen", "funcOut", "classOut"):
            gs.loc[rows, column] = gs.loc[rows, "baseline"]

    gs = gs.drop(columns="baseline")
    return gs.sort_values(by=["model", "method"], key=by_rank)

In [None]:
def compare_func_scores(df):
    """Count per row how many numeric columns improve, tie or drop vs. a baseline.

    For every (rule method, baseline method) pair and every score type, the
    numeric columns of the rows matching that method and score are compared
    position by position with the numeric columns of the baseline method's
    rows. Each result column holds, per row, the number of numeric columns
    that are higher (``improv_``), equal (``same_``) or lower (``worse_``).

    Args:
        df: Frame with ``method`` and ``score`` columns plus numeric score
            columns. Each selection must contain seven rows, one per label
            assigned to the result index.

    Returns:
        A frame indexed by the seven model labels with one
        ``<kind>_<method>_<score>`` column per comparison.
    """
    comparisons = (
        ("Task+Rules", "Task"),
        ("Task+Rules+Ex", "Task+Ex"),
        ("Task+Rules(chatGPT)+Ex", "Task+Ex"),
        ("Task+Rules+Rat", "Task"),
        ("Task+Rules+Ex+Rat", "Task+Ex"),
    )
    numeric = df.select_dtypes(include=np.number)

    counts = {}
    for method, baseline in comparisons:
        # Positional comparison: the baseline rows are taken as a bare array.
        baseline_values = numeric[df.method == baseline].values
        for score in ("seen", "funcOut", "classOut"):
            selected = numeric[(df.method == method) & (df.score == score)]
            delta = selected - baseline_values
            counts[f"improv_{method}_{score}"] = (delta > 0).sum(1).to_numpy()
            counts[f"same_{method}_{score}"] = (delta == 0).sum(1).to_numpy()
            counts[f"worse_{method}_{score}"] = (delta < 0).sum(1).to_numpy()

    result = pd.DataFrame.from_dict(counts)
    result.index = ["small", "base", "large", "xl", "xxl", "beta", "chatGPT"]
    return result

In [None]:
all_results = {}

In [None]:
suite_results = {}

## Sentiment Analysis

### Dataset

In [None]:
result_path = Path(f"./results/sa/sst2/")

In [None]:
results = load_results(result_path)

In [None]:
results = {k: v for k, v in results.items() if "seen" in k or "baseline" in k}

In [None]:
dataset_test = load_dataset("glue", "sst2")["validation"]

In [None]:
metric = load_metric("glue","sst2")

In [None]:
dataset_scores, preds = get_dataset_scores("sa", results, dataset_test["label"], metric)

In [None]:
sort = sorted(dataset_scores.items(), key=lambda item: item[1]["accuracy"], reverse=True)

In [None]:
# Print the scores of every run whose name contains "zephyr", plus the
# "chatGPT" runs whose name does not contain "flan".
for m, s in sort:
    if "zephyr" in m or ("chatGPT" in m and "flan" not in m):
        print(m, s)

In [None]:
sort

In [None]:
df = pd.DataFrame.from_dict(dataset_scores, orient="index")

In [None]:
fig, sst_df = plot_dataset_results(df)

In [None]:
fig.savefig(f"../specification-instruction-paper/media/sa_dataset.pdf", bbox_inches = "tight")

### Suite

In [None]:
result_path = Path("./results/sa/suite/")

In [None]:
results = load_results(result_path, file_type="csv")

In [None]:
results = {k: v for k, v in results.items() if "seen" in k or "baseline" in k or "funcOut" in k or "classOut" in k}

In [None]:
df =pd.concat([x for x in results.values()])

In [None]:
df[df.index.str.contains("seen")]["Q & A: yes (neutral)"].sort_values()

In [None]:
df[df.index.str.contains("classOut")]["Q & A: yes (neutral)"].sort_values()

In [None]:
df["avg"].sort_values()

In [None]:
suite_results["sa"] = process_suite_df(df)

In [None]:
df[(df.model == "beta") & ((df.score=="seen") | (df.score=="baseline"))].T

In [None]:
df = df.sort_values(by=["model", "method"], key=lambda x: x.map(order))
comps = compare_func_scores(df)

In [None]:
comps.T

In [None]:
df.select_dtypes(include=np.number)[(df.method=="Task+Rules") & (df.score== "seen")] - df.select_dtypes(include=np.number)[(df.method=="Task")].values

In [None]:
fig, sa_df = plot_dataset_results(df, suite=True, values="avg")

In [None]:
fig.savefig(f"../specification-instruction-paper/media/sa_suite.pdf", bbox_inches = "tight")

In [None]:
all_results["sa"] = compute_gs(sst_df, df, "accuracy")

In [None]:
all_results["sa"]

In [None]:
# pvalues = get_pvalues_suite("sa", models=df.index.str.split("_").str[0].unique())

In [None]:
# df = pd.DataFrame.from_dict(pvalues, orient="index")
# df < .05

## Hate Speech detection

### Datasets

In [None]:
davidson_path = Path(f"./results/hsd/davidson2017/")
founta_path = Path(f"./results/hsd/founta2018/")

In [None]:
davidson_results = load_results(davidson_path)
founta_results = load_results(founta_path)

In [None]:
davidson_test = load_hsd_dataset("davidson2017")["test"]
founta_test = load_hsd_dataset("founta2018")["test"]

In [None]:
metric = load_metric("glue","qqp")

In [None]:
dataset_scores, preds = get_dataset_scores("hsd", davidson_results, davidson_test["label"], metric)

In [None]:
dataset_scores = {k: v for k, v in dataset_scores.items() if "seen" in k or "baseline" in k}

In [None]:
sort = sorted(dataset_scores.items(), key=lambda item: item[1]["f1"], reverse=True)

In [None]:
# Print the scores of every run whose name contains "zephyr", plus the
# "chatGPT" runs whose name does not contain "flan".
for m, s in sort:
    if "zephyr" in m or ("chatGPT" in m and "flan" not in m):
        print(m, s)

In [None]:
sort

In [None]:
df = pd.DataFrame.from_dict(dataset_scores, orient="index")

In [None]:
fig, davidson_df = plot_dataset_results(df, values="f1")

In [None]:
fig.savefig(f"../specification-instruction-paper/media/hsd_davidson.pdf", bbox_inches = "tight")

In [None]:
# pvalues = get_pvalues_dataset(preds, "hsd", "davidson2017", davidson_test)

In [None]:
# df = pd.DataFrame.from_dict(pvalues, orient="index")
# df < .05

In [None]:
dataset_scores, preds = get_dataset_scores("hsd", founta_results, founta_test["label"], metric)

In [None]:
dataset_scores = {k: v for k, v in dataset_scores.items() if "seen" in k or "baseline" in k}

In [None]:
sort = sorted(dataset_scores.items(), key=lambda item: item[1]["f1"], reverse=True)

In [None]:
# Print the scores of every run whose name contains "zephyr", plus the
# "chatGPT" runs whose name does not contain "flan".
for m, s in sort:
    if "zephyr" in m or ("chatGPT" in m and "flan" not in m):
        print(m, s)

In [None]:
sort

In [None]:
df = pd.DataFrame.from_dict(dataset_scores, orient="index")

In [None]:
fig, founta_df = plot_dataset_results(df, values="f1")

In [None]:
fig.savefig(f"../specification-instruction-paper/media/hsd_founta.pdf", bbox_inches = "tight")

In [None]:
# pvalues = get_pvalues_dataset(preds, "hsd", "founta2018", founta_test)

In [None]:
# df = pd.DataFrame.from_dict(pvalues, orient="index")
# df < .05

### Suite

In [None]:
result_path = Path("./results/hsd/suite/")

In [None]:
results = load_results(result_path, hatecheck=True)

In [None]:
results = {k: v for k, v in results.items() if "seen" in k or "baseline" in k or "funcOut" in k or "classOut" in k}

In [None]:
df =pd.concat([x for x in results.values()])

In [None]:
df *= 100

In [None]:
df["avg"].sort_values()

In [None]:
df[df.index.str.contains("flan-t5-xxl")]["avg"].sort_values()

In [None]:
suite_results["hsd"] = process_suite_df(df)

In [None]:
df[df.index.str.contains("zephyr")]["avg"].sort_values()

In [None]:
fig, hsd_df = plot_dataset_results(df, suite=True, values="avg")

In [None]:
fig.savefig(f"../specification-instruction-paper/media/hsd_suite.pdf", bbox_inches = "tight")

In [None]:
all_results["hsd_d"] = compute_gs(davidson_df, df, dataset_metric="f1"); all_results["hsd_d"]

In [None]:
all_results["hsd_f"] = compute_gs(founta_df, df, dataset_metric="f1"); all_results["hsd_f"]

In [None]:
# pvalues = get_pvalues_suite("hsd", models=df.index.str.split("_").str[0].unique())

In [None]:
# df = pd.DataFrame.from_dict(pvalues, orient="index")
# df < .05

## Reading comprehension

### Dataset

In [None]:
result_path = Path(f"./results/rc/squad/")

In [None]:
results = load_results(result_path)

In [None]:
results = {k: v for k, v in results.items() if "seen" in k or "baseline" in k}

In [None]:
dataset_test = load_dataset("squad")["validation"]

In [None]:
metric = load_metric("squad")

In [None]:
dataset_scores, preds = get_dataset_scores("rc", results, dataset_test["answers"], metric)

In [None]:
sort = sorted(dataset_scores.items(), key=lambda item: item[1]["exact_match"], reverse=True); sort

In [None]:
# Print the scores of every run whose name contains "zephyr", plus the
# "chatGPT" runs whose name does not contain "flan".
for m, s in sort:
    if "zephyr" in m or ("chatGPT" in m and "flan" not in m):
        print(m, s)

In [None]:
# from utils.results import normalize_answer

# hits = {}
# for model, preds in results.items():
#     for answers, pred in zip(dataset_test["answers"], preds):
#         hit = 0
#         for answer in answers["text"]:
#             if normalize_answer(answer) in normalize_answer(pred):
#                 hit = 1
#                 break
#         hits.setdefault(model, []).append(hit)
    

In [None]:
# for model, hit in hits.items():
#     hits[model] = {}
#     hits[model]["recall"] = np.mean(hit)

In [None]:
# sorted(hits.items(), key=lambda item: item[1]["recall"], reverse=True)

In [None]:
df = pd.DataFrame.from_dict(dataset_scores, orient="index")

In [None]:
fig, squad_df = plot_dataset_results(df, values="exact_match")

In [None]:
fig.savefig(f"../specification-instruction-paper/media/rc_dataset.pdf", bbox_inches = "tight")

In [None]:
# pvalues = get_pvalues_dataset(preds, "rc", "squad", dataset_test)

In [None]:
# df = pd.DataFrame.from_dict(pvalues, orient="index")
# df < .05

### Suite

In [None]:
result_path = Path("./results/rc/suite/")

In [None]:
results = load_results(result_path, file_type="csv")

In [None]:
results = {k: v for k, v in results.items() if "seen" in k or "baseline" in k or "funcOut" in k or "classOut" in k}

In [None]:
# from utils.results import normalize_answer
# from collections import Counter

# def get_recall(prediction, ground_truth):
#     prediction_tokens = normalize_answer(prediction).split()
#     ground_truth_tokens = normalize_answer(ground_truth).split()
#     common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
#     num_same = sum(common.values())
#     if num_same == 0:
#         return 0
#     precision = 1.0 * num_same / len(prediction_tokens)
#     recall = 1.0 * num_same / len(ground_truth_tokens)
#     f1 = (2 * precision * recall) / (precision + recall)
#     return recall

In [None]:
# from tqdm.auto import tqdm
# from data_sets.data_utils import get_suite
# import config
# from utils.results import f1_score

# suite_test = get_suite(config.rc_path)["test"]
# f1s = {}
# results_raw = load_results(result_path, file_type="json")
# results_raw = {k: v for k, v in results_raw.items() if "seen" in k or "baseline_zero" in k or "funcOut" in k or "classOut" in k}
# results_raw = get_suite_preds(results_raw, "rc")

In [None]:
# for model, preds in tqdm(results_raw.items()):
#     for example, pred in zip(suite_test, preds):
#         if example["answers"]["answer_start"][0] == -1:
#             continue
#         func = example["functionality"]
#         answer = example["answers"]["text"][0]
#         f1 = get_recall(pred, answer)
#         f1s.setdefault(model, {}).setdefault(func, []).append(f1)

In [None]:
# for model, scores in f1s.items():
#     for func, score in scores.items():
#         f1s[model][func] = np.mean(score) * 100

In [None]:
# for model, scores in f1s.items():
#     for func, score in scores.items():
#         results[model][func] = score 

In [None]:
df =pd.concat([x for x in results.values()])

In [None]:
# del df["avg"]
# df["avg"] = df.mean(axis=1)

In [None]:
df["avg"].sort_values()

In [None]:
suite_results["rc"] = process_suite_df(df)

In [None]:
fig, rc_df = plot_dataset_results(df, suite=True, values="avg")

In [None]:
fig.savefig(f"../specification-instruction-paper/media/rc_suite.pdf", bbox_inches = "tight")

In [None]:
all_results["rc"] = compute_gs(squad_df, df, dataset_metric="exact_match", scale=1.0); all_results["rc"]

In [None]:
# pvalues = get_pvalues_suite("rc", models=df.index.str.split("_").str[0].unique())

In [None]:
# df = pd.DataFrame.from_dict(pvalues, orient="index")
# df < .05

## Paraphrase identification

### Dataset

In [None]:
result_path = Path(f"./results/pi/qqp/")

In [None]:
results = load_results(result_path)

In [None]:
results = {k: v for k, v in results.items() if "seen" in k or "baseline" in k}

In [None]:
dataset_test = load_dataset("glue", "qqp")["validation"]

In [None]:
metric = load_metric("glue", "qqp")

In [None]:
dataset_scores, preds = get_dataset_scores("pi", results, dataset_test["label"], metric)

In [None]:
sort = sorted(dataset_scores.items(), key=lambda item: item[1]["accuracy"], reverse=True)

In [None]:
sort

In [None]:
# Print the scores of every run whose name contains "zephyr", plus the
# "chatGPT" runs whose name does not contain "flan".
for m, s in sort:
    if "zephyr" in m or ("chatGPT" in m and "flan" not in m):
        print(m, s)

In [None]:
df = pd.DataFrame.from_dict(dataset_scores, orient="index")

In [None]:
fig, qqp_df = plot_dataset_results(df)

In [None]:
fig.savefig(f"../specification-instruction-paper/media/pi_dataset.pdf", bbox_inches = "tight")

In [None]:
# pvalues = get_pvalues_dataset(preds, "pi", "qqp", dataset_test)

In [None]:
# df = pd.DataFrame.from_dict(pvalues, orient="index")
# df < .05

### Suite

In [None]:
result_path = Path("./results/pi/suite/")

In [None]:
results = load_results(result_path, file_type="csv")

In [None]:
results = {k: v for k, v in results.items() if "seen" in k or "baseline" in k or "funcOut" in k or "classOut" in k}

In [None]:
df =pd.concat([x for x in results.values()])

In [None]:
df[df.index.str.contains("baseline_zero")]["Simple coref: he and she"]

In [None]:
df[df.index.str.contains("seen")]["Simple coref: he and she"].sort_values()

In [None]:
suite_results["pi"] = process_suite_df(df)

In [None]:
fig, pi_df = plot_dataset_results(df, suite=True, values="avg")

In [None]:
fig.savefig(f"../specification-instruction-paper/media/pi_suite.pdf", bbox_inches = "tight")

In [None]:
compute_gs(qqp_df, df, dataset_metric="accuracy")

In [None]:
all_results["pi"] = compute_gs(qqp_df, df, dataset_metric="accuracy"); all_results["pi"]

In [None]:
# pvalues = get_pvalues_suite("pi", models=df.index.str.split("_").str[0].unique())

In [None]:
# df = pd.DataFrame.from_dict(pvalues, orient="index")
# df < .05

## Table

In [None]:
import collections

order_task = {"sa": 0, "pi": 1, "rc": 2, "hsd_d": 3, "hsd_f":4}

In [None]:
od = collections.OrderedDict(sorted(all_results.items(), key= lambda item: order_task[item[0]]))

In [None]:
# Tag every column with its task so the per-task frames can be concatenated
# side by side. The frames stored in `all_results` are renamed in place, so the
# suffix is only added when it is missing; otherwise re-running this cell
# would produce names such as "seen_sa_sa".
for task, df in od.items():
    suffix = f"_{task}"
    df.columns = [col if str(col).endswith(suffix) else f"{col}{suffix}" for col in df.columns]
    

In [None]:
all_gs = pd.concat([df for df in od.values()], names=all_results.keys(), axis=1)

In [None]:
all_gs = all_gs.sort_values(by=["model", "method"], key=lambda x: x.map(order))

In [None]:
all_gs

In [None]:
all_gs["avg"] = all_gs.mean(1)

In [None]:
all_gs["avg"].sort_values()

In [None]:
all_gs["avg"]

In [None]:
all_gs.to_csv("./results/all_gs.csv")

In [None]:
all_gs = pd.read_csv("./results/all_gs.csv", index_col=[0,1])

In [None]:
all_gs

In [None]:
sa_df.to_csv("./results/sa_df.csv")
pi_df.to_csv("./results/pi_df.csv")
rc_df.to_csv("./results/rc_df.csv")
hsd_df.to_csv("./results/hsd_df.csv")
sst_df.to_csv("./results/sst_df.csv")
qqp_df.to_csv("./results/qqp_df.csv")
squad_df.to_csv("./results/squad_df.csv")
davidson_df.to_csv("./results/davidson_df.csv")
founta_df.to_csv("./results/founta_df.csv")

In [None]:
sa_df = pd.read_csv("./results/sa_df.csv", index_col=0)
pi_df = pd.read_csv("./results/pi_df.csv", index_col=0)
rc_df = pd.read_csv("./results/rc_df.csv", index_col=0)
hsd_df = pd.read_csv("./results/hsd_df.csv", index_col=0)
sst_df = pd.read_csv("./results/sst_df.csv", index_col=0)
qqp_df = pd.read_csv("./results/qqp_df.csv", index_col=0)
squad_df = pd.read_csv("./results/squad_df.csv", index_col=0)
davidson_df = pd.read_csv("./results/davidson_df.csv", index_col=0)
founta_df = pd.read_csv("./results/founta_df.csv", index_col=0)

In [None]:
with open("./results/all_suite_avgs.pkl", "wb") as file:
    pickle.dump(suite_results, file)

In [None]:
with open("./results/all_suite_avgs.pkl", "rb") as file:
    suite_results = pickle.load(file)

In [None]:
sa_df.set_index(["model", "method"]).sort_index(key= lambda x: x.map(order)).filter(regex="neutral words in context")

In [None]:
sa_df.set_index(["model", "method"]).sort_index(key= lambda x: x.map(order)).filter(regex="Hard: Negation of negative")

In [None]:
pi_df.set_index(["model", "method"]).sort_index(key= lambda x: x.map(order)).filter(regex="preamble|Preamble")

In [None]:
pi_df.set_index(["model", "method"]).sort_index(key= lambda x: x.map(order)).filter(regex="coref")

In [None]:
rc_df.set_index(["model", "method"]).sort_index(key= lambda x: x.map(order)).filter(regex="before")

In [None]:
hsd_df.set_index(["model", "method"]).sort_index(key= lambda x: x.map(order)).filter(regex="threat")

In [None]:
hsd_df.set_index(["model", "method"]).sort_index(key= lambda x: x.map(order)).filter(regex="arget_indiv_nh")

In [None]:
hsd_df.set_index(["model", "method"]).sort_index(key= lambda x: x.map(order)).filter(regex="counter_ref")

In [None]:
rc_df.set_index(["model", "method"]).sort_index(key= lambda x: x.map(order)).filter(regex="Under")

### Some statistics

In [None]:
# Rule improvement (no demonstrations)
for v in all_gs["avg"].xs("Task+Rules", level=1) - all_gs["avg"].xs("Task", level=1):
    print(np.round(v, 2))

In [None]:
# Rule improvement (with demonstrations)
for v in all_gs["avg"].xs("Task+Rules+Ex", level=1) - all_gs["avg"].xs("Task+Ex", level=1):
    print(np.round(v, 2))

In [None]:
# impact of demonstrations
for v in all_gs["avg"].xs("Task+Ex", level=1) - all_gs["avg"].xs("Task", level=1):
    print(np.round(v, 2))

In [None]:
# impact of demonstrations
for v in all_gs["avg"].xs("Task+Rules+Ex", level=1) - all_gs["avg"].xs("Task+Rules", level=1):
    print(np.round(v, 2))

In [None]:
# impact of demonstrations
for v in all_gs["avg"].xs("Task+Rules+Ex+Rat", level=1) - all_gs["avg"].xs("Task+Rules+Rat", level=1):
    print(np.round(v, 2))

In [None]:
# impact of demonstrations
(all_gs.xs("Task+Ex", level=1) - all_gs.xs("Task", level=1)).mean()

In [None]:
(all_gs.xs("Task+Rules+Ex", level=1) - all_gs.xs("Task+Rules", level=1)).mean()

In [None]:
(all_gs.xs("Task+Rules+Ex+Rat", level=1) - all_gs.xs("Task+Rules+Rat", level=1)).mean()

In [None]:
# impact of rationales
for v in all_gs["avg"].xs("Task+Rules+Rat", level=1) - all_gs["avg"].xs("Task+Rules", level=1):
    print(np.round(v, 2))

In [None]:
# impact of Rationales
for v in all_gs["avg"].xs("Task+Rules+Ex+Rat", level=1) - all_gs["avg"].xs("Task+Rules+Ex", level=1):
    print(np.round(v, 2))

In [None]:
# impact of rationales
(all_gs.xs("Task+Rules+Rat", level=1) - all_gs.xs("Task+Rules", level=1)).mean()

In [None]:
(all_gs.xs("Task+Rules+Ex+Rat", level=1) - all_gs.xs("Task+Rules+Ex", level=1)).mean()

In [None]:
sa_df.select_dtypes(include=np.number).loc["flan-t5-large_seen"] - sa_df.select_dtypes(include=np.number).loc["flan-t5-large_baseline_zero"]

In [None]:
dataset_results = pd.concat([sst_df["model"], sst_df["method"], sst_df["accuracy"]*100, qqp_df["accuracy"]*100, squad_df["exact_match"], davidson_df["f1"]*100, founta_df["f1"]*100], axis=1)

In [None]:
dataset_results["avg"] = dataset_results.mean(1, numeric_only=True)

In [None]:
suites_df = pd.concat(suite_results.values(), axis=1)

In [None]:
suites_df["avg"] = suites_df.mean(1, numeric_only=True)

In [None]:
dataset_results.select_dtypes(include=np.number)[dataset_results["method"] == "Task+Rules+Ex"] - dataset_results.select_dtypes(include=np.number)[dataset_results["method"] == "Task+Ex"].values

In [None]:
(dataset_results.select_dtypes(include=np.number)[dataset_results["method"] == "Task+Ex"] - dataset_results.select_dtypes(include=np.number)[dataset_results["method"] == "Task"].values).mean()

In [None]:
(dataset_results.select_dtypes(include=np.number)[dataset_results["method"] == "Task+Rules+Ex"] - dataset_results.select_dtypes(include=np.number)[dataset_results["method"] == "Task+Rules"].values).mean()

In [None]:
(dataset_results.select_dtypes(include=np.number)[dataset_results["method"] == "Task+Rules+Ex+Rat"] - dataset_results.select_dtypes(include=np.number)[dataset_results["method"] == "Task+Rules+Rat"].values).mean()

In [None]:
(dataset_results.select_dtypes(include=np.number)[dataset_results["method"] == "Task+Rules+Rat"] - dataset_results.select_dtypes(include=np.number)[dataset_results["method"] == "Task+Rules"].values).mean()

In [None]:
(dataset_results.select_dtypes(include=np.number)[dataset_results["method"] == "Task+Rules+Ex+Rat"] - dataset_results.select_dtypes(include=np.number)[dataset_results["method"] == "Task+Rules+Ex"].values).mean()

In [None]:
for suite in suite_results.values():
    print(suite.select_dtypes(include=np.number).xs("Task+Rules+Ex", level=1)["seen"] - suite.select_dtypes(include=np.number).xs("Task+Ex", level=1)["baseline"])

In [None]:
suites_df.select_dtypes(include=np.number).xs("Task+Rules+Ex", level=1)["seen"].mean(1) - suites_df.select_dtypes(include=np.number).xs("Task+Ex", level=1)["baseline"].mean(1)

In [None]:
suites_df.select_dtypes(include=np.number).xs("Task+Rules", level=1)["seen"].mean(1) - suites_df.select_dtypes(include=np.number).xs("Task", level=1)["baseline"].mean(1)

In [None]:
for suite in suite_results.values():
    print((suite.select_dtypes(include=np.number).xs("Task+Ex", level=1).mean(1) - suite.select_dtypes(include=np.number).xs("Task", level=1).mean(1)).mean())

In [None]:
for suite in suite_results.values():
    print((suite.select_dtypes(include=np.number).xs("Task+Rules+Ex", level=1).mean(1) - suite.select_dtypes(include=np.number).xs("Task+Rules", level=1).mean(1)).mean())

In [None]:
for suite in suite_results.values():
    print((suite.select_dtypes(include=np.number).xs("Task+Rules+Ex+Rat", level=1).mean(1) - suite.select_dtypes(include=np.number).xs("Task+Rules+Rat", level=1).mean(1)).mean())

In [None]:
for suite in suite_results.values():
    print((suite.select_dtypes(include=np.number).xs("Task+Rules+Rat", level=1).mean(1) - suite.select_dtypes(include=np.number).xs("Task+Rules", level=1).mean(1)).mean())

In [None]:
for suite in suite_results.values():
    print((suite.select_dtypes(include=np.number).xs("Task+Rules+Ex+Rat", level=1).mean(1) - suite.select_dtypes(include=np.number).xs("Task+Rules+Ex", level=1).mean(1)).mean())

In [None]:
all_gs.xs("Task",level=1)

In [None]:
# Methods whose prompt includes rules. The labels must match the ones in the
# `all_gs` index exactly: the ChatGPT-written rules are labelled "(chatGPT)".
ruleMethods = ["Task+Rules", "Task+Rules+Ex", "Task+Rules(chatGPT)+Ex", "Task+Rules+Rat", "Task+Rules+Ex+Rat"]

In [None]:
ruleMethodsDf = all_gs.loc[pd.IndexSlice[:, ruleMethods], :]

In [None]:
# impact on unseen cases
for v in (ruleMethodsDf.iloc[:,range(0, len(all_gs.columns) - 1, 3)] - ruleMethodsDf.iloc[:,range(1, len(all_gs.columns) - 1, 3)].values).mean().values:
    print(np.round(v, 2))

In [None]:
# impact on unseen cases
(ruleMethodsDf.iloc[:,range(1, len(all_gs.columns) - 1, 3)] - ruleMethodsDf.iloc[:,range(0, len(all_gs.columns) - 1, 3)].values).mean().mean()

In [None]:
# impact on unseen cases
for v in (ruleMethodsDf.iloc[:,range(0, len(all_gs.columns) - 1, 3)] - ruleMethodsDf.iloc[:,range(2, len(all_gs.columns) - 1, 3)].values).mean().values:
    print(np.round(v, 2))

In [None]:
# impact on unseen cases
(ruleMethodsDf.iloc[:,range(2, len(all_gs.columns) - 1, 3)] - ruleMethodsDf.iloc[:,range(0, len(all_gs.columns) - 1, 3)].values).mean().mean()

In [None]:
ruleMethodsDf.iloc[:,range(0, len(all_gs.columns) - 1, 3)].mean()

In [None]:
ruleMethodsDf.iloc[:,range(1, len(all_gs.columns) - 1, 3)].mean()

In [None]:
ruleMethodsDf.iloc[:,range(2, len(all_gs.columns) - 1, 3)].mean()

In [None]:
for v in (ruleMethodsDf.iloc[:,range(0, len(all_gs.columns) - 1, 3)] - ruleMethodsDf.iloc[:,range(1, len(all_gs.columns) - 1, 3)].values >=0).mean().values:
    print(np.round(v*100, 2))

In [None]:
for v in (all_gs.iloc[:,range(0, len(all_gs.columns) - 1, 3)] - all_gs.iloc[:,range(2, len(all_gs.columns) - 1, 3)].values >=0).mean():
    print(np.round(v*100, 2))

In [None]:
# Rule improvement (with demonstrations)
for v in all_gs["avg"].xs("Task+Rules+Ex", level=1) - all_gs["avg"].xs("Task+Ex", level=1):
    print(np.round(v, 2))

In [None]:
# ChatGPT rules vs human rules

(all_gs.xs("Task+Rules(chatGPT)+Ex", level=1) - all_gs.xs("Task+Rules+Ex", level=1)).mean(1).round(2)

In [None]:
(all_gs.xs("Task+Rules(chatGPT)+Ex", level=1) - all_gs.xs("Task+Ex", level=1)).mean(1).round(2)

In [None]:
all_gs.xs("chatGPT", level=0).iloc[:,range(0, len(all_gs.columns) - 1, 3)] 

In [None]:
all_gs.xs("chatGPT", level=0).iloc[:,range(0, len(all_gs.columns) - 1, 3)].mean(1) 

In [None]:
all_gs.xs("chatGPT", level=0).iloc[:,range(1, len(all_gs.columns) - 1, 3)].mean(1)

In [None]:
all_gs.xs("chatGPT", level=0).iloc[:,range(2, len(all_gs.columns) - 1, 3)].mean(1)

### Table

In [None]:
with open(f"./results/pvalues.pickle", "rb") as file:
        pvalues = pickle.load(file)

In [None]:
pvalues_df = pd.DataFrame.from_dict(pvalues, orient="index").reset_index()

In [None]:
pvalues_df = pvalues_df.rename(columns  ={"level_0": "model", "level_1":"method"})

In [None]:
pvalues_df = pd.concat([pvalues_df.drop('sa', axis=1), pd.DataFrame(pvalues_df['sa'].tolist()).rename(columns={"seen": "seen_sa",
                                                                                                  "funcOut": "funcOut_sa",
                                                                                                  "classOut": "classOut_sa"})], axis=1)

In [None]:
pvalues_df = pd.concat([pvalues_df.drop('pi', axis=1), pd.DataFrame(pvalues_df['pi'].tolist()).rename(columns={"seen": "seen_pi",
                                                                                                  "funcOut": "funcOut_pi",
                                                                                                  "classOut": "classOut_pi"})], axis=1)

In [None]:
pvalues_df = pd.concat([pvalues_df.drop('rc', axis=1), pd.DataFrame(pvalues_df['rc'].tolist()).rename(columns={"seen": "seen_rc",
                                                                                                  "funcOut": "funcOut_rc",
                                                                                                  "classOut": "classOut_rc"})], axis=1)

In [None]:
pvalues_df = pd.concat([pvalues_df.drop('hsd_f', axis=1), pd.DataFrame(pvalues_df['hsd_f'].tolist()).rename(columns={"seen": "seen_hsd_f",
                                                                                                  "funcOut": "funcOut_hsd_f",
                                                                                                  "classOut": "classOut_hsd_f"})], axis=1)

In [None]:
pvalues_df = pd.concat([pvalues_df.drop('hsd_d', axis=1), pd.DataFrame(pvalues_df['hsd_d'].tolist()).rename(columns={"seen": "seen_hsd_d",
                                                                                                  "funcOut": "funcOut_hsd_d",
                                                                                                  "classOut": "classOut_hsd_d"})], axis=1)

In [None]:
pvalues_df = pd.concat([pvalues_df.drop('avg', axis=1), pvalues_df['avg']], axis=1)

In [None]:
method_map = {
    "": "Task+Rules",
    "example": "Task+Rules+Ex",
    "from_chatGPT_example": "Task+Rules(chatGPT)+Ex",
    "with_rules": "Task+Rules+Rat",
    "example_with_rules": "Task+Rules+Ex+Rat"
}

In [None]:
pvalues_df['model'] = [x.split("-")[-1] for x in pvalues_df.model] 
pvalues_df['method'] = [method_map[x] for x in pvalues_df.method]

In [None]:
pvalues_df = pvalues_df.set_index(["model", "method"])

In [None]:
table = pvalues_df.style

In [None]:
table = table.format(precision=3)

In [None]:
print(table.to_latex(multirow_align="t"))

In [None]:
pvalues_df.loc[("small", "Task"), :] = 16*[1.0]
pvalues_df.loc[("base", "Task"), :] = 16*[1.0]
pvalues_df.loc[("large", "Task"), :] = 16*[1.0]
pvalues_df.loc[("xl", "Task"), :] = 16*[1.0]
pvalues_df.loc[("xxl", "Task"), :] = 16*[1.0]
pvalues_df.loc[("chatGPT", "Task"), :] = 16*[1.0]
pvalues_df.loc[("beta", "Task"), :] = 16*[1.0]

In [None]:
# Give every model a "Task+Ex" row with a p-value of 1.0 in all 16 columns.
# The last line must target ("beta", "Task+Ex"): writing ("beta", "Task") a
# second time would leave beta without a "Task+Ex" row.
pvalues_df.loc[("small", "Task+Ex"), :] = 16*[1.0]
pvalues_df.loc[("base", "Task+Ex"), :] = 16*[1.0]
pvalues_df.loc[("large", "Task+Ex"), :] = 16*[1.0]
pvalues_df.loc[("xl", "Task+Ex"), :] = 16*[1.0]
pvalues_df.loc[("xxl", "Task+Ex"), :] = 16*[1.0]
pvalues_df.loc[("chatGPT", "Task+Ex"), :] = 16*[1.0]
pvalues_df.loc[("beta", "Task+Ex"), :] = 16*[1.0]

In [None]:
pvalues_df = pvalues_df.sort_index(key= lambda x: x.map(order))

In [None]:
significant = pvalues_df < .05

In [None]:
l = all_gs.style

In [None]:
l = l.format(precision=2)

In [None]:
def baseline_index(i):
    """Return the in-group position of the row that position ``i`` is compared to.

    Positions 2 and 5 map to 0; every other position maps to 1.
    NOTE(review): presumably 0 is the "Task" row and 1 the "Task+Ex" row of
    each 7-row model group - confirm against the row order of the table.
    """
    return 0 if i in (2, 5) else 1

In [None]:
def color_values(df, significant):
    """Build a frame of LaTeX styling strings for `df`.

    Rows are processed in consecutive groups of seven. Within a group, the
    first two rows are never coloured; every other row is compared, column by
    column, with the group row selected by `baseline_index`. Larger values get
    the "better" style, smaller values the "worse" style, and every cell where
    `significant` is False is reset to the empty style.

    Parameters:
        df: frame of scores.
        significant: boolean frame used as a mask (assumed to align with `df`
            - TODO confirm against the caller).

    Returns:
        A frame with `df`'s index and columns holding the style strings.
    """
    better = "textcolor: {PineGreen} --rwrap;"
    worse = "textcolor: {red} --rwrap;"
    same = ''

    def flag_column(column, beats):
        # One flag per row of the column; the first two rows of each group of
        # seven short-circuit to False without evaluating the comparison.
        flags = []
        for pos, val in enumerate(column):
            within = pos % 7
            group_start = pos // 7 * 7
            flags.append(
                within not in [0, 1]
                and beats(val, column.iloc[group_start + baseline_index(within)])
            )
        return flags

    is_better = df.apply(lambda column: flag_column(column, lambda v, ref: v > ref), axis=0)
    is_worse = df.apply(lambda column: flag_column(column, lambda v, ref: v < ref), axis=0)
    styles = pd.DataFrame(same, index=df.index, columns=df.columns)
    return styles.mask(is_better, better).mask(is_worse, worse).mask(~significant, same)

In [None]:
def color(df):
    """Build a frame of LaTeX styling strings for `df`, without a significance mask.

    Rows are handled in consecutive groups of seven; the first two rows of a
    group stay unstyled, and the rest are compared with the group row chosen by
    `baseline_index` (green when larger, red when smaller).

    NOTE(review): a later cell rebinds the name `color` to
    `partial(color_values, significant=significant)`, which replaces this function.
    """
    def is_candidate(pos):
        return pos % 7 not in [0, 1]

    def reference(column, pos):
        return column.iloc[pos // 7 * 7 + baseline_index(pos % 7)]

    above = df.apply(
        lambda column: [is_candidate(pos) and val > reference(column, pos) for pos, val in enumerate(column)],
        axis=0,
    )
    below = df.apply(
        lambda column: [is_candidate(pos) and val < reference(column, pos) for pos, val in enumerate(column)],
        axis=0,
    )
    blank = pd.DataFrame('', index=df.index, columns=df.columns)
    return blank.mask(above, "textcolor: {PineGreen} --rwrap;").mask(below, "textcolor: {red} --rwrap;")

In [None]:
from functools import partial

# Rebind `color` to `color_values` with the `significant` mask pre-filled, so the
# styler can call it with the data frame as its only argument.
# NOTE(review): this replaces the `def color(df)` from the previous cell.
color = partial(color_values, significant=significant)

In [None]:
# axis=None: `color` receives the whole frame and returns a same-shaped frame of style strings.
l = l.apply(color, axis=None)

In [None]:
# Bold the maximum of every column.
l = l.highlight_max(axis=0, props="textbf:--rwrap;")

In [None]:
print(l.to_latex(multirow_align="t"))

In [None]:
all_gs.iloc[:,0:9].mean(1).sort_values()

## Figures

In [None]:
all_gs

In [None]:
# Colours of matplotlib's configured property cycle, as a list; the figure cells
# below slice and reorder it to choose per-method colours.
cycle = plt.rcParams['axes.prop_cycle'].by_key()['color']

In [None]:
df.index.values[:-2]

In [None]:
from cycler import cycler

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,3), gridspec_kw=dict(wspace=0.1))
titles = ["w/o examplars", "w/ exemplars"]
for row in range(2):
    df = all_gs.reset_index().copy()
    if row == 0:
        df = df[~df.method.str.contains("Ex")]
    else:
        df = df[df.method.str.contains("Ex")]
    df.method = df.method.str.replace("Rules", "Spec")
    df["sem"] = df.iloc[:,:-1].sem(1)
    df = df.pivot(index=["model"], columns="method", values=["avg", "sem"])
    df = df.sort_values(by="model", key=lambda x: x.map(order))
    df = df.rename({"beta": "zephyr"})
#     df = df[sorted(df.columns, key= lambda item: order[item])]
    curr_cycle = cycle if row == 0 else [cycle[0]] + [cycle[3]] + cycle[1:3]
    custom_cycler = (cycler(color=curr_cycle))
    axes[row].set_prop_cycle(custom_cycler)
    df.plot(y = "avg", ax = axes[row], yerr="sem", marker = ".", markersize=6, linestyle="none", capsize=4, ylim=(25,75), color=curr_cycle)
    axes[row].plot(df.index.values[:-2], df['avg'].iloc[:-2], linestyle='-')
    axes[row].title.set_text(titles[row])
    axes[row].legend(title="")
# handles, labels = axes[1].get_legend_handles_labels()
# fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, +1.05),
#           fancybox=True, shadow=True, ncol=7)

In [None]:
fig.savefig(f"../specification-instruction-paper/media/avg_results.pdf", bbox_inches = "tight")

In [None]:
# Result frames plotted in the 2x5 figure grids below. `dfs`, `values` and
# `titles` are parallel lists: the loops read dfs[i], values[i] and titles[i]
# for the same panel index i.
dfs = [
    sst_df,
    qqp_df,
    squad_df,
    davidson_df,
    founta_df,
    sa_df,
    pi_df,
    rc_df,
    hsd_df,
]

In [None]:
# Column of each frame in `dfs` that is plotted.
values = ["accuracy", "accuracy", "exact_match", "f1", "f1", "avg", "avg", "avg", "avg"]

In [None]:
# Panel title for each frame in `dfs`.
# NOTE: `values` and `titles` are rebound by later figure cells.
titles = ["SENT (SST2)", "PARA (QQP)", "READ (SQuAD)", "HATE (Davidson)", "HATE (Founta)", "SENT (Suite)", "PARA (Suite)", "READ (Suite)", "HATE (Suite)"]

In [None]:
def jitter(x):
    """Return `x` shifted by a random offset drawn uniformly from [-0.25, 0.25).

    Uses NumPy's global random state, so the result depends on whatever seed
    (if any) was set beforehand.
    """
    offset = np.random.uniform(0, .5)
    return x + offset - .25

In [None]:
# Base x position of each model in the line figures below (before jitter).
# Keyed by the display name, i.e. "zephyr" rather than "beta".
type_ids = {'small': 1, 'base': 2, 'large': 3, "xl": 4, "xxl": 5, "zephyr": 6, "chatGPT": 7}

In [None]:
df

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(15,6), gridspec_kw=dict(wspace=0.22))
for row in range(2):
    for col in range(5):
        i = row*5 + col
        if row == 1 and col==4: 
            fig.delaxes(axes[row][col])
            break
        value = values[i]
        df = dfs[i].copy()
        df = df[~df.method.str.contains("Ex")]
        df.method = df.method.str.replace("Rules", "Spec")
        if row == 0 and col != 2: df[value] = df[value] *100
        df = df.pivot(index=["model"], columns="method", values=value)
        df = df.sort_values(by="model", key=lambda x: x.map(order))
        df = df[sorted(df.columns, key= lambda item: order[item])]
        df = df.rename({"beta": "zephyr"})
        curr_cycle = cycle[:3]
        custom_cycler = (cycler(color=curr_cycle))
        axes[row,col].set_prop_cycle(custom_cycler)
        df = df.reset_index()
        df['type_id'] = df['model'].apply(lambda x: type_ids[x])
        df['jitter_type'] = df['type_id'].apply(lambda x: jitter(x))
        df.plot(ax = axes[row,col], x='jitter_type', y=["Task", "Task+Spec", "Task+Spec+Rat"], legend=0, marker = ".", markersize=6, linestyle="none")
        axes[row,col].plot(df["jitter_type"][:-2], df.iloc[:-2][["Task", "Task+Spec", "Task+Spec+Rat"]], linestyle='-')
#         df.plot.line(ax = axes[row, col], legend=0, marker = ".", markersize=6)
        axes[row,col].xaxis.label.set_visible(False)
        axes[row,col].title.set_text(titles[i])
        axes[row,col].set_xticks(range(len(df)))
        if row != 0: axes[row, col].set_xticklabels([item for item in df["model"].tolist()], rotation=90)
        if row == 0: axes[row,col].tick_params('x', labelbottom=False)
#         if col != 0: axes[row,col].tick_params('y', labelleft=False)
handles, labels = axes[1,3].get_legend_handles_labels()
fig.legend(handles, labels,  loc='upper left', bbox_to_anchor=(0.75, 0.45))

In [None]:
fig.savefig(f"../specification-instruction-paper/media/dataset_results_line.pdf", bbox_inches = "tight")

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(15,6), gridspec_kw=dict(wspace=0.22))
for row in range(2):
    for col in range(5):
        i = row*5 + col
        if row == 1 and col==4: 
            fig.delaxes(axes[row][col])
            break
        value = values[i]
        df = dfs[i].copy()
        df = df[df.method.str.contains("Ex")]
        df.method = df.method.str.replace("Rules", "Spec")
        if row == 0 and col != 2: df[value] = df[value] *100
        df = df.pivot(index=["model"], columns="method", values=value)
        df = df.sort_values(by="model", key=lambda x: x.map(order))
        df = df[sorted(df.columns, key= lambda item: order[item])]
        df = df.rename({"beta": "zephyr"})
        curr_cycle = cycle[0:2] + [cycle[3]] + [cycle[2]]
        custom_cycler = (cycler(color=curr_cycle))
        axes[row,col].set_prop_cycle(custom_cycler)
        df = df.reset_index()
        df['type_id'] = df['model'].apply(lambda x: type_ids[x])
        df['jitter_type'] = df['type_id'].apply(lambda x: jitter(x))
        df.plot(ax = axes[row,col], x='jitter_type', y=["Task+Ex", "Task+Spec+Ex", "Task+Spec(chatGPT)+Ex", "Task+Spec+Ex+Rat"], legend=0, marker = ".", markersize=6, linestyle="none")
        axes[row,col].plot(df["jitter_type"][:-2], df.iloc[:-2][["Task+Ex", "Task+Spec+Ex", "Task+Spec(chatGPT)+Ex", "Task+Spec+Ex+Rat"]], linestyle='-')
#         df.plot.line(ax = axes[row, col], legend=0, marker = ".", markersize=6)
        axes[row,col].xaxis.label.set_visible(False)
        axes[row,col].title.set_text(titles[i])
        axes[row,col].set_xticks(range(len(df)))
        if row != 0: axes[row, col].set_xticklabels([item for item in df["model"].tolist()], rotation=90)
        if row == 0: axes[row,col].tick_params('x', labelbottom=False)
#         if col != 0: axes[row,col].tick_params('y', labelleft=False)
handles, labels = axes[1,3].get_legend_handles_labels()
fig.legend(handles, labels,  loc='upper left', bbox_to_anchor=(0.75, 0.45))

In [None]:
fig.savefig(f"../specification-instruction-paper/media/dataset_results_line(+ex).pdf", bbox_inches = "tight")

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(16,5), gridspec_kw=dict(wspace=0.1))
for row in range(2):
    for col in range(5):
        i = row*5 + col
        if row == 1 and col==4: 
            fig.delaxes(axes[row][col])
            break
        value = values[i]
        df = dfs[i].copy()
        if row == 0 and col != 2: df[value] = df[value] *100
        df = df.pivot(index=["method"], columns="model", values=value)
        df = df.sort_values(by="method", key=lambda x: x.map(order))
        df = df[sorted(df.columns, key= lambda item: order[item])]
        df.plot.bar(ax = axes[row, col], legend=0, ylim=(0,100), width=.8, rot=90)
        axes[row,col].xaxis.label.set_visible(False)
        axes[row,col].title.set_text(titles[i])
        if row == 0: axes[row,col].tick_params('x', labelbottom=False)
        if col != 0: axes[row,col].tick_params('y', labelleft=False)
handles, labels = axes[1,3].get_legend_handles_labels()
fig.legend(handles, labels,  loc='upper left', bbox_to_anchor=(0.75, 0.45))

In [None]:
fig.savefig(f"../specification-instruction-paper/media/dataset_results_by_method.pdf", bbox_inches = "tight")

In [None]:
suite_dfs = dfs[5:]

In [None]:
suites_agg_df = pd.concat([x[["avg", "model", "method"]] if i == 0 else x["avg"] for i, x in enumerate(suite_dfs)], axis = 1)

In [None]:
suites_agg_df.columns = ["avg_sa", "model", "method", "avg_pi", "avg_rc", "avg_hsd"]

In [None]:
suites_agg_df["avg"] = suites_agg_df.mean(1)

In [None]:
# NOTE(review): this captures the two frames BEFORE they are sorted below, and
# `data_dfs` is reassigned in the diffs-figure cell before any loop reads it.
data_dfs = [dataset_results, suites_agg_df]

In [None]:
# Sort rows by model, then method, using each label's rank in the global `order` mapping.
dataset_results = dataset_results.sort_values(by=["model", "method"], key=lambda x: x.map(order))

In [None]:
# Same ordering for the suite aggregate.
suites_agg_df = suites_agg_df.sort_values(by=["model", "method"], key=lambda x: x.map(order))

In [None]:
comps = []
for method in ["Task+Rules", "Task+Rules+Ex", "Task+Rules(chatGPT)+Ex", "Task+Rules+Rat", "Task+Rules+Ex+Rat"]:
    if "Ex" in method:
        base = "Task+Ex"
    else:
        base = "Task"
    comps.append(dataset_results.select_dtypes(include=np.number)[dataset_results.method == method] - dataset_results.select_dtypes(include=np.number)[dataset_results.method == base].values)
    comps[-1]["method"] = dataset_results[dataset_results.method == method].method
    comps[-1]["model"] = dataset_results[dataset_results.method == method].model
dataset_diffs = pd.concat(comps)

In [None]:
comps = []
df = suites_agg_df.copy()
for method in ["Task+Rules", "Task+Rules+Ex", "Task+Rules(chatGPT)+Ex", "Task+Rules+Rat", "Task+Rules+Ex+Rat"]:
    if "Ex" in method:
        base = "Task+Ex"
    else:
        base = "Task"
    comps.append(df.select_dtypes(include=np.number)[df.method == method] - df.select_dtypes(include=np.number)[df.method == base].values)
    comps[-1]["method"] = df[df.method == method].method
    comps[-1]["model"] = df[df.method == method].model
suite_diffs = pd.concat(comps)

In [None]:
dataset_diffs[~dataset_diffs.method.str.contains("Rat")]

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(15,6))
values = ['accuracy_sa', 'accuracy_pi', 'exact_match', 'f1_d', 'f1_f', 'avg_sa', 'avg_pi', 'avg_rc', 'avg_hsd']
titles = ["SST2", "QQP", "SQuAD", "Davidson", "Founta", "SENT Suite", "PARA Suite", "READ Suite", "HATE suite"]
dataset_diffs.columns = ['accuracy_sa', 'accuracy_pi', 'exact_match', 'f1_d', 'f1_f', "avg", "method", "model"]
data_dfs = [dataset_diffs[~dataset_diffs.method.str.contains("Rat")], suite_diffs[~suite_diffs.method.str.contains("Rat")]]
for row in range(2):
    for col in range(5):
        i = row*5 + col
        if row == 1 and col==4: 
            fig.delaxes(axes[row][col])
            break
        value = values[i]
        df = data_dfs[row].copy()
        df.method = df.method.str.replace("Rules", "Spec")
        df = df.pivot(index=["model"], columns="method", values=value)
        df = df.sort_values(by="model", key=lambda x: x.map(order))
        df = df[sorted(df.columns, key= lambda item: order[item])]
        df = df.rename({"beta": "zephyr"})
        df.plot.bar(ax = axes[row, col], legend=0, width=.8, color = cycle[2:])
        axes[row,col].xaxis.label.set_visible(False)
        axes[row,col].title.set_text(titles[i])
        axes[row, col].axhline(0, color='black', linewidth=.5)
#         axes[row, col].set_yscale("symlog")
        if row == 0: axes[row,col].tick_params('x', labelbottom=False)
#         if col != 0: axes[row,col].tick_params('y', labelleft=False)
handles, labels = axes[1,3].get_legend_handles_labels()
fig.legend(handles, labels,  loc='upper left', bbox_to_anchor=(0.75, 0.45))

In [None]:
fig.savefig(f"../specification-instruction-paper/media/dataset_results_diffs.pdf", bbox_inches = "tight")

In [None]:
# Non-rationale rows of both diff frames, reduced to the columns needed for the
# aggregate figure. Each frame is filtered with a mask built from ITS OWN
# 'method' column; the original filtered `suite_diffs` with the mask of
# `dataset_diffs`.
data_dfs = [
    dataset_diffs[~dataset_diffs.method.str.contains("Rat")][["model", "method", "avg"]],
    suite_diffs[~suite_diffs.method.str.contains("Rat")][["model", "method", "avg"]],
]

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12,6), gridspec_kw=dict(wspace=0.1))
titles = ["Datasets", "Suites"] 
for row in range(2):
    df = data_dfs[row].copy().reset_index()
    df.method = df.method.str.replace("Rules", "Spec")
    df = df.pivot(index=["model"], columns="method", values="avg")
    df = df.sort_values(by="model", key=lambda x: x.map(order))
    df = df[sorted(df.columns, key= lambda item: order[item])]
    df = df.rename({"beta": "zephyr"})
    df.plot.bar(ax = axes[row], legend=0, width=.8, color = cycle[2:])
    axes[row].xaxis.label.set_visible(False)
    axes[row].title.set_text(titles[row])
#     axes[row].set_yscale("symlog", linthresh = .01)
    axes[row].axhline(0, color='black', linewidth=.5)
    if row == 0: axes[row].tick_params('x', labelbottom=False)
#     if col != 0: axes[row,col].tick_params('y', labelleft=False)
handles, labels = axes[1].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, +1.05),
          fancybox=True, shadow=True, ncol=7)

In [None]:
fig.savefig(f"../specification-instruction-paper/media/dataset_results_agg_diffs.pdf", bbox_inches = "tight")

### Rule generation quality vs. functionality performance

In [None]:
# Column-wise concatenation of the four suite result frames.
func_scores = pd.concat([sa_df, pi_df, rc_df, hsd_df], axis=1)

In [None]:
# Rule ratings, transposed right after loading.
rule_ratings = pd.read_csv("./data/chatGPTgeneratedRules/rule_ratings.csv").T

In [None]:
# One task label per row of the final `func_scores` (assigned to its "task"
# column below); the counts add up to 142.
tasks = 36*["SENT"] + 53*["PARA"] + 24*["READ"] + 29*["HATE"]

In [None]:
# Label the columns of the transposed ratings with the values of its "Functionality" row.
rule_ratings.columns = rule_ratings.loc["Functionality"]

In [None]:
rule_ratings

In [None]:
# Add a "ratings" row; values are matched to `func_scores` columns by label.
func_scores.loc["ratings"] = rule_ratings.loc["Rating"]

In [None]:
func_scores = func_scores.T

In [None]:
# Drop the last three rows.
# NOTE(review): the reason is not visible here - confirm which rows these are.
func_scores = func_scores.iloc[:-3]

In [None]:
# Remove the rows that are not per-functionality entries.
func_scores = func_scores.drop(index=["avg", "model", "method", "score"])

In [None]:
# Requires len(tasks) == number of remaining rows.
func_scores["task"] = tasks

In [None]:
func_scores

In [None]:
func_scores.groupby(["task", "ratings"]).mean()[["chatGPT_seen_example", "chatGPT_seen_from_chatGPT_example"]]

In [None]:
func_scores.groupby(["task", "ratings"]).mean()[["chatGPT_seen_example", "chatGPT_seen_from_chatGPT_example"]]

In [None]:
func_scores.groupby(["task", "ratings"]).mean()[["chatGPT_seen_example", "chatGPT_seen_from_chatGPT_example"]]

In [None]:
func_scores[["chatGPT_seen_example", "chatGPT_seen_from_chatGPT_example"]]

In [None]:
plt.rcParams.update({'font.size': 13})

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10,5), gridspec_kw=dict(wspace=0.1), sharex=True, sharey=False)
axes = func_scores.boxplot(by="ratings", column=["chatGPT_baseline_example", "chatGPT_seen_from_chatGPT_example"], ax=axes)
axes[0].title.set_text("Task+Ex pass rates by rating")
axes[1].title.set_text("Task+Spec(cGPT)+Ex pass rates by rating")
axes[0].xaxis.label.set_visible(False)
axes[1].xaxis.label.set_text("Rating")
fig.suptitle("")

In [None]:
fig.savefig(f"../specification-instruction-paper/media/funcVsGen.pdf", bbox_inches = "tight")

### Table functionality descriptions

In [None]:
rule_ratings = rule_ratings.T

In [None]:
rule_ratings

In [None]:
sa_ratings = rule_ratings.iloc[:38].copy()

In [None]:
pi_ratings = rule_ratings.iloc[38:38+53].copy()

In [None]:
rc_ratings = rule_ratings.iloc[38+53:38+53+24].copy()

In [None]:
hsd_ratings = rule_ratings.iloc[38+53+24:].copy()

In [None]:
# Attach the descriptions stored in each task's func_desc.pkl to the matching
# ratings frame as column "h_descriptions", in the row order of the frame's
# "Functionality" column (double quotes are ignored when matching names).
for task, ratings in zip(["sa", "pi", "rc", "hsd"], [sa_ratings, pi_ratings, rc_ratings, hsd_ratings]):
    # NOTE: pickle.load executes arbitrary code; only use it on trusted, locally produced files.
    with open(f"./data/{task}/suite/func_desc.pkl", "rb") as file:
        func_desc = pickle.load(file)
    # Use a dedicated name here: the original rebound the global `order`, which
    # the sorting cells and `plot_dataset_results` rely on, so re-running any of
    # them after this cell broke.
    func_rank = {func.replace('"', ""): idx for idx, func in enumerate(ratings.Functionality.tolist())}
    od = collections.OrderedDict(sorted(func_desc.items(), key= lambda item: func_rank[item[0].replace('"', "")]))
    # Materialise the values view as a list before assigning it as a column.
    ratings["h_descriptions"] = list(od.values())

In [None]:
sa_ratings

In [None]:
sa_table = sa_ratings[["Functionality", "h_descriptions", "Description", "Rating"]].style.format(escape="latex")

In [None]:
print(sa_table.hide(axis=0).to_latex())

In [None]:
pi_table = pi_ratings[["Functionality", "h_descriptions", "Description", "Rating"]].style.format(escape="latex")

In [None]:
print(pi_table.hide(axis=0).to_latex())

In [None]:
rc_table = rc_ratings[["Functionality", "h_descriptions", "Description", "Rating"]].style.format(escape="latex")

In [None]:
print(rc_table.hide(axis=0).to_latex())

In [None]:
hsd_ratings

In [None]:
hsd_table = hsd_ratings[["Functionality", "h_descriptions", "Description", "Rating"]].style.format(escape="latex")

In [None]:
print(hsd_table.hide(axis=0).to_latex())

In [None]:
# Stack the four per-task rating frames back into one frame.
all_ratings = pd.concat([sa_ratings, pi_ratings, rc_ratings, hsd_ratings])

In [None]:
# Summary statistics of the whitespace-separated word count of "Description".
all_ratings.Description.str.split().str.len().describe()

In [None]:
# Same statistics for "h_descriptions".
all_ratings.h_descriptions.str.split().str.len().describe()

In [None]:
all_ratings.iloc[90]["h_descriptions"]

In [None]:
pi_ratings.iloc[0].Description

### Table rationale analysis

In [None]:
# Manually labelled rationale samples.
rat_samples = pd.read_csv("./results/labelled_rat_samples.csv")

In [None]:
# Keep only the columns used in the analysis below.
rat_samples = rat_samples[["input", "pred", "Explanation correct", "Pred match expl.", "Why no correct", "Correct answer"]]

In [None]:
rat_samples

In [None]:
rat_samples["Explanation correct"].value_counts()

In [None]:
# NOTE(review): hand-typed ratio, presumably read off the counts above - confirm it still matches the data.
21/40

In [None]:
rat_samples["Pred match expl."].value_counts()

In [None]:
rat_samples["Why no correct"].value_counts()

In [None]:
rat_samples["Correct answer"].value_counts()

In [None]:
# Compares against the STRING "1": this selects nothing if the column was parsed as integers - verify the dtype.
rat_samples[rat_samples["Explanation correct"] == "1"]["Correct answer"].value_counts()

In [None]:
# Same caveat for the string "0".
rat_samples[rat_samples["Explanation correct"] == "0"]["Correct answer"].value_counts()

In [None]:
# Distribution of "Correct answer" within each "Why no correct" category.
rat_samples.groupby("Why no correct")["Correct answer"].value_counts()