In [None]:
%load_ext autoreload
%autoreload 2

import json
from pathlib import Path
from datasets import Dataset, load_dataset, load_metric, concatenate_datasets
from utils.results import *
from utils.prompts import *
from data_sets.data_utils import load_hsd_dataset,  get_suite
from transformers import T5TokenizerFast
from scipy.stats import kendalltau
import seaborn as sns
import numpy as np
import pandas as pd
import config
import collections
import pickle
import matplotlib.pyplot as plt

In [None]:
def kendall_pval(x, y):
    """Return only the p-value of Kendall's tau test between ``x`` and ``y``.

    Shaped as a two-argument callable so it can be passed as the ``method``
    argument of ``DataFrame.corr``.
    """
    _tau, p_value = kendalltau(x, y)
    return p_value

In [None]:
# Tokenizer used by get_prompt_length_and_perf to count the tokens of each prompt.
tokenizer = T5TokenizerFast.from_pretrained("google/flan-t5-small")

In [None]:
# Prompts are tokenized with truncation=True, so measured lengths are capped at this value.
tokenizer.model_max_length = 4096

In [None]:
def get_prompt_length_and_perf(df, test_data, examples_per_label, task, suite=False):
    """Pair each test item's average score with the token length of its prompt.

    For every ("method", "score") group of ``df`` the prompts are rebuilt with
    ``prompt(...)``, tokenized with the module-level ``tokenizer`` and measured.

    Parameters
    ----------
    df : pandas.DataFrame
        Hits table (as produced by ``dataset_hits_df`` / ``suite_hits_df`` in
        this notebook). Must contain "method" and "score" columns; its last
        three columns are excluded from the per-item average -- presumably
        they are the non-item metadata columns (TODO confirm).
    test_data
        Test items handed to ``prompt``; it must yield exactly one prompt per
        averaged item, otherwise the length assignment raises.
    examples_per_label
        Examples forwarded to ``prompt`` when the method name contains "Ex".
    task : str
        Task identifier used to locate the files under ``./data/``.
    suite : bool, default False
        Forwarded unchanged to ``prompt``.

    Returns
    -------
    pandas.DataFrame
        One row per (group, item) with the columns "avg" (mean over the rows
        of the group), "length" (number of input ids, capped by the
        tokenizer's ``model_max_length`` because truncation is enabled) and
        "method" (the ``(method, score)`` group key). Empty when ``df`` has
        no groups.
    """
    # NOTE: pickle.load can execute arbitrary code -- only read suite files from a trusted source.
    with open(f"./data/{task}/suite/class_to_funcs.pkl", "rb") as file:
        class_to_funcs = pickle.load(file)
    func_to_class = {func: func_class for func_class, funcs in class_to_funcs.items() for func in funcs.keys()}
    with open(f"./data/{task}/suite/func_desc.pkl", "rb") as file:
        human_func_desc = pickle.load(file)
    with open(f"./data/chatGPTgeneratedRules/{task}/rules.json", "r") as file:
        chatGPT_func_desc = json.load(file)

    # Collect the per-group frames and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop re-copies all earlier rows each time.
    frames = []
    for group_key, group in df.groupby(["method", "score"]):
        method_name = group_key[0]
        # Transpose so items become rows, drop the last three rows (the last
        # three columns of df) and average each item over the group's rows.
        prompt_perf = pd.DataFrame(group.T.iloc[:-3].mean(axis=1), columns=["avg"])

        # The method name encodes the prompt configuration.
        from_chatGPT = "chatGPT" in method_name
        ask_rule = "Rat" in method_name
        add_examples = "Ex" in method_name

        # Without in-context examples an explicit output instruction is appended.
        output_format = "" if add_examples else "\nOutput a concise, minimal answer."
        examples = examples_per_label if add_examples else None
        func_desc = chatGPT_func_desc if from_chatGPT else human_func_desc

        # NOTE(review): the second element of the group key (the "score" column
        # value) is what gets passed as prompt's `method` -- confirm this is intended.
        prompts = prompt(task, test_data, method=group_key[1], func_desc=func_desc,
                         func_to_class=func_to_class, class_to_funcs=class_to_funcs,
                         suite=suite, ask_rule=ask_rule, examples=examples)
        prompt_dataset = Dataset.from_list([{"prompt": text + output_format} for text in prompts])
        tokenized_prompts = prompt_dataset.map(lambda batch: tokenizer(batch["prompt"], truncation=True),
                                               remove_columns=prompt_dataset.column_names,
                                               batched=True)
        prompt_perf["length"] = [len(input_ids) for input_ids in tokenized_prompts["input_ids"]]
        prompt_perf["method"] = [group_key] * len(tokenized_prompts)
        frames.append(prompt_perf)

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame instead.
        return pd.DataFrame()
    return pd.concat(frames, axis=0)

In [None]:
# Collects one performance/length frame per evaluated dataset or suite; concatenated in the "Aggregate" section.
all_prompts = []

## Sentiment analysis

### Dataset

In [None]:
result_path = Path(f"./results/sa/sst2/")

In [None]:
metric = load_metric("glue", "sst2")

In [None]:
dataset = load_dataset("glue", "sst2")
dataset_test = dataset["validation"]
dataset_train = dataset["train"]

In [None]:
examples_per_label = [dataset_train.filter(lambda x: x["label"] == label) for label in [0, 1]]

In [None]:
df, preds = dataset_hits_df("sa", result_path, dataset_test, metric)

In [None]:
performances_with_lengths = get_prompt_length_and_perf(df, dataset_test, examples_per_label, "sa")

In [None]:
performances_with_lengths.plot(kind="scatter", x="length", y="avg")

In [None]:
# fig, ax = plt.subplots(figsize=(25,8))
# g = sns.scatterplot(x='length', y='avg', hue='method', data=performances_with_lengths, ax=ax)
# sns.move_legend(g,"lower center", bbox_to_anchor=(.5, 1), ncol=7, title=None, frameon=False)

In [None]:
performances_with_lengths.corr(method="kendall").loc["avg", "length"]

In [None]:
performances_with_lengths.corr(method=kendall_pval).loc["avg", "length"]

In [None]:
for m, data in performances_with_lengths.groupby("method"):
    print(m)
    print(data.corr(method="kendall").loc["avg", "length"])
    print("====")

In [None]:
all_prompts.append(performances_with_lengths)

### Suite

In [None]:
path = Path("results/sa/suite/")

In [None]:
suite_test = get_suite(config.sa_path)["test"]

In [None]:
df, test_idxs =suite_hits_df("sa", path, suite_test)

In [None]:
keep_items = df.columns[~df.isna().any()].tolist()

In [None]:
df = df.dropna(1)

In [None]:
suite_test = suite_test.select(indices =[x[0] for x in list(test_idxs.values())])

In [None]:
suite_test = suite_test.select(indices =keep_items[:-3])

In [None]:
performances_with_lengths = get_prompt_length_and_perf(df, suite_test, examples_per_label, "sa", suite=True)

In [None]:
performances_with_lengths.plot(kind="scatter", x="length", y="avg")

In [None]:
# fig, ax = plt.subplots(figsize=(25,8))
# g = sns.scatterplot(x='length', y='avg', hue='method', data=performances_with_lengths, ax=ax)
# sns.move_legend(g,"lower center", bbox_to_anchor=(.5, 1), ncol=7, title=None, frameon=False)

In [None]:
performances_with_lengths.corr(method="kendall").loc["avg", "length"]

In [None]:
performances_with_lengths.corr(method=kendall_pval).loc["avg", "length"]

In [None]:
for m, data in performances_with_lengths.groupby("method"):
    print(m)
    print(data.corr(method="kendall").loc["avg", "length"])
    print("====")

In [None]:
all_prompts.append(performances_with_lengths)

## Paraphrase identification

### Dataset

In [None]:
result_path = Path(f"./results/pi/qqp/")

In [None]:
dataset = load_dataset("glue", "qqp")
dataset_test = dataset["validation"]
dataset_train = dataset["train"]

In [None]:
metric = load_metric("glue","qqp")

In [None]:
examples_per_label = [dataset_train.filter(lambda x: x["label"] == label) for label in [0, 1]]

In [None]:
df, preds = dataset_hits_df("pi", result_path, dataset_test, metric)

In [None]:
performances_with_lengths = get_prompt_length_and_perf(df, dataset_test, examples_per_label, "pi")

In [None]:
performances_with_lengths.plot(kind="scatter", x="length", y="avg")

In [None]:
# fig, ax = plt.subplots(figsize=(25,8))
# g = sns.scatterplot(x='length', y='avg', hue='method', data=performances_with_lengths, ax=ax)
# sns.move_legend(g,"lower center", bbox_to_anchor=(.5, 1), ncol=7, title=None, frameon=False)

In [None]:
performances_with_lengths.corr(method="kendall").loc["avg", "length"]

In [None]:
performances_with_lengths.corr(method=kendall_pval).loc["avg", "length"]

In [None]:
for m, data in performances_with_lengths.groupby("method"):
    print(m)
    print(data.corr(method="kendall").loc["avg", "length"])
    print("====")

In [None]:
all_prompts.append(performances_with_lengths)

### Suite

In [None]:
path = Path("results/pi/suite/")

In [None]:
suite_test = get_suite(config.pi_path)["test"]

In [None]:
df, test_idxs =suite_hits_df("pi", path, suite_test)

In [None]:
keep_items = df.columns[~df.isna().any()].tolist()

In [None]:
df = df.dropna(1)

In [None]:
suite_test = suite_test.select(indices =[x[0] for x in list(test_idxs.values())])

In [None]:
suite_test = suite_test.select(indices =keep_items[:-3])

In [None]:
performances_with_lengths = get_prompt_length_and_perf(df, suite_test, examples_per_label, "pi", suite=True)

In [None]:
performances_with_lengths.plot(kind="scatter", x="length", y="avg")

In [None]:
# fig, ax = plt.subplots(figsize=(25,8))
# g = sns.scatterplot(x='length', y='avg', hue='method', data=performances_with_lengths, ax=ax)
# sns.move_legend(g,"lower center", bbox_to_anchor=(.5, 1), ncol=7, title=None, frameon=False)

In [None]:
performances_with_lengths.corr(method="kendall").loc["avg", "length"]

In [None]:
performances_with_lengths.corr(method=kendall_pval).loc["avg", "length"]

In [None]:
for m, data in performances_with_lengths.groupby("method"):
    print(m)
    print(data.corr(method="kendall").loc["avg", "length"])
    print("====")

In [None]:
all_prompts.append(performances_with_lengths)

## Reading comprehension

### Dataset

In [None]:
result_path = Path(f"./results/rc/squad/")

In [None]:
dataset = load_dataset("squad")
dataset_test = dataset["validation"]
dataset_train = dataset["train"]

In [None]:
metric = load_metric("squad")

In [None]:
examples_per_label = dataset_train

In [None]:
df, preds = dataset_hits_df("rc", result_path, dataset_test, metric, label_col="answers")

In [None]:
performances_with_lengths = get_prompt_length_and_perf(df, dataset_test, examples_per_label, "rc")

In [None]:
performances_with_lengths.plot(kind="scatter", x="length", y="avg")

In [None]:
# fig, ax = plt.subplots(figsize=(25,8))
# g = sns.scatterplot(x='length', y='avg', hue='method', data=performances_with_lengths, ax=ax)
# sns.move_legend(g,"lower center", bbox_to_anchor=(.5, 1), ncol=7, title=None, frameon=False)

In [None]:
performances_with_lengths.corr(method="kendall").loc["avg", "length"]

In [None]:
performances_with_lengths.corr(method=kendall_pval).loc["avg", "length"]

In [None]:
for m, data in performances_with_lengths.groupby("method"):
    print(m)
    print(data.corr(method="kendall").loc["avg", "length"])
    print("====")

In [None]:
all_prompts.append(performances_with_lengths)

### Suite

In [None]:
path = Path("results/rc/suite/")

In [None]:
suite_test = get_suite(config.rc_path)["test"]

In [None]:
df, test_idxs =suite_hits_df("rc", path, suite_test)

In [None]:
keep_items = df.columns[~df.isna().any()].tolist()

In [None]:
df = df.dropna(1)

In [None]:
suite_test = suite_test.select(indices =[x[0] for x in list(test_idxs.values())])

In [None]:
suite_test = suite_test.select(indices =keep_items[:-3])

In [None]:
performances_with_lengths = get_prompt_length_and_perf(df, suite_test, examples_per_label, "rc", suite=True)

In [None]:
performances_with_lengths.plot(kind="scatter", x="length", y="avg")

In [None]:
# fig, ax = plt.subplots(figsize=(25,8))
# g = sns.scatterplot(x='length', y='avg', hue='method', data=performances_with_lengths, ax=ax)
# sns.move_legend(g,"lower center", bbox_to_anchor=(.5, 1), ncol=7, title=None, frameon=False)

In [None]:
performances_with_lengths.corr(method="kendall").loc["avg", "length"]

In [None]:
performances_with_lengths.corr(method=kendall_pval).loc["avg", "length"]

In [None]:
for m, data in performances_with_lengths.groupby("method"):
    print(m)
    print(data.corr(method="kendall").loc["avg", "length"])
    print("====")

In [None]:
all_prompts.append(performances_with_lengths)

## Hate speech detection

### Dataset

In [None]:
davidson_result_path = Path(f"./results/hsd/davidson2017/")
davidson = load_hsd_dataset("davidson2017")
davidson_test = davidson["test"]
davidson_train = davidson["train"]

In [None]:
founta_result_path = Path(f"./results/hsd/founta2018/")
founta = load_hsd_dataset("founta2018")
founta_test = founta["test"]
founta_train = founta["train"]

In [None]:
davidson_examples_per_label = [davidson_train.filter(lambda x: x["label"] == label) for label in [0, 1]]
founta_examples_per_label = [founta_train.filter(lambda x: x["label"] == label) for label in [0, 1]]
dataset_train = concatenate_datasets([davidson_train, founta_train])
examples_per_label = [dataset_train.filter(lambda x: x["label"] == label) for label in [0, 1]]

In [None]:
metric = load_metric("glue","qqp")

In [None]:
df, preds = dataset_hits_df("hsd", davidson_result_path, davidson_test, metric)

In [None]:
performances_with_lengths = get_prompt_length_and_perf(df, davidson_test, davidson_examples_per_label, "hsd")

In [None]:
performances_with_lengths.plot(kind="scatter", x="length", y="avg")

In [None]:
# fig, ax = plt.subplots(figsize=(25,8))
# g = sns.scatterplot(x='length', y='avg', hue='method', data=performances_with_lengths, ax=ax)
# sns.move_legend(g,"lower center", bbox_to_anchor=(.5, 1), ncol=7, title=None, frameon=False)

In [None]:
performances_with_lengths.corr(method="kendall").loc["avg", "length"]

In [None]:
performances_with_lengths.corr(method=kendall_pval).loc["avg", "length"]

In [None]:
for m, data in performances_with_lengths.groupby("method"):
    print(m)
    print(data.corr(method="kendall").loc["avg", "length"])
    print("====")

In [None]:
all_prompts.append(performances_with_lengths)

In [None]:
df, preds = dataset_hits_df("hsd", founta_result_path, founta_test, metric)

In [None]:
performances_with_lengths = get_prompt_length_and_perf(df, founta_test, founta_examples_per_label, "hsd")

In [None]:
performances_with_lengths.plot(kind="scatter", x="length", y="avg")

In [None]:
# fig, ax = plt.subplots(figsize=(25,8))
# g = sns.scatterplot(x='length', y='avg', hue='method', data=performances_with_lengths, ax=ax)
# sns.move_legend(g,"lower center", bbox_to_anchor=(.5, 1), ncol=7, title=None, frameon=False)

In [None]:
performances_with_lengths.corr(method="kendall").loc["avg", "length"]

In [None]:
performances_with_lengths.corr(method=kendall_pval).loc["avg", "length"]

In [None]:
for m, data in performances_with_lengths.groupby("method"):
    print(m)
    print(data.corr(method="kendall").loc["avg", "length"])
    print("====")

In [None]:
all_prompts.append(performances_with_lengths)

### Suite

In [None]:
path = Path("results/hsd/suite/")

In [None]:
suite_test = get_suite(config.hatecheck_path, hateCheck=True)["test"]

In [None]:
df, test_idxs =suite_hits_df("hsd", path, suite_test)

In [None]:
suite_test = suite_test.rename_column("test_case", "text")

In [None]:
performances_with_lengths = get_prompt_length_and_perf(df, suite_test, examples_per_label, "hsd", suite=True)

In [None]:
performances_with_lengths.plot(kind="scatter", x="length", y="avg")

In [None]:
# fig, ax = plt.subplots(figsize=(25,8))
# g = sns.scatterplot(x='length', y='avg', hue='method', data=performances_with_lengths, ax=ax)
# sns.move_legend(g,"lower center", bbox_to_anchor=(.5, 1), ncol=7, title=None, frameon=False)

In [None]:
performances_with_lengths.corr(method="kendall").loc["avg", "length"]

In [None]:
performances_with_lengths.corr(method=kendall_pval).loc["avg", "length"]

In [None]:
for m, data in performances_with_lengths.groupby("method"):
    print(m)
    print(data.corr(method="kendall").loc["avg", "length"])
    print("====")

In [None]:
all_prompts.append(performances_with_lengths)

## Aggregate

In [None]:
for prompts in all_prompts:
    print(prompts.groupby("method").agg(max))

In [None]:
all_prompts_df = pd.concat(all_prompts)

In [None]:
all_prompts_df.plot(kind="scatter", x="length", y="avg")

In [None]:
# fig, ax = plt.subplots(figsize=(25,8))
# g = sns.scatterplot(x='length', y='avg', hue='method', data=performances_with_lengths, ax=ax)
# sns.move_legend(g,"lower center", bbox_to_anchor=(.5, 1), ncol=7, title=None, frameon=False)

In [None]:
all_prompts_df.corr(method="kendall").loc["avg", "length"]

In [None]:
all_prompts_df.corr(method=kendall_pval).loc["avg", "length"]

In [None]:
for m, data in all_prompts_df.groupby("method"):
    print(m)
    print(data.corr(method="kendall").loc["avg", "length"])
    print("====")

In [None]:
all_prompts_df.groupby("method")["length"].mean().sort_values()