In [None]:
%load_ext autoreload
%autoreload 2

import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, accuracy_score
from pathlib import Path

from utils import initialize_seeds

In [None]:
# Development-mode switch; no visible cell in this notebook reads it.
DEV = False

In [None]:
# Project helper from utils; presumably seeds the random number generators — confirm in utils.py.
initialize_seeds()

In [None]:
# Locations of the HateCheck test suite and of the pickled model predictions.
data_path = Path("./hatecheck-data")
results_path = Path("./results/hatecheck/")

In [None]:
# HateCheck test cases; the first CSV column becomes the DataFrame index.
hatecheck_df = pd.read_csv(data_path/"test_suite_cases.csv", index_col=0)

In [None]:
# Shell call listing the available result files; IPython substitutes {results_path}.
! ls {results_path}

In [None]:
# Prediction frames keyed by a short model name. Only the columns needed for
# evaluation are kept: case_id (join key), preds, and, where selected, split.
# NOTE(review): "random"/"unseen" presumably name the two data splits the models
# were trained on — confirm against the training scripts.
results = {}
results["random"] = pd.read_pickle(results_path/"results_BERT_weighted.pkl")[["case_id", "preds", "split"]]
results["unseen"] = pd.read_pickle(results_path/"results_BERT_weighted_unseen.pkl")[["case_id", "preds", "split"]]
results["davidson2017_random"] = pd.read_pickle(results_path/"results_BERT_davidson2017_weighted.pkl")[["case_id", "preds"]]
results["davidson2017_unseen"] = pd.read_pickle(results_path/"results_BERT_davidson2017_weighted_unseen.pkl")[["case_id", "preds"]]

In [None]:
# Same pair of result files for the founta2018 models (no split column selected).
results["founta2018_random"] = pd.read_pickle(results_path/"results_BERT_founta2018_weighted.pkl")[["case_id", "preds"]]
results["founta2018_unseen"] = pd.read_pickle(results_path/"results_BERT_founta2018_weighted_unseen.pkl")[["case_id", "preds"]]

In [None]:
# One frame holding the predictions of both dataset-only models.
# NOTE(review): the "nofinetune" naming below suggests these models were not
# fine-tuned on HateCheck — confirm against the script that wrote this file.
no_fine_tune_results =pd.read_pickle('./results/hatecheck/results_BERT_davidson_and_founta_weighted.pkl')

In [None]:
# Rename to the f"{model}_preds" pattern that the merge loop and the evaluation helpers look up.
no_fine_tune_results.rename(columns = {"pred_BERT_davidson2017_weighted": "davidson2017_nofinetune_preds",
                             "pred_BERT_founta2018_weighted": "founta2018_nofinetune_preds"}, inplace=True)

In [None]:
# One two-column frame per model; .copy() detaches them from no_fine_tune_results.
results["founta2018_nofinetune"] = no_fine_tune_results[["case_id", "founta2018_nofinetune_preds"]].copy()
results["davidson2017_nofinetune"] = no_fine_tune_results[["case_id", "davidson2017_nofinetune_preds"]].copy()

In [None]:
# Results from the "hateCheck+<dataset>" files; the split column is kept for these.
results["hatecheck+davidson"] = pd.read_pickle(results_path/"results_BERT_hateCheck+davidson_weighted.pkl")[["case_id", "preds", "split"]]
results["hatecheck+founta"] = pd.read_pickle(results_path/"results_BERT_hateCheck+founta_weighted.pkl")[["case_id", "preds", "split"]]

In [None]:
# merge with hatecheck df
# Each model's generic "preds"/"split" columns are renamed in place to
# f"{model}_preds"/f"{model}_splits" (a no-op for frames that lack them), then
# the frame is left-joined onto hatecheck_df by case_id.
# NOTE: this cell is not idempotent — re-running it merges the same prediction
# columns onto the already merged hatecheck_df again. Restart and run all instead.
for model in results:
    results[model].rename(columns={"preds": f"{model}_preds", "split": f"{model}_splits"}, inplace=True)
    hatecheck_df = hatecheck_df.merge(results[model], how='left', on='case_id')

In [None]:
hatecheck_df

### Insert majority baseline

In [None]:
# Constant baseline that predicts "hateful" for every case.
hatecheck_df["majority_preds"] = "hateful"

In [None]:
hatecheck_df.head()

## Evaluation by class

In [None]:
# Display the merged frame (gold labels plus one prediction column per model).
hatecheck_df

In [None]:
from scipy.stats import binomtest


def get_significance(preds_1, preds_2, labels):
    """Two-sided exact sign test comparing two models on the same samples.

    Only discordant samples (exactly one of the two models is correct) are
    counted. Under the null hypothesis each model is equally likely to be the
    correct one on a discordant sample, so the number of samples that only
    model 2 gets right is tested against a Binomial(trials, 0.5).

    Args:
        preds_1: predictions of the first model, one per sample.
        preds_2: predictions of the second model, aligned with ``preds_1``.
        labels: gold labels, aligned with both prediction sequences.

    Returns:
        The result object of ``scipy.stats.binomtest(successes, trials)``,
        where ``successes`` counts samples only model 2 classifies correctly
        and ``trials`` counts all discordant samples.

    Raises:
        ValueError: if the three inputs differ in shape, or if there is no
            discordant sample (the test is undefined for zero trials).
    """
    preds_1, preds_2, labels = (np.asarray(values) for values in (preds_1, preds_2, labels))
    if not (preds_1.shape == preds_2.shape == labels.shape):
        raise ValueError(
            "preds_1, preds_2 and labels must have the same shape, got "
            f"{preds_1.shape}, {preds_2.shape} and {labels.shape}")

    corrects1 = preds_1 == labels
    corrects2 = preds_2 == labels
    # Samples that model 2 gets right and model 1 gets wrong.
    successes = int(np.sum(corrects2 & ~corrects1))
    # All samples on which the two models disagree in correctness.
    trials = int(np.sum(corrects1 != corrects2))
    if trials == 0:
        # binomtest rejects n == 0; fail with a message that names the cause.
        raise ValueError(
            "the two models are correct on exactly the same samples; "
            "the sign test needs at least one discordant sample")
    return binomtest(successes, trials)

In [None]:
# Restrict to the test portion of the random split and print a per-class
# precision/recall/F1 report for each listed model.
test_samples = hatecheck_df[hatecheck_df["random_splits"] == "test"]
for model in ["davidson2017_nofinetune", "davidson2017_random", "founta2018_nofinetune",
              "founta2018_random", "random", "hatecheck+davidson", "hatecheck+founta"]:
    preds = test_samples[f"{model}_preds"].tolist()
    labels = test_samples["label_gold"].tolist()
    print(f"Results for {model}:")
    print(classification_report(labels, preds,digits=4))
    print()

In [None]:
# Four paired sign tests per source dataset, all on the same test samples.
# model[:-4] strips the year, e.g. "davidson2017" -> "davidson".
for model in ["davidson2017", "founta2018"]:
    # {model}_nofinetune vs {model}_random
    print(get_significance(test_samples[f"{model}_nofinetune_preds"].tolist(),
                           test_samples[f"{model}_random_preds"].tolist(),
                           test_samples["label_gold"].tolist()))
    # {model}_nofinetune vs hatecheck+{dataset}
    print(get_significance(test_samples[f"{model}_nofinetune_preds"].tolist(),
                           test_samples[f"hatecheck+{model[:-4]}_preds"].tolist(),
                           test_samples["label_gold"].tolist()))
    # {model}_random vs the "random" model
    print(get_significance(test_samples[f"{model}_random_preds"].tolist(),
                           test_samples[f"random_preds"].tolist(),
                           test_samples["label_gold"].tolist()))
    # {model}_random vs hatecheck+{dataset}
    print(get_significance(test_samples[f"{model}_random_preds"].tolist(),
                           test_samples[f"hatecheck+{model[:-4]}_preds"].tolist(),
                           test_samples["label_gold"].tolist()))

In [None]:
# davidson2017_random vs founta2018_random on the same test samples.
print(get_significance(test_samples["davidson2017_random_preds"].tolist(),
                           test_samples["founta2018_random_preds"].tolist(),
                           test_samples["label_gold"].tolist()))

## Evaluation by functionality

In [None]:
# Show which functionalities occur in each part of the "unseen" split.
for _, group in hatecheck_df.groupby("unseen_splits"):
    print(pd.unique(group.functionality))

In [None]:
# NOTE: pickle.load can execute arbitrary code from the file — only load trusted files.
with open("./data/held_out_dic.pkl", "rb") as file:
    held_out_dic = pickle.load(file)

In [None]:
# All held-out functionalities, hateful ones followed by non-hateful ones.
held_out_funcs = held_out_dic["hateful_funcs"] + held_out_dic["nonhateful_funcs"]

In [None]:
held_out_dic

In [None]:
# Held-out target identities, used in the protected-group evaluation.
held_out_idents = held_out_dic["idents"]

In [None]:
def results_to_df(results, func_col, pred_col, split_col):
    """Accuracy per category of ``func_col``, overall and for each data split.

    Args:
        results: DataFrame with the columns ``label_gold``, ``func_col``,
            ``pred_col`` and ``split_col``.
        func_col: column whose distinct values define the rows of the output
            (e.g. "functionality" or "target_ident").
        pred_col: column holding the predictions compared to ``label_gold``.
        split_col: column holding the split of each case; rows with a missing
            split are ignored by the per-split columns (groupby drops them)
            but still count towards "overall".

    Returns:
        DataFrame with one row per distinct value of ``func_col`` (in order of
        first appearance) and the columns
        ``[func_col, "overall", "train", "val", "test"]``. A cell is missing
        when the category has no case in that split. This includes a NaN
        category, which never matches an equality test and previously made the
        "overall" column fail with ZeroDivisionError.

    Raises:
        KeyError: if ``split_col`` lacks one of "train", "val" or "test".
    """
    categories = pd.unique(results[func_col])

    def _accuracies(frame, name):
        # Accuracy for every category, in the order of `categories`;
        # None where `frame` contains no case of that category.
        is_correct = frame['label_gold'] == frame[pred_col]
        values = []
        for category in categories:
            in_category = frame[func_col] == category
            n_cases = int(in_category.sum())
            if n_cases == 0:
                values.append(None)
            else:
                values.append(int((in_category & is_correct).sum()) / n_cases)
        return pd.Series(values, name=name)

    columns = [pd.Series(categories, name=func_col)]
    columns.extend(_accuracies(split_df, split) for split, split_df in results.groupby(split_col))
    columns.append(_accuracies(results, "overall"))

    # All series share the same default RangeIndex, so they line up row by row.
    accuracy_df = pd.concat(columns, axis=1)
    return accuracy_df[[func_col, "overall", "train", "val", "test"]]

In [None]:
def comp_seen_unseen(accuracy_df, col, held_out, split):
    """Split one accuracy column into seen and held-out categories.

    Args:
        accuracy_df: per-category accuracy table (as built by results_to_df).
        col: column of ``accuracy_df`` naming the category of each row.
        held_out: collection of category names that were held out.
        split: accuracy column to return (e.g. "test").

    Returns:
        Tuple ``(seen, unseen)`` of Series taken from ``accuracy_df[split]``:
        rows whose category is not in ``held_out``, then rows whose category is.
    """
    is_held_out = accuracy_df[col].isin(held_out)
    seen = accuracy_df.loc[~is_held_out, split]
    unseen = accuracy_df.loc[is_held_out, split]
    return seen, unseen

In [None]:
def compare_results(results, target_df, func, held_out):
    """Print and collect per-category accuracy tables for every model.

    A model whose name ends in "random" or "unseen" is evaluated on that split
    only; every other model is evaluated on both splits.

    Args:
        results: mapping whose keys are model names; only the keys are used.
        target_df: DataFrame holding ``label_gold``, ``func``, and for each
            model/split the columns ``{model}_preds`` and ``{split}_splits``.
        func: category column to break accuracy down by.
        held_out: categories treated as held out when comparing test accuracy.

    Returns:
        Dict mapping ``(model, split)`` to the table from ``results_to_df``.
    """
    # Name suffix -> the only split such a model is evaluated on.
    splits_by_suffix = {"random": ["random"], "unseen": ["unseen"]}
    accuracy_tables = {}
    for model in results:
        for split in splits_by_suffix.get(model[-6:], ["random", "unseen"]):
            print(f"Results for {model} on the {split} split:")
            table = results_to_df(target_df, func, f"{model}_preds", f"{split}_splits")
            accuracy_tables[(model, split)] = table
            print(table)
            print(table.describe())
            # Test accuracy of seen vs held-out categories, then their summaries.
            seen_acc, unseen_acc = comp_seen_unseen(table, func, held_out, "test")
            print(seen_acc)
            print(unseen_acc)
            print(seen_acc.describe())
            print(unseen_acc.describe())
            print()
    return accuracy_tables

In [None]:
# Per-functionality accuracy for every model; held_out_funcs marks the held-out functionalities.
func_results = compare_results(results, hatecheck_df, "functionality", held_out_funcs)

## Evaluation by protected group

In [None]:
# Create a df with only template cases (case_templ contains 'IDENTITY'), so the
# number of cases per identity should be balanced; the count below checks that.
templ_cases_df = hatecheck_df[hatecheck_df.case_templ.str.contains('IDENTITY')].copy()
templ_cases_df.groupby(templ_cases_df.target_ident).case_id.count()

In [None]:
# Register the majority baseline so compare_results iterates over it as well;
# its prediction column already follows the f"{model}_preds" naming.
results["majority"] = hatecheck_df[["case_id", "majority_preds"]]

In [None]:
# Accuracy per target identity on the template cases only.
compare_results(results, templ_cases_df, "target_ident", held_out_idents)

In [None]:
# Accuracy per target identity on all cases that have a target identity.
compare_results(results, hatecheck_df.dropna(subset=["target_ident"]), "target_ident", held_out_idents)

## Evaluate 1out results

In [None]:
# Reload the raw test suite. This replaces the hatecheck_df that carried the
# merged prediction columns from the sections above.
hatecheck_df = pd.read_csv(data_path/"test_suite_cases.csv", index_col=0)

In [None]:
# From here on results_path points at the leave-one-out result files.
results_path = Path("./results/hatecheck/leave1out/")

In [None]:
! ls ./results/hatecheck/leave1out/

In [None]:
# Load the leave-one-out predictions as results_1out[col][model][held_out],
# one frame per held-out functionality / target identity.
results_1out = {}
for col in ["functionality", "target_ident"]:
    results_1out[col] = {}
    # The set of held-out phenomena depends only on the column, not on the model.
    if col=="functionality":
        phenom_set = pd.unique(hatecheck_df[col])
    else:
        # Cases without a target identity have no corresponding result file.
        phenom_set = pd.unique(hatecheck_df.dropna(subset=["target_ident"])[col])
    for model in ["davidson2017", "founta2018", "hateCheck+davidson", "hateCheck+founta"]:
        results_1out[col][model] = {}
        for func in phenom_set:
            loaded = pd.read_pickle(results_path/f"results_BERT_{model}_weighted_leaveOut_{func}.pkl")[["case_id", "preds", "split"]]
            # Map numeric predictions to label names in the preds column only.
            # A frame-wide replace would also rewrite case_id values 1 and 0,
            # silently dropping those cases from later merges on case_id.
            results_1out[col][model][func] = loaded.assign(
                preds=loaded["preds"].replace({1: 'hateful', 0: 'non-hateful'}))

### Overall accuracies

In [None]:
def get_accuracies(results, df, model, col, splits=["test"], return_preds=False):
    """Seen vs held-out accuracy over all leave-one-out runs of one model.

    Each entry of ``results[col][model]`` is one run, keyed by the category
    that was left out. Within the selected ``splits``, cases of the left-out
    category count as held out and all other cases as seen.

    Side effect: the stored result frames get their "preds"/"split" columns
    renamed in place to ``{key}_preds``/``{key}_splits``. Later cells (e.g.
    ``results_cluster_v_1out``) read those renamed columns.

    Args:
        results: nested dict ``results[col][model][key] -> DataFrame`` with the
            columns case_id, preds and split.
        df: test-suite frame with case_id, label_gold and ``col``.
        model: model name to evaluate.
        col: category column ("functionality" or "target_ident").
        splits: split names whose cases are evaluated.
        return_preds: if True, return the pooled held-out predictions and
            labels instead of the accuracies.

    Returns:
        ``(mean accuracy on seen cases across runs, accuracy on the pooled
        held-out cases)``, or ``(held-out predictions, held-out labels)`` as
        two lists when ``return_preds`` is True.
    """
    seen_accuracies = []
    held_out_preds, held_out_labels = [], []
    for left_out, run_df in results[col][model].items():
        pred_col = f"{left_out}_preds"
        split_col = f"{left_out}_splits"
        run_df.rename(columns={"preds": pred_col, "split": split_col}, inplace=True)
        merged = df.merge(run_df, how='left', on='case_id')

        in_splits = merged[split_col].isin(splits)
        wanted = ["label_gold", pred_col]
        held_out_rows = merged.loc[in_splits & (merged[col] == left_out), wanted]
        seen_rows = merged.loc[in_splits & (merged[col] != left_out), wanted]

        seen_accuracies.append(accuracy_score(seen_rows["label_gold"], seen_rows[pred_col]))
        held_out_preds.extend(held_out_rows[pred_col])
        held_out_labels.extend(held_out_rows["label_gold"])

    mean_seen_accuracy = np.array(seen_accuracies).mean()
    held_out_accuracy = accuracy_score(held_out_labels, held_out_preds)
    if return_preds:
        return held_out_preds, held_out_labels
    return mean_seen_accuracy, held_out_accuracy

In [None]:
# Each call returns (mean accuracy on seen categories across runs,
# accuracy on the pooled held-out test cases) for one model.
# First with functionalities held out ...
get_accuracies(results_1out, hatecheck_df, "davidson2017", "functionality")

In [None]:
get_accuracies(results_1out, hatecheck_df, "founta2018", "functionality")

In [None]:
get_accuracies(results_1out, hatecheck_df, "hateCheck+davidson", "functionality")

In [None]:
get_accuracies(results_1out, hatecheck_df, "hateCheck+founta", "functionality")

In [None]:
# ... then with target identities held out.
get_accuracies(results_1out, hatecheck_df, "davidson2017", "target_ident")

In [None]:
get_accuracies(results_1out, hatecheck_df, "founta2018", "target_ident")

In [None]:
get_accuracies(results_1out, hatecheck_df, "hateCheck+davidson", "target_ident")

In [None]:
get_accuracies(results_1out, hatecheck_df, "hateCheck+founta", "target_ident")

### Results by func/ident

In [None]:
def aggregate_results(results, df, model, col):
    """Collect per-category accuracies from all leave-one-out runs of a model.

    Side effect: the stored result frames get their "preds"/"split" columns
    renamed in place to ``{key}_preds``/``{key}_splits``; other cells rely on
    the renamed columns.

    Args:
        results: nested dict ``results[col][model][key] -> DataFrame`` with the
            columns case_id, preds and split; ``key`` is the left-out category.
        df: test-suite frame with case_id, label_gold and ``col``.
        model: model name to aggregate.
        col: category column ("functionality" or "target_ident").

    Returns:
        Tuple ``(seen_funcs, held_out_funcs, test_samples)``:
        ``seen_funcs`` stacks, for every run, the ``results_to_df`` rows of all
        categories other than the left-out one; ``held_out_funcs`` stacks the
        row of the left-out category of each run; ``test_samples`` maps each
        left-out category to the case_id/label_gold of its test-split cases.
    """
    # Collected in lists and concatenated once, instead of growing a DataFrame
    # with one concat per run.
    seen_parts = []
    held_out_parts = []
    test_samples = {}
    for k, v in results[col][model].items():
        v.rename(columns={"preds": f"{k}_preds", "split": f"{k}_splits"}, inplace=True)
        df_with_preds = df.merge(v, how='left', on='case_id')
        func_df = results_to_df(df_with_preds, col, f"{k}_preds", f"{k}_splits")
        seen_parts.append(func_df[func_df[col] != k])
        held_out_parts.append(func_df[func_df[col] == k])
        is_held_out_test = (df_with_preds[f"{k}_splits"] == "test") & (df_with_preds[col] == k)
        test_samples[k] = df_with_preds[is_held_out_test][["case_id", "label_gold"]]

    # pd.concat rejects an empty list; keep the empty-frame result for no runs.
    seen_funcs = pd.concat(seen_parts, axis=0) if seen_parts else pd.DataFrame()
    held_out_funcs = pd.concat(held_out_parts, axis=0) if held_out_parts else pd.DataFrame()
    return seen_funcs, held_out_funcs, test_samples

In [None]:
def display_agg(agg, col, df):
    """Mean and standard deviation per category, in test-suite order.

    Args:
        agg: stacked accuracy rows with the category column ``col``.
        col: category column to group by.
        df: frame whose ``col`` values (in order of first appearance) give the
            row order of the output.

    Returns:
        DataFrame indexed by category with a ('mean', 'std') pair of columns
        for every aggregated column of ``agg``.
    """
    summary = agg.groupby(col).agg(['mean', 'std'])
    category_order = pd.unique(df[col])
    return summary.reindex(category_order)

In [None]:
def get_test_results(test_samples, model, results):
    """Accuracy of one model on given sample sets, per set and pooled.

    Args:
        test_samples: dict mapping a key (e.g. a held-out category) to a frame
            with case_id and label_gold.
        model: model name; ``results[model]`` must hold ``{model}_preds``.
        results: dict of prediction frames keyed by model name.

    Returns:
        Tuple of a one-column DataFrame of accuracies indexed by the keys of
        ``test_samples``, and the accuracy over all sample sets pooled together.
    """
    pred_col = f"{model}_preds"
    accuracy_by_key = {}
    pooled = pd.DataFrame()
    for key, samples in test_samples.items():
        # Merge on the shared columns to attach the model's predictions.
        merged = results[model].merge(samples)
        pooled = pd.concat([pooled, merged], axis=0)
        accuracy_by_key[key] = (merged[pred_col] == merged["label_gold"]).mean()
    accuracy_table = pd.DataFrame.from_dict(accuracy_by_key, orient="index")
    pooled_accuracy = accuracy_score(pooled.label_gold, pooled[pred_col])
    return accuracy_table, pooled_accuracy

In [None]:
def get_preds_and_labels(test_samples, model, results):
    """Pool one model's predictions and the gold labels over sample sets.

    The sample sets are visited in the iteration order of ``test_samples``, so
    the output lines up with other helpers that walk the same keys in the same
    order (the notebook asserts this via ``labels1 == labels2``).

    Args:
        test_samples: dict mapping a key to a frame with case_id and label_gold.
        model: model name; ``results[model]`` must hold ``{model}_preds``.
        results: dict of prediction frames keyed by model name.

    Returns:
        Tuple ``(preds, labels)`` of two equally long lists.
    """
    preds = []
    labels = []
    for samples in test_samples.values():
        # Merge on the shared columns to attach the model's predictions.
        merged = results[model].merge(samples)
        preds.extend(merged[f"{model}_preds"])
        labels.extend(merged["label_gold"])
    return preds, labels

In [None]:
# --- davidson2017, one functionality left out per run ---
# test_samples maps each left-out functionality to its test-split cases.
seen_funcs_davidson, held_out_funcs_davidson, test_samples = aggregate_results(results_1out, hatecheck_df, "davidson2017", "functionality")

In [None]:
display_agg(seen_funcs_davidson, "functionality", hatecheck_df)

In [None]:
held_out_funcs_davidson

In [None]:
# Sign test: no-finetune model vs leave-one-out model on the held-out test cases.
# The assert checks that both helpers produced the samples in the same order.
preds1, labels1 = get_preds_and_labels(test_samples, "davidson2017_nofinetune", results)
preds2, labels2 = get_accuracies(results_1out, hatecheck_df, "davidson2017", "functionality", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
# --- hateCheck+davidson, one functionality left out per run ---
# Overwrites test_samples with the sample sets of this model's runs.
seen_funcs_hateCheckPlusdavidson, held_out_funcs_hateCheckPlusdavidson, test_samples = aggregate_results(results_1out, hatecheck_df, "hateCheck+davidson", "functionality")

In [None]:
display_agg(seen_funcs_hateCheckPlusdavidson, "functionality", hatecheck_df)

In [None]:
held_out_funcs_hateCheckPlusdavidson

In [None]:
# Accuracy of the no-finetune model on the same held-out test cases.
get_test_results(test_samples, "davidson2017_nofinetune", results)

In [None]:
preds1, labels1 = get_preds_and_labels(test_samples, "davidson2017_nofinetune", results)
preds2, labels2 = get_accuracies(results_1out, hatecheck_df, "hateCheck+davidson", "functionality", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
# --- founta2018, one functionality left out per run ---
seen_funcs_founta, held_out_funcs_founta, test_samples = aggregate_results(results_1out, hatecheck_df, "founta2018", "functionality")

In [None]:
display_agg(seen_funcs_founta, "functionality", hatecheck_df)

In [None]:
held_out_funcs_founta

In [None]:
get_test_results(test_samples, "founta2018_nofinetune", results)

In [None]:
preds1, labels1 = get_preds_and_labels(test_samples, "founta2018_nofinetune", results)
preds2, labels2 = get_accuracies(results_1out, hatecheck_df, "founta2018", "functionality", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
# --- hateCheck+founta, one functionality left out per run ---
seen_funcs_hateCheckPlusfounta, held_out_funcs_hateCheckPlusfounta, test_samples = aggregate_results(results_1out, hatecheck_df, "hateCheck+founta", "functionality")

In [None]:
display_agg(seen_funcs_hateCheckPlusfounta, "functionality", hatecheck_df)

In [None]:
held_out_funcs_hateCheckPlusfounta

In [None]:
preds1, labels1 = get_preds_and_labels(test_samples, "founta2018_nofinetune", results)
preds2, labels2 = get_accuracies(results_1out, hatecheck_df, "hateCheck+founta", "functionality", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
# Sign test between the two leave-one-out models: founta2018 vs hateCheck+founta.
preds1, labels1 = get_accuracies(results_1out, hatecheck_df, "founta2018", "functionality", return_preds=True)
preds2, labels2 = get_accuracies(results_1out, hatecheck_df, "hateCheck+founta", "functionality", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
# Sign test between the two leave-one-out models: davidson2017 vs hateCheck+davidson.
preds1, labels1 = get_accuracies(results_1out, hatecheck_df, "davidson2017", "functionality", return_preds=True)
preds2, labels2 = get_accuracies(results_1out, hatecheck_df, "hateCheck+davidson", "functionality", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
# --- davidson2017, one target identity left out per run ---
# Cases without a target identity are dropped before aggregating.
seen_idents_davidson, held_out_idents_davidson, test_samples = aggregate_results(results_1out, hatecheck_df.dropna(subset=["target_ident"]), "davidson2017", "target_ident")

In [None]:
display_agg(seen_idents_davidson, "target_ident", hatecheck_df.dropna(subset=["target_ident"]))

In [None]:
held_out_idents_davidson

In [None]:
# Sign test: no-finetune model vs leave-one-out model on the held-out test cases.
# The assert checks that both helpers produced the samples in the same order.
preds1, labels1 = get_preds_and_labels(test_samples, "davidson2017_nofinetune", results)
preds2, labels2 = get_accuracies(results_1out, hatecheck_df, "davidson2017", "target_ident", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
# --- hateCheck+davidson, one target identity left out per run ---
seen_idents_hateCheckPlusdavidson, held_out_idents_hateCheckPlusdavidson, test_samples = aggregate_results(results_1out, hatecheck_df.dropna(subset=["target_ident"]), "hateCheck+davidson", "target_ident")

In [None]:
display_agg(seen_idents_hateCheckPlusdavidson, "target_ident", hatecheck_df.dropna(subset=["target_ident"]))

In [None]:
held_out_idents_hateCheckPlusdavidson

In [None]:
# Accuracy of the no-finetune model on the same held-out test cases.
get_test_results(test_samples, "davidson2017_nofinetune", results)

In [None]:
preds1, labels1 = get_preds_and_labels(test_samples, "davidson2017_nofinetune", results)
preds2, labels2 = get_accuracies(results_1out, hatecheck_df, "hateCheck+davidson", "target_ident", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
# --- founta2018, one target identity left out per run ---
seen_idents_founta, held_out_idents_founta, test_samples = aggregate_results(results_1out, hatecheck_df.dropna(subset=["target_ident"]), "founta2018", "target_ident")

In [None]:
display_agg(seen_idents_founta, "target_ident", hatecheck_df.dropna(subset=["target_ident"]))

In [None]:
held_out_idents_founta

In [None]:
get_test_results(test_samples, "founta2018_nofinetune", results)

In [None]:
preds1, labels1 = get_preds_and_labels(test_samples, "founta2018_nofinetune", results)
preds2, labels2 = get_accuracies(results_1out, hatecheck_df, "founta2018", "target_ident", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
# --- hateCheck+founta, one target identity left out per run ---
seen_idents_hateCheckPlusfounta, held_out_idents_hateCheckPlusfounta, test_samples = aggregate_results(results_1out, hatecheck_df.dropna(subset=["target_ident"]), "hateCheck+founta", "target_ident")

In [None]:
display_agg(seen_idents_hateCheckPlusfounta, "target_ident", hatecheck_df.dropna(subset=["target_ident"]))

In [None]:
held_out_idents_hateCheckPlusfounta

In [None]:
preds1, labels1 = get_preds_and_labels(test_samples, "founta2018_nofinetune", results)
preds2, labels2 = get_accuracies(results_1out, hatecheck_df, "hateCheck+founta", "target_ident", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
# Sign test between the two leave-one-out models: founta2018 vs hateCheck+founta.
preds1, labels1 = get_accuracies(results_1out, hatecheck_df, "founta2018", "target_ident", return_preds=True)
preds2, labels2 = get_accuracies(results_1out, hatecheck_df, "hateCheck+founta", "target_ident", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
# Sign test between the two leave-one-out models: davidson2017 vs hateCheck+davidson.
preds1, labels1 = get_accuracies(results_1out, hatecheck_df, "davidson2017", "target_ident", return_preds=True)
preds2, labels2 = get_accuracies(results_1out, hatecheck_df, "hateCheck+davidson", "target_ident", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

## Cluster results

In [None]:
# Group functionalities into clusters by the part of their name before the
# first underscore: clusters[prefix] -> list of functionality names.
clusters = {}
for func in pd.unique(hatecheck_df.functionality):
    clusters.setdefault(func.split("_")[0], []).append(func)

In [None]:
clusters

In [None]:
# Load the leave-cluster-out predictions as results_clusterOut[model][cluster].
# results_path still points at the leave-one-out results directory here.
results_clusterOut = {}
for model in ["davidson2017", "founta2018", "hateCheck+davidson", "hateCheck+founta"]:
    results_clusterOut[model] = {}
    for cluster in clusters:
        loaded = pd.read_pickle(results_path/f"results_BERT_{model}_weighted_leaveOut_{cluster}.pkl")[["case_id", "preds", "split"]]
        # Map numeric predictions to label names in the preds column only.
        # A frame-wide replace would also rewrite case_id values 1 and 0,
        # silently dropping those cases from later merges on case_id.
        results_clusterOut[model][cluster] = loaded.assign(
            preds=loaded["preds"].replace({1: 'hateful', 0: 'non-hateful'}))

### Overall accuracies

In [None]:
def get_accuracies_clusterOut(results, df, model, col, splits=["test"], return_preds=False):
    """Seen vs held-out accuracy over all leave-cluster-out runs of one model.

    Each entry of ``results[model]`` is one run, keyed by the cluster that was
    left out. Within the selected ``splits``, cases whose ``col`` value belongs
    to that cluster (looked up in the notebook-level ``clusters`` dict) count
    as held out and all other cases as seen.

    Side effect: the stored result frames get their "preds"/"split" columns
    renamed in place to ``{cluster}_preds``/``{cluster}_splits``; later cells
    read those renamed columns.

    Args:
        results: nested dict ``results[model][cluster] -> DataFrame`` with the
            columns case_id, preds and split.
        df: test-suite frame with case_id, label_gold and ``col``.
        model: model name to evaluate.
        col: column matched against the members of each cluster.
        splits: split names whose cases are evaluated.
        return_preds: if True, return the pooled held-out predictions and
            labels instead of the accuracies.

    Returns:
        ``(mean accuracy on seen cases across runs, accuracy on the pooled
        held-out cases)``, or ``(held-out predictions, held-out labels)`` as
        two lists when ``return_preds`` is True.
    """
    seen_accuracies = []
    held_out_preds, held_out_labels = [], []
    for cluster, run_df in results[model].items():
        pred_col = f"{cluster}_preds"
        split_col = f"{cluster}_splits"
        run_df.rename(columns={"preds": pred_col, "split": split_col}, inplace=True)
        merged = df.merge(run_df, how='left', on='case_id')

        in_splits = merged[split_col].isin(splits)
        wanted = ["label_gold", pred_col]
        held_out_rows = merged.loc[in_splits & (merged[col].isin(clusters[cluster])), wanted]
        seen_rows = merged.loc[in_splits & ~(merged[col].isin(clusters[cluster])), wanted]

        seen_accuracies.append(accuracy_score(seen_rows["label_gold"], seen_rows[pred_col]))
        held_out_preds.extend(held_out_rows[pred_col])
        held_out_labels.extend(held_out_rows["label_gold"])

    mean_seen_accuracy = np.array(seen_accuracies).mean()
    held_out_accuracy = accuracy_score(held_out_labels, held_out_preds)
    if return_preds:
        return held_out_preds, held_out_labels
    return mean_seen_accuracy, held_out_accuracy

In [None]:
# Each call returns (mean accuracy on seen functionalities across runs,
# accuracy on the pooled held-out-cluster test cases) for one model.
get_accuracies_clusterOut(results_clusterOut, hatecheck_df, "davidson2017", "functionality")

In [None]:
get_accuracies_clusterOut(results_clusterOut, hatecheck_df, "founta2018", "functionality")

In [None]:
get_accuracies_clusterOut(results_clusterOut, hatecheck_df, "hateCheck+davidson", "functionality")

In [None]:
get_accuracies_clusterOut(results_clusterOut, hatecheck_df, "hateCheck+founta", "functionality")

### Results by cluster and func

In [None]:
def aggregate_cluster_results(results, df, model, col):
    """Collect per-category accuracies from all leave-cluster-out runs of a model.

    Cluster membership is looked up in the notebook-level ``clusters`` dict.

    Side effect: the stored result frames get their "preds"/"split" columns
    renamed in place to ``{cluster}_preds``/``{cluster}_splits``; other cells
    rely on the renamed columns.

    Args:
        results: nested dict ``results[model][cluster] -> DataFrame`` with the
            columns case_id, preds and split.
        df: test-suite frame with case_id, label_gold and ``col``.
        model: model name to aggregate.
        col: category column whose values are matched against cluster members.

    Returns:
        Tuple ``(seen_funcs, held_out_funcs, test_samples_cluster,
        test_samples)``: the first two stack, for every run, the
        ``results_to_df`` rows of categories outside / inside the left-out
        cluster; ``test_samples_cluster`` maps each cluster, and
        ``test_samples`` each member of a cluster, to the case_id/label_gold
        of its test-split cases in the run that left the cluster out.
    """
    # Collected in lists and concatenated once, instead of growing a DataFrame
    # with one concat per run.
    seen_parts = []
    held_out_parts = []
    test_samples_cluster = {}
    test_samples = {}
    for k, v in results[model].items():
        v.rename(columns={"preds": f"{k}_preds", "split": f"{k}_splits"}, inplace=True)
        df_with_preds = df.merge(v, how='left', on='case_id')
        func_df = results_to_df(df_with_preds, col, f"{k}_preds", f"{k}_splits")

        in_cluster = func_df[col].isin(clusters[k])
        seen_parts.append(func_df[~in_cluster])
        held_out_parts.append(func_df[in_cluster])

        is_test = df_with_preds[f"{k}_splits"] == "test"
        test_samples_cluster[k] = df_with_preds[is_test & (df_with_preds[col].isin(clusters[k]))][["case_id", "label_gold"]]
        for func in clusters[k]:
            test_samples[func] = df_with_preds[is_test & (df_with_preds[col] == func)][["case_id", "label_gold"]]

    # pd.concat rejects an empty list; keep the empty-frame result for no runs.
    seen_funcs = pd.concat(seen_parts, axis=0) if seen_parts else pd.DataFrame()
    held_out_funcs = pd.concat(held_out_parts, axis=0) if held_out_parts else pd.DataFrame()
    return seen_funcs, held_out_funcs, test_samples_cluster, test_samples

In [None]:
# --- davidson2017, one functionality cluster left out per run ---
# test_samples_cluster is keyed by cluster, test_samples by functionality.
seen_clusters_davidson, held_out_clusters_davidson, test_samples_cluster, test_samples = aggregate_cluster_results(results_clusterOut, hatecheck_df, "davidson2017", "functionality")

In [None]:
display_agg(seen_clusters_davidson, "functionality", hatecheck_df)

In [None]:
held_out_clusters_davidson

In [None]:
# Accuracy of the no-finetune model on the held-out test cases, per functionality ...
get_test_results(test_samples, "davidson2017_nofinetune", results)

In [None]:
# ... and per cluster.
get_test_results(test_samples_cluster, "davidson2017_nofinetune", results)

In [None]:
# Sign test: no-finetune model vs leave-cluster-out model on the held-out test cases.
# The assert checks that both helpers produced the samples in the same order.
preds1, labels1 = get_preds_and_labels(test_samples, "davidson2017_nofinetune", results)
preds2, labels2 = get_accuracies_clusterOut(results_clusterOut, hatecheck_df, "davidson2017", "functionality", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
preds1, labels1 = get_preds_and_labels(test_samples, "davidson2017_nofinetune", results)
preds2, labels2 = get_accuracies_clusterOut(results_clusterOut, hatecheck_df, "hateCheck+davidson", "functionality", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
# Sign test between the two leave-cluster-out models.
preds1, labels1 = get_accuracies_clusterOut(results_clusterOut, hatecheck_df, "davidson2017", "functionality", return_preds=True)
preds2, labels2 = get_accuracies_clusterOut(results_clusterOut, hatecheck_df, "hateCheck+davidson", "functionality", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
# Accuracy of the leave-cluster-out model on each held-out cluster. This reads
# the {k}_preds columns created by the in-place renames in the helpers above.
results_dic = {}
for k, v in test_samples_cluster.items():
        df = results_clusterOut["davidson2017"][k].merge(v)
        results_dic[k] = (df[f"{k}_preds"] == df["label_gold"]).mean()
pd.DataFrame.from_dict(results_dic, orient="index")

In [None]:
# Still the leave-one-out table from the earlier section; the next cell overwrites it.
held_out_idents_davidson

In [None]:
# NOTE(review): col is "target_ident" but `clusters` holds functionality names,
# so the isin(clusters[k]) filters presumably match no row and every identity
# counts as seen — confirm this is intended. Also overwrites seen_idents_davidson,
# held_out_idents_davidson, test_samples_cluster and test_samples.
seen_idents_davidson, held_out_idents_davidson, test_samples_cluster, test_samples = aggregate_cluster_results(results_clusterOut, hatecheck_df.dropna(subset=["target_ident"]), "davidson2017", "target_ident")

In [None]:
display_agg(seen_idents_davidson, "target_ident", hatecheck_df.dropna(subset=["target_ident"]))

In [None]:
# --- founta2018, one functionality cluster left out per run ---
# test_samples_cluster is keyed by cluster, test_samples by functionality.
seen_clusters_founta, held_out_clusters_founta, test_samples_cluster, test_samples = aggregate_cluster_results(results_clusterOut, hatecheck_df, "founta2018", "functionality")

In [None]:
display_agg(seen_clusters_founta, "functionality", hatecheck_df)

In [None]:
held_out_clusters_founta

In [None]:
# Accuracy of the no-finetune model on the held-out test cases, per functionality ...
get_test_results(test_samples, "founta2018_nofinetune", results)

In [None]:
# ... and per cluster.
get_test_results(test_samples_cluster, "founta2018_nofinetune", results)

In [None]:
# Sign test: no-finetune model vs leave-cluster-out model on the held-out test cases.
# The assert checks that both helpers produced the samples in the same order.
preds1, labels1 = get_preds_and_labels(test_samples, "founta2018_nofinetune", results)
preds2, labels2 = get_accuracies_clusterOut(results_clusterOut, hatecheck_df, "founta2018", "functionality", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
preds1, labels1 = get_preds_and_labels(test_samples, "founta2018_nofinetune", results)
preds2, labels2 = get_accuracies_clusterOut(results_clusterOut, hatecheck_df, "hateCheck+founta", "functionality", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
# Sign test between the two leave-cluster-out models.
preds1, labels1 = get_accuracies_clusterOut(results_clusterOut, hatecheck_df, "founta2018", "functionality", return_preds=True)
preds2, labels2 = get_accuracies_clusterOut(results_clusterOut, hatecheck_df, "hateCheck+founta", "functionality", return_preds=True)
assert labels1==labels2
print(get_significance(preds1, preds2, labels1))

In [None]:
# Accuracy of the leave-cluster-out model on each held-out cluster. This reads
# the {k}_preds columns created by the in-place renames in the helpers above.
results_dic = {}
for k, v in test_samples_cluster.items():
        df = results_clusterOut["founta2018"][k].merge(v)
        results_dic[k] = (df[f"{k}_preds"] == df["label_gold"]).mean()
pd.DataFrame.from_dict(results_dic, orient="index")

In [None]:
# NOTE(review): col is "target_ident" but `clusters` holds functionality names,
# so the isin(clusters[k]) filters presumably match no row and every identity
# counts as seen — confirm this is intended. Also overwrites seen_idents_founta,
# held_out_idents_founta, test_samples_cluster and test_samples.
seen_idents_founta, held_out_idents_founta, test_samples_cluster, test_samples = aggregate_cluster_results(results_clusterOut, hatecheck_df.dropna(subset=["target_ident"]), "founta2018", "target_ident")

In [None]:
display_agg(seen_idents_founta, "target_ident", hatecheck_df.dropna(subset=["target_ident"]))

## Leave1out vs leaveClusterOut

### Overall accuracies

In [None]:
# Same overall accuracies as before, but evaluated on the validation and test
# splits together: first leave-cluster-out, then leave-one-out.
get_accuracies_clusterOut(results_clusterOut, hatecheck_df, "davidson2017", "functionality", splits=["val", "test"])

In [None]:
get_accuracies_clusterOut(results_clusterOut, hatecheck_df, "founta2018", "functionality", splits=["val", "test"])

In [None]:
get_accuracies(results_1out, hatecheck_df, "davidson2017", "functionality", splits=["val", "test"])

In [None]:
get_accuracies(results_1out, hatecheck_df, "founta2018", "functionality", splits=["val", "test"])

### Results by cluster and func

In [None]:
def results_cluster_v_1out(model):
    """Compare leave-one-out with leave-cluster-out accuracy for one model.

    Reads the notebook-level ``clusters``, ``results_1out``,
    ``results_clusterOut`` and ``hatecheck_df``. For every functionality, the
    predictions of the run that left out that functionality are compared with
    those of the run that left out its whole cluster, on the cases of that
    functionality (all splits present in both result frames).

    The "preds"/"split" columns are renamed on copies here, so the function
    does not depend on another helper having renamed the stored frames in
    place beforehand. Frames that are already renamed pass through unchanged.

    Args:
        model: model name present in both result dicts.

    Returns:
        Tuple of two one-column DataFrames indexed by functionality and
        cluster name: accuracies of the leave-one-out runs, and accuracies of
        the leave-cluster-out runs.
    """
    leave1outResults = {}
    leave1clusterResults = {}
    gold = hatecheck_df[["case_id", "label_gold", "functionality"]]
    for cluster, funcs in clusters.items():
        cluster_run = results_clusterOut[model][cluster].rename(
            columns={"preds": f"{cluster}_preds", "split": f"{cluster}_splits"})
        per_func_frames = []
        for func in funcs:
            func_run = results_1out["functionality"][model][func].rename(
                columns={"preds": f"{func}_preds", "split": f"{func}_splits"})
            preds_and_labels = func_run.merge(cluster_run).merge(gold)
            preds_and_label_by_func = preds_and_labels[preds_and_labels.functionality == func]
            # Use one shared column name so the frames of a cluster can be pooled.
            per_func_frames.append(preds_and_label_by_func.rename(columns={f"{func}_preds": "1out_preds"}))
            leave1outResults[func] = (preds_and_label_by_func[f"{func}_preds"] == preds_and_label_by_func["label_gold"]).mean()
            leave1clusterResults[func] = (preds_and_label_by_func[f"{cluster}_preds"] == preds_and_label_by_func["label_gold"]).mean()
        preds_and_label_by_cluster = pd.concat(per_func_frames, axis=0) if per_func_frames else pd.DataFrame()
        leave1outResults[cluster] = (preds_and_label_by_cluster["1out_preds"] == preds_and_label_by_cluster["label_gold"]).mean()
        leave1clusterResults[cluster] = (preds_and_label_by_cluster[f"{cluster}_preds"] == preds_and_label_by_cluster["label_gold"]).mean()
    return pd.DataFrame.from_dict(leave1outResults, orient="index"), pd.DataFrame.from_dict(leave1clusterResults, orient="index")

In [None]:
# Both returned tables are indexed by functionality names and cluster names;
# the cells below show the functionality rows and the cluster rows separately.
leave1out_davidson, leave1cluster_davidson = results_cluster_v_1out("davidson2017")

In [None]:
funcs = pd.unique(hatecheck_df.functionality)

In [None]:
# davidson2017, leave-one-out runs: per functionality, then per cluster.
leave1out_davidson[leave1out_davidson.index.isin(funcs)]

In [None]:
leave1out_davidson[leave1out_davidson.index.isin(clusters.keys())]

In [None]:
# davidson2017, leave-cluster-out runs: per functionality, then per cluster.
leave1cluster_davidson[leave1cluster_davidson.index.isin(funcs)]

In [None]:
leave1cluster_davidson[leave1cluster_davidson.index.isin(clusters.keys())]

In [None]:
leave1out_founta, leave1cluster_founta = results_cluster_v_1out("founta2018")

In [None]:
# founta2018, leave-one-out runs: per functionality, then per cluster.
leave1out_founta[leave1out_founta.index.isin(funcs)]

In [None]:
leave1out_founta[leave1out_founta.index.isin(clusters.keys())]

In [None]:
# founta2018, leave-cluster-out runs: per functionality, then per cluster.
leave1cluster_founta[leave1cluster_founta.index.isin(funcs)]

In [None]:
leave1cluster_founta[leave1cluster_founta.index.isin(clusters.keys())]