In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from datasets import load_dataset, concatenate_datasets
import matplotlib.transforms as transforms
from matplotlib import colormaps
import numpy as np
import os
import krippendorff
import seaborn as sns
from pathlib import Path
from ast import literal_eval
from utils.util import radar_factory
from scipy.stats import pearsonr
from tqdm.auto import tqdm
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.preprocessing import normalize
from statsmodels.formula.api import mixedlm
from utils.util import initialize_seeds
import json

In [None]:
# Lookup table persona name -> persona category, used below to label result rows.
# Opened via a context manager so the file handle is closed deterministically.
with open("./data/persona_cat.json", "r") as persona_cat_file:
    persona_cat_dict = json.load(persona_cat_file)

In [None]:
n_cats = len(np.unique(list(persona_cat_dict.values())))

In [None]:
markdict={cat: "o" if cat != "control" else "x" for cat in np.unique(list(persona_cat_dict.values()))}

In [None]:
markdict["control"] = "x"

In [None]:
results_path= Path("./results/zero/")
control_results = Path("./results/control/zero/")

In [None]:
models = os.listdir(results_path)
results = {model: pd.read_csv(results_path/model/"off_scores.csv", index_col=0) for model in models}

In [None]:
rename_model = {
    "gpt-4-0125-preview": "GPT-4",
    "gpt-3.5-turbo-0125": "GPT-3.5",
    "Mixtral-8x7B-Instruct-v0.1": "Mixtral",
    "zephyr-7b-beta": "Zephyr",
    "Mistral-7B-Instruct-v0.2": "Mistral-inst",
    "gemma-7b-it": "Gemma-7b-inst",
    "gemma-2b-it": "Gemma-2b-inst"
}

In [None]:
def _load_results_frame(base_dir, result_csv, aggregate, model_names):
    """Read `result_csv` for every model under `base_dir` into one frame.

    Args:
        base_dir: Directory containing one sub-directory per model.
        result_csv: File name of the CSV inside each model directory.
        aggregate: For list-valued "attitude" files, collapse each persona's
            concatenated values to their mean when True.
        model_names: Names of the model sub-directories to read.

    Returns:
        DataFrame indexed by persona with one column per model.
    """
    # Files with "attitude" in the name but not "scores" hold list-valued cells.
    holds_lists = "attitude" in result_csv and "scores" not in result_csv

    per_model = {}
    for model in model_names:
        frame = pd.read_csv(base_dir/model/result_csv, index_col=0)
        if "attitude_scores" in result_csv:
            # Only the first five columns are kept for score files.
            frame = frame.iloc[:, :5]
        elif "attitude" in result_csv:
            # Cells are stored as space-separated list text; inserting commas
            # turns them into Python literals that literal_eval can parse.
            for column in frame.columns:
                frame[column] = frame[column].str.replace(" ", ",").apply(literal_eval)

        for persona in frame.index.tolist():
            if holds_lists:
                values = np.concatenate(frame.loc[persona].values.tolist(), axis=0)
                if aggregate:
                    values = values.mean()
            else:
                values = np.array(frame.loc[persona].values.tolist())
            per_model.setdefault(model, {})[persona] = values

    return pd.DataFrame.from_dict(per_model)


def get_results_df(result_csv, aggregate=True):
    """Load one result file for all models, for personas and control personas.

    Reads `result_csv` from every model directory under the notebook-level
    `results_path` and `control_results`, tags persona rows with their
    category from `persona_cat_dict` and control rows with "control", and
    stacks both sets of rows.

    Args:
        result_csv: CSV file name inside each model directory.
        aggregate: Only affects list-valued "attitude" files; when True each
            persona's values are reduced to a single mean.

    Returns:
        DataFrame indexed by persona, one column per model (renamed through
        `rename_model`) plus a trailing "persona_cat" column.

    Raises:
        KeyError: If a non-control persona is missing from `persona_cat_dict`.
    """
    # The control directory is read with the model list of the persona
    # directory, so both must contain the same model sub-directories.
    model_names = os.listdir(results_path)

    results_df = _load_results_frame(results_path, result_csv, aggregate, model_names)
    results_df["persona_cat"] = [persona_cat_dict[persona] for persona in results_df.index]

    results_df_control = _load_results_frame(control_results, result_csv, aggregate, model_names)
    results_df_control["persona_cat"] = "control"

    return pd.concat([results_df, results_df_control], axis=0).rename(columns=rename_model)

In [None]:
def get_correlations(df):
    rho = df.corr()
    pval = df.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)
    p = pval.map(lambda x: ''.join(['*' for t in [.05, .01, .001] if x<=t]))
    return (rho.round(2).astype(str) + p)

In [None]:
def average_correlation(answers):
    """Mean pairwise Kendall correlation between the columns of `answers`.

    `answers` is wrapped in a DataFrame, its columns are correlated pairwise,
    undefined (NaN) correlations are counted as 1, and the entries strictly
    above the diagonal are averaged.
    """
    tau_matrix = pd.DataFrame(answers).corr(method="kendall")
    tau_values = tau_matrix.fillna(1).to_numpy()
    # k=1 skips the diagonal, so each pair of columns is counted exactly once.
    upper_rows, upper_cols = np.triu_indices(tau_values.shape[0], k=1)
    return tau_values[upper_rows, upper_cols].mean()

In [None]:
all_hits_stacked = []

In [None]:
all_refusals_stacked = []

In [None]:
performance_std_dfs = []

In [None]:
refusal_std_dfs = []

## Attitudes

### Intra-persona consistency

In [None]:
att_answers = {model: pd.read_csv(results_path/model/"attitude_answers.csv", index_col=0) for model in models}

In [None]:
# Per model and persona, compute (a) the row-wise mean answer vector and
# (b) one scalar describing how much the answers vary.
# NOTE: despite its name, `intra_alphas` currently stores an average standard
# deviation; the Krippendorff-alpha and Kendall variants are kept below as
# commented-out alternatives.
# NOTE: this cell parses `att_answers` in place, so it cannot be re-run without
# first re-running the cell that reads the CSVs.
intra_alphas = {}
all_answers_dic = {}
for model, df in tqdm(att_answers.items()):
    for column in df.columns:
        # Cells hold list text separated by spaces; add commas so literal_eval can parse it.
        att_answers[model][column] = att_answers[model][column].str.replace(" ", ",")
        att_answers[model][column] = att_answers[model][column].apply(literal_eval)
    for persona in tqdm(att_answers[model].index.tolist()):
        # Join the parsed lists of all columns into one 2-D array.
        # presumably rows are statements and columns are paraphrases - TODO confirm
        all_answers =  np.concatenate(att_answers[model].loc[persona].values.tolist(),axis=0)
        all_answers_dic.setdefault(model, {})[persona] = all_answers.mean(1)
        # intra_alphas.setdefault(model, {})[persona] = krippendorff.alpha(reliability_data=all_answers.T, level_of_measurement="ordinal", value_domain=list(range(1,6)))
        # Std. dev. along axis 1 of `all_answers`, averaged over its rows.
        intra_alphas.setdefault(model, {})[persona] = all_answers.T.std(0).mean()
        # intra_alphas.setdefault(model, {})[persona] = average_correlation(all_answers)

In [None]:
intra_df = pd.DataFrame.from_dict(intra_alphas)

In [None]:
intra_df.corr(method="kendall")

In [None]:
intra_df["avg"] = intra_df.mean(1)

In [None]:
intra_df.sort_values("avg")

In [None]:
intra_df["persona_cat"] = [persona_cat_dict[persona] for persona in intra_df.index]

In [None]:
att_answers = {model: pd.read_csv(control_results/model/"attitude_answers.csv", index_col=0) for model in models}

In [None]:
# Same computation as for the personas above, applied to the control personas
# that were just loaded into `att_answers`.
# NOTE: `intra_alphas` is reset here, so the persona values must already have
# been copied out before this cell runs. The name is historical: it stores an
# average standard deviation, not an alpha.
intra_alphas = {}
all_answers_control_dic = {}
for model, df in att_answers.items():
    for column in df.columns:
        # Cells hold list text separated by spaces; add commas so literal_eval can parse it.
        att_answers[model][column] = att_answers[model][column].str.replace(" ", ",")
        att_answers[model][column] = att_answers[model][column].apply(literal_eval)
    for persona in att_answers[model].index.tolist():
        # Join the parsed lists of all columns into one 2-D array.
        # presumably rows are statements and columns are paraphrases - TODO confirm
        all_answers=  np.concatenate(att_answers[model].loc[persona].values.tolist(),axis=0)
        all_answers_control_dic.setdefault(model, {})[persona] = all_answers.mean(1)
        # intra_alphas.setdefault(model, {})[persona] = krippendorff.alpha(reliability_data=all_answers.T, level_of_measurement="ordinal", value_domain=list(range(1,6)))
        # Std. dev. along axis 1 of `all_answers`, averaged over its rows.
        intra_alphas.setdefault(model, {})[persona] = all_answers.T.std(0).mean()
        # intra_alphas.setdefault(model, {})[persona] = average_correlation(all_answers)


In [None]:
intra_df_control = pd.DataFrame.from_dict(intra_alphas)

In [None]:
intra_df_control.corr(method="kendall")

In [None]:
intra_df_control["avg"] = intra_df_control.mean(1)

In [None]:
intra_df_control.sort_values("avg")

In [None]:
intra_df_control["persona_cat"] = "control"

In [None]:
intra_df = pd.concat([intra_df, intra_df_control], axis=0)

In [None]:
intra_df = intra_df.rename(columns=rename_model)

In [None]:
intra_df = intra_df[list(rename_model.values()) + ["persona_cat"]]

In [None]:
intra_df_stacked = intra_df[intra_df.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Avg. Std. Dev.'})

In [None]:
intra_df_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in intra_df_stacked["index"]]

In [None]:
intra_df.sort_values("GPT-4")

In [None]:
intra_df_stacked

In [None]:
# Strip plot of the per-persona "Avg. Std. Dev." for every model. Regular
# personas are coloured by category; the "control" and "empty" rows are drawn
# as separate layers with their own markers.
no_control = intra_df_stacked[~intra_df_stacked["persona_cat"].isin(["control", "empty"])]
only_control =  intra_df_stacked[intra_df_stacked["persona_cat"] == "control"]
only_empty = intra_df_stacked[intra_df_stacked["persona_cat"] == "empty"]
# ax, fix = plt.subplot(1, 1, figsize=(4,4))
ax1 = sns.stripplot(data=no_control, x="Model", y="Avg. Std. Dev.", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
# Horizontal shift of `p` points (1/72 inch each), applied on top of the data
# transform so the extra layers sit beside the main strip instead of over it.
offset = lambda p: transforms.ScaledTranslation(p/72.,0, plt.gcf().dpi_scale_trans)
trans = plt.gca().transData
ax2 = sns.stripplot(data=only_control, x="Model", y="Avg. Std. Dev.", color="black", marker="X", transform=trans+offset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Avg. Std. Dev.", color="red", marker="*", size=10, transform=trans+offset(-15))
plt.xticks(rotation=45)
# The two extra layers are plotted without hue, so their legend entries are
# built by hand and appended to the category entries.
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
control_label = "control"
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
empty_label = "empty"
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
ax1.set_xlabel("")
# Place the legend outside the axes, to the right of the plot.
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
ax1.get_figure().savefig("../persona-biases-paper/media/paraphrase_variation.pdf", bbox_inches="tight")

In [None]:
intra_df.boxplot(by="persona_cat", rot=90, figsize=(15,15))

In [None]:
intra_df.mean(0, numeric_only=True).sort_values()

LLM leaderboard scores, for comparison with the ranking above:

gemma-2b-it                   42.75

gemma-7b-it                   53.56

zephyr-7b-beta                61.95

Mistral-7B-Instruct-v0.2      65.71

Mixtral-8x7B-Instruct-v0.1    72.62

In [None]:
intra_df.corr(numeric_only=True, method="kendall")

In [None]:
intra_df["avg"] = intra_df.mean(1, numeric_only=True)

In [None]:
intra_df.sort_values("avg")

In [None]:
# `intra_df` also holds the string column "persona_cat"; restrict the reduction
# to numeric columns, as the neighbouring mean/corr cells already do.
intra_df.idxmax(numeric_only=True)

In [None]:
# `intra_df` also holds the string column "persona_cat"; restrict the reduction
# to numeric columns, as the neighbouring mean/corr cells already do.
intra_df.idxmin(numeric_only=True)

In [None]:
# Rank the rows of `intra_df` separately for every column except the last two:
# position 1 is the row with the largest value in that column.
pos_df = pd.DataFrame(index=intra_df.index)
for col in intra_df.columns[:-2]:
    # Named `ranked_index` rather than `sorted` so the builtin is not shadowed
    # for the rest of the kernel session.
    ranked_index = intra_df.sort_values(col, ascending=False).index
    temp_df = pd.DataFrame({col: list(range(1, len(intra_df)+1))}, index=ranked_index)
    pos_df = pos_df.merge(temp_df, left_index=True, right_index=True)

In [None]:
pos_df["avg"] = pos_df.mean(1)

In [None]:
pos_df.sort_values("avg")

### Intra-persona score consistency

In [None]:
# Attitude questionnaire definition. `answers_to_scores` reads, for every entry,
# the keys "attitude", "statements" and "reverse".
# Opened via a context manager so the file handle is closed deterministically.
with open("./data/attitude_questions.json", "r") as attitude_questions_file:
    data = json.load(attitude_questions_file)

In [None]:
all_answer_paraphrases = get_results_df("attitude_answers.csv", aggregate=False)

In [None]:
def answers_to_scores(answers, attitude_specs=None):
    """Standard deviation of each attitude score across the answer sets in `answers`.

    Every column of `answers` is treated as one complete answer set. For each
    answer set the statements are split into consecutive runs, one run per
    entry of `attitude_specs`; reverse-keyed statements are flipped with
    `6 - value`, and the run is averaged into one attitude score. The scores
    of each attitude are then reduced to their standard deviation over all
    answer sets.

    Args:
        answers: 2-D numpy array; iterating over `answers.T` yields the
            answer sets.
        attitude_specs: Sequence of dicts with the keys "attitude" (name),
            "statements" (its length gives the run size) and "reverse"
            (positions inside the run to flip). Defaults to the notebook-level
            `data` loaded from attitude_questions.json.

    Returns:
        Dict mapping attitude name -> standard deviation of its scores.
    """
    if attitude_specs is None:
        attitude_specs = data

    answer_sets = {}
    for answer_set in answers.T:
        idx = 0
        for attitudes in attitude_specs:
            n_statements = len(attitudes["statements"])
            # Copy the slice: a plain slice is a view, and flipping reverse-keyed
            # items through it would silently rewrite the caller's array.
            ss = np.array(answer_set[idx:idx + n_statements])
            idx += n_statements
            for rev in attitudes["reverse"]:
                ss[rev] = 6 - ss[rev]
            answer_sets.setdefault(attitudes["attitude"], []).append(ss.mean())

    return {attitude: np.std(scores) for attitude, scores in answer_sets.items()}

In [None]:
all_answer_paraphrases = all_answer_paraphrases.iloc[:,:-1].map(answers_to_scores)

In [None]:
all_answer_paraphrases = all_answer_paraphrases[[model for model in rename_model.values()]]

In [None]:
score_std_by_model = {}
for att in all_answer_paraphrases.iloc[0,0].keys():
    for col in all_answer_paraphrases.columns:
        score_std_by_model.setdefault(col, {})[att] = all_answer_paraphrases[col].map(lambda x: x[att])

In [None]:
# One long frame: a row per (model, persona), a column per attitude holding the
# score standard deviation, plus "model" and "persona_cat" labels.
# Frames are collected first and concatenated once instead of growing `df`
# with a concat per iteration.
score_std_frames = []
for model, dic in score_std_by_model.items():
    temp_df = pd.DataFrame.from_dict(dic)
    temp_df["model"] = model
    # Rows missing from the category table are the control personas.
    temp_df["persona_cat"] = [persona_cat_dict[x] if x in persona_cat_dict else "control" for x in temp_df.index]
    score_std_frames.append(temp_df)
df = pd.concat(score_std_frames, axis=0) if score_std_frames else pd.DataFrame()

In [None]:
sns.violinplot(df,x="model", y="freedom of speech")

In [None]:
sns.violinplot(df,x="model", y="harm of hate speech")

In [None]:
sns.violinplot(df,x="model", y="racist beliefs")

In [None]:
sns.violinplot(df,x="model", y="traditionalism")

In [None]:
sns.violinplot(df,x="model", y="language purism")

In [None]:
sns.violinplot(df,x="model", y="empathy")

In [None]:
sns.violinplot(df,x="model", y="altruism")

In [None]:
df

In [None]:
avg_std = df.groupby("model").mean(numeric_only=True); avg_std

In [None]:
# personas_range = pd.DataFrame.from_dict({x: df.max(0, numeric_only=True) - df.min(0, numeric_only=True) for x, df in att_scores.items()}).T.rename(index=rename_model)

In [None]:
att_scores =  {model: pd.read_csv(results_path/model/"attitude_scores.csv", index_col=0) for model in models}
att_scores_control = {model: pd.read_csv(control_results/model/"attitude_scores.csv", index_col=0) for model in models}
att_scores = {model: pd.concat([att_scores[model], att_scores_control[model]], axis=0) for model in models}

In [None]:
personas_std = pd.DataFrame.from_dict({x: df.std(0, numeric_only=True) for x, df in att_scores.items()}).T.rename(index=rename_model).loc[avg_std.index]

In [None]:
personas_std

In [None]:
std_ratio = personas_std / avg_std

In [None]:
std_ratio["avg"] = std_ratio.mean(1)

In [None]:
std_ratio.loc["avg"] = std_ratio.mean(0)

In [None]:
std_ratio

### Inter-persona consistency

In [None]:
all_answers_df = pd.DataFrame.from_dict(all_answers_dic)

In [None]:
all_answers_df["persona_cat"] = [persona_cat_dict[persona] for persona in all_answers_df.index]

In [None]:
all_answers_df_control = pd.DataFrame.from_dict(all_answers_control_dic)

In [None]:
all_answers_df_control["persona_cat"] = "control"

In [None]:
all_answers_df = pd.concat([all_answers_df, all_answers_df_control], axis=0)

In [None]:
inter_alphas = {}
for model in all_answers_df.columns[:-1]:
    # inter_alphas[model] = krippendorff.alpha(reliability_data=np.vstack(all_answers_df[model].to_numpy()), level_of_measurement="interval")
    # inter_alphas[model] = average_correlation(np.vstack(all_answers_df[model].to_numpy()).T)
    inter_alphas[model] = np.vstack(all_answers_df[model].to_numpy()).std(0).mean()

In [None]:
pd.Series(inter_alphas).sort_values()

In [None]:
att_scores =  {model: pd.read_csv(results_path/model/"attitude_scores.csv", index_col=0) for model in models}

In [None]:
att_scores_control = {model: pd.read_csv(control_results/model/"attitude_scores.csv", index_col=0) for model in models}

In [None]:
for model in att_scores.keys():
    print(att_scores[model].iloc[:,:-1].std()  /att_scores_control[model].std())

In [None]:
# Spread of the attitude *scores* across all personas (regular + control) per
# model: standard deviation over personas for each attitude, averaged over the
# attitudes.
inter_score_alphas = {}
for model, df in att_scores.items():
    # The persona files carry one extra trailing column that the control files lack.
    personas_scores = df.iloc[:,:-1].to_numpy()
    control_scores = att_scores_control[model].to_numpy()
    scores = np.vstack([personas_scores, control_scores])
    # Alternatives tried previously:
    # inter_score_alphas[model] = krippendorff.alpha(reliability_data=scores, level_of_measurement="ordinal")
    # inter_score_alphas[model] = average_correlation(scores.T)
    # Use the stacked scores built above; the earlier version recomputed the
    # raw-answer statistic from `all_answers_df` and left `scores` unused.
    inter_score_alphas[model] = scores.std(0).mean()

In [None]:
pd.Series(inter_score_alphas).sort_values()

In [None]:
mean_and_std = {}
means = {}
stds = {}
for model, df in att_scores.items():
    personas_scores = df.iloc[:,:-1].to_numpy()
    control_scores = att_scores_control[model].to_numpy()
    scores = np.vstack([personas_scores, control_scores])
    mean = scores.mean(0).round(2)
    std = scores.std(0).round(2)
    means[model] = mean
    stds[model] = std
    mean_and_std[model] = [f"{m} ({s})"for m, s in zip(mean,std)]

In [None]:
means_df = pd.DataFrame.from_dict(means, orient="index", columns=att_scores_control["gpt-3.5-turbo-0125"].columns)

In [None]:
means_df

In [None]:
stds_df = pd.DataFrame.from_dict(stds, orient="index", columns=att_scores_control["gpt-3.5-turbo-0125"].columns)

In [None]:
stds_df

In [None]:
pd.DataFrame.from_dict(mean_and_std, orient="index", columns=att_scores_control["gpt-3.5-turbo-0125"].columns)

In [None]:
### star chart here

In [None]:
cmap = colormaps.get_cmap("Paired")

In [None]:
att_scores = {model: att_scores[model] for model in rename_model.keys()}

In [None]:
# Radar chart per model: every persona's attitude scores as blue dots, control
# personas as red crosses, and the "empty" persona as a filled orange outline.
N = 7
# Registers the 'radar' projection and returns the N spoke angles.
theta = radar_factory(N, frame='polygon')

fig, axs = plt.subplots(figsize=(8, 16), nrows=4, ncols=2,
                        subplot_kw=dict(projection='radar'))
fig.subplots_adjust(wspace=0.50, hspace=0.05)


for idx, model in enumerate(att_scores.keys()):

    # `assign` returns new frames, so neither `att_scores` nor
    # `att_scores_control` gains a permanent "control" column, and no value is
    # written to a slice.
    scores = att_scores[model].iloc[:,:-1].assign(control=False)
    scores_control = att_scores_control[model].assign(control=True)
    # Deliberately not named `data`: that name holds the attitude questionnaire
    # read by `answers_to_scores`.
    radar_scores = pd.concat([scores, scores_control], axis=0)
    attitude_cols = radar_scores.columns[:-1]  # everything except the "control" flag
    points = radar_scores[~radar_scores.control]
    control_points = radar_scores[radar_scores.control]

    ax = axs[idx//2, idx%2]

    ax.set_rgrids([1, 2, 3, 4, 5])
    ax.set_title(rename_model[model], weight='bold', size='medium', position=(0.5, 1.1),
                 horizontalalignment='center', verticalalignment='center')
    # One spoke per attitude: all personas share the same angle `t`.
    for t, col in zip(theta, attitude_cols):
        ax.plot([t]*len(points), points[col], "o", ms=4, color="b")
        ax.plot([t]*len(control_points), control_points[col], "x", ms=4, color="r")
    # Outline of the "empty" persona as a reference shape.
    empty_scores = radar_scores[attitude_cols].loc["empty"]
    ax.plot(theta, empty_scores, color="orange")
    ax.fill(theta, empty_scores, alpha=0.25, label='_nolegend_', color="orange")
    ax.set_varlabels(attitude_cols)
    ax.set_ylim(0, 5)

    ax.spines['polar'].set_visible(False)
# Seven models on a 4x2 grid: hide the unused last panel.
axs[3,1].axis("off")
labels = ("Personas", "Control personas")
fig.legend(labels,  loc='lower right', bbox_to_anchor=(0.8, 0.25))
plt.show()

In [None]:
# Plain string: the path has no placeholders, so no f-string is needed.
fig.savefig("../persona-biases-paper/media/attitudes.pdf", bbox_inches = "tight")

In [None]:
att_dfs = {}
for model in att_scores.keys():
    temp_df = att_scores[model].copy()
    for column in temp_df.columns[:-1]:
        partial_df = pd.concat([temp_df[column], att_scores_control[model][column]], axis=0)
        if column in att_dfs:
            att_dfs[column][model] = partial_df
        else:
            att_dfs[column] = pd.DataFrame(partial_df).rename(columns={column:model})

In [None]:
att_dfs["freedom of speech"].corr(numeric_only=True, method="kendall")

In [None]:
for att, df in att_dfs.items():
    att_dfs[att]["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in df.index]

In [None]:
for group, df in att_dfs["freedom of speech"].groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    if corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()>.5:
        print(group)
        df["avg"] = df.mean(1, numeric_only=True)
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
cat_means = att_dfs["freedom of speech"].groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
if corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()>.5:
        cat_means["avg"] = cat_means.mean(1, numeric_only=True)
        print(cat_means.sort_values("avg"))
        print("=========================")

In [None]:
df = att_dfs["freedom of speech"]
df_stacked =df[df.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Value'})
df_stacked["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in df_stacked["index"]]

In [None]:
att_dfs["freedom of speech"]["avg"] = att_dfs["freedom of speech"].mean(1, numeric_only=True)

In [None]:
att_dfs["freedom of speech"].sort_values("avg")

In [None]:
att_dfs["altruism"].corr(numeric_only=True, method="kendall")

In [None]:
for group, df in att_dfs["altruism"].groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    if corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()>.5:
        print(group)
        df["avg"] = df.mean(1, numeric_only=True)
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
cat_means = att_dfs["altruism"].groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
if corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()>.5:
        cat_means["avg"] = cat_means.mean(1, numeric_only=True)
        print(cat_means.sort_values("avg"))
        print("=========================")

In [None]:
df = att_dfs["altruism"]
df_stacked =df[df.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Value'})
df_stacked["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in df_stacked["index"]]

In [None]:
att_dfs["altruism"]["avg"] = att_dfs["altruism"].mean(1, numeric_only=True)

In [None]:
att_dfs["altruism"].sort_values("avg")

In [None]:
att_dfs["empathy"].corr(numeric_only=True, method="kendall")

In [None]:
for group, df in att_dfs["empathy"].groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    if corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()>.5:
        print(group)
        df["avg"] = df.mean(1, numeric_only=True)
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
cat_means = att_dfs["empathy"].groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
if corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()>.5:
        cat_means["avg"] = cat_means.mean(1, numeric_only=True)
        print(cat_means.sort_values("avg"))
        print("=========================")

In [None]:
df = att_dfs["empathy"]
df_stacked =df[df.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Value'})
df_stacked["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in df_stacked["index"]]

In [None]:
att_dfs["empathy"]["avg"] = att_dfs["empathy"].mean(1, numeric_only=True)

In [None]:
att_dfs["empathy"].sort_values("avg")

In [None]:
att_dfs["language purism"].corr(numeric_only=True, method="kendall")

In [None]:
for group, df in att_dfs["language purism"].groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    if corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()>.5:
        print(group)
        df["avg"] = df.mean(1, numeric_only=True)
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
cat_means = att_dfs["language purism"].groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
if corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()>.5:
        cat_means["avg"] = cat_means.mean(1, numeric_only=True)
        print(cat_means.sort_values("avg"))
        print("=========================")

In [None]:
df = att_dfs["language purism"]
df_stacked =df[df.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Value'})
df_stacked["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in df_stacked["index"]]

In [None]:
att_dfs["language purism"]["avg"] = att_dfs["language purism"].mean(1, numeric_only=True)

In [None]:
att_dfs["language purism"].sort_values("avg")

In [None]:
att_dfs["traditionalism"].corr(numeric_only=True, method="kendall")

In [None]:
for group, df in att_dfs["traditionalism"].groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    if corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()>.5:
        print(group)
        df["avg"] = df.mean(1, numeric_only=True)
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
cat_means = att_dfs["traditionalism"].groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
if corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()>.5:
        cat_means["avg"] = cat_means.mean(1, numeric_only=True)
        print(cat_means.sort_values("avg"))
        print("=========================")

In [None]:
df = att_dfs["traditionalism"]
df_stacked =df[df.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Value'})
df_stacked["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in df_stacked["index"]]

In [None]:
lm = mixedlm("Value ~ persona_cat",  groups='Model', data=df_stacked)
fit = lm.fit()

In [None]:
table = fit.summary().tables[1]
table = table.iloc[:-2].astype("float")
significant = table[table["P>|z|"]<.05]
significant.sort_values("Coef.")

In [None]:
lm = mixedlm("Value ~ index", groups='Model', data=df_stacked)
fit = lm.fit()

In [None]:
table = fit.summary().tables[1]
table = table.iloc[:-1].astype("float")
significant = table[table["P>|z|"]<.05]
significant.sort_values("Coef.")

In [None]:
att_dfs["traditionalism"]["avg"] = att_dfs["traditionalism"].mean(1, numeric_only=True)

In [None]:
att_dfs["traditionalism"].sort_values("avg")

In [None]:
att_dfs["racist beliefs"].corr(numeric_only=True, method="kendall")

In [None]:
for group, df in att_dfs["racist beliefs"].groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    if corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()>.5:
        print(group)
        df["avg"] = df.mean(1, numeric_only=True)
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
cat_means = att_dfs["racist beliefs"].groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
if corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()>.5:
        cat_means["avg"] = cat_means.mean(1, numeric_only=True)
        print(cat_means.sort_values("avg"))
        print("=========================")

In [None]:
df = att_dfs["racist beliefs"]
df_stacked =df[df.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Value'})
df_stacked["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in df_stacked["index"]]

In [None]:
lm = mixedlm("Value ~ persona_cat",  groups='Model', data=df_stacked)
fit = lm.fit()

In [None]:
table = fit.summary().tables[1]
table = table.iloc[:-2].astype("float")
significant = table[table["P>|z|"]<.05]
significant.sort_values("Coef.")

In [None]:
lm = mixedlm("Value ~ index", groups='Model', data=df_stacked)
fit = lm.fit()

In [None]:
df.mean(0, numeric_only=True)

In [None]:
fit.summary()

In [None]:
table = fit.summary().tables[1]
table = table.iloc[:-1].astype("float")
significant = table[table["P>|z|"]<.05]
significant.sort_values("Coef.")

In [None]:
att_dfs["racist beliefs"]["avg"] = att_dfs["racist beliefs"].mean(1, numeric_only=True)

In [None]:
att_dfs["racist beliefs"].sort_values("avg")

In [None]:
att_dfs["harm of hate speech"].corr(numeric_only=True, method="kendall")

In [None]:
for group, df in att_dfs["harm of hate speech"].groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    if corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()>.5:
        print(group)
        df["avg"] = df.mean(1, numeric_only=True)
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
cat_means = att_dfs["harm of hate speech"].groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
if corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()>.5:
        cat_means["avg"] = cat_means.mean(1, numeric_only=True)
        print(cat_means.sort_values("avg"))
        print("=========================")

In [None]:
df = att_dfs["harm of hate speech"]
df_stacked =df[df.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Value'})
df_stacked["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in df_stacked["index"]]

In [None]:
# Row-wise mean over the model columns of *this* attitude (the earlier version
# averaged the "freedom of speech" table by mistake).
att_dfs["harm of hate speech"]["avg"] = att_dfs["harm of hate speech"].mean(1, numeric_only=True)

In [None]:
att_dfs["harm of hate speech"].sort_values("avg")

### Persona categories vs Control personas

In [None]:
for att, df in att_dfs.items():
    att_dfs[att]["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in att_dfs[att].index]

In [None]:
ratios_dfs = {}

In [None]:
std_dfs = []

In [None]:
# Per-category standard deviation of every model's score; the last column
# (the per-persona "avg", assuming the cell adding it has run) is dropped.
std_df =  att_dfs["freedom of speech"].groupby("persona_cat").std().iloc[:,:-1]
# Spread of each category relative to the control personas. The baseline is
# selected by label: `iloc[0]` is the control row only if "control" happens to
# sort first among the category names.
std_df_ratios = std_df.drop(index="control") / std_df.loc["control"]
std_df_ratios["avg"] = std_df_ratios.mean(1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(0)
ratios_dfs["freedom of speech"] = std_df_ratios
std_df_ratios

In [None]:
std_dfs.append(std_df)

In [None]:
# Per-category standard deviation of every model's score; the last column
# (the per-persona "avg", assuming the cell adding it has run) is dropped.
std_df =  att_dfs["altruism"].groupby("persona_cat").std().iloc[:,:-1]
# Spread of each category relative to the control personas. The baseline is
# selected by label: `iloc[0]` is the control row only if "control" happens to
# sort first among the category names.
std_df_ratios = std_df.drop(index="control") / std_df.loc["control"]
std_df_ratios["avg"] = std_df_ratios.mean(1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(0)
ratios_dfs["altruism"] = std_df_ratios
std_df_ratios

In [None]:
std_dfs.append(std_df)

In [None]:
# Per-category standard deviation of every model's score; the last column
# (the per-persona "avg", assuming the cell adding it has run) is dropped.
std_df =  att_dfs["empathy"].groupby("persona_cat").std().iloc[:,:-1]
# Spread of each category relative to the control personas. The baseline is
# selected by label: `iloc[0]` is the control row only if "control" happens to
# sort first among the category names.
std_df_ratios = std_df.drop(index="control") / std_df.loc["control"]
std_df_ratios["avg"] = std_df_ratios.mean(1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(0)
ratios_dfs["empathy"] = std_df_ratios
std_df_ratios

In [None]:
std_dfs.append(std_df)

In [None]:
# Per-category standard deviation of every model's score; the last column
# (the per-persona "avg", assuming the cell adding it has run) is dropped.
std_df =  att_dfs["language purism"].groupby("persona_cat").std().iloc[:,:-1]
# Spread of each category relative to the control personas. The baseline is
# selected by label: `iloc[0]` is the control row only if "control" happens to
# sort first among the category names.
std_df_ratios = std_df.drop(index="control") / std_df.loc["control"]
std_df_ratios["avg"] = std_df_ratios.mean(1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(0)
ratios_dfs["language purism"] = std_df_ratios
std_df_ratios

In [None]:
std_dfs.append(std_df)

In [None]:
# Per-category standard deviation of every model's score; the last column
# (the per-persona "avg", assuming the cell adding it has run) is dropped.
std_df =  att_dfs["traditionalism"].groupby("persona_cat").std().iloc[:,:-1]
# Spread of each category relative to the control personas. The baseline is
# selected by label: `iloc[0]` is the control row only if "control" happens to
# sort first among the category names.
std_df_ratios = std_df.drop(index="control") / std_df.loc["control"]
std_df_ratios["avg"] = std_df_ratios.mean(1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(0)
ratios_dfs["traditionalism"] = std_df_ratios
std_df_ratios

In [None]:
std_dfs.append(std_df)

In [None]:
# Per-category standard deviation of every model's score; the last column
# (the per-persona "avg", assuming the cell adding it has run) is dropped.
std_df =  att_dfs["racist beliefs"].groupby("persona_cat").std().iloc[:,:-1]
# Spread of each category relative to the control personas. The baseline is
# selected by label: `iloc[0]` is the control row only if "control" happens to
# sort first among the category names.
std_df_ratios = std_df.drop(index="control") / std_df.loc["control"]
std_df_ratios["avg"] = std_df_ratios.mean(1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(0)
ratios_dfs["racist beliefs"] = std_df_ratios
std_df_ratios

In [None]:
std_dfs.append(std_df)

In [None]:
# Per-category standard deviation of every model's score; the last column
# (the per-persona "avg", assuming the cell adding it has run) is dropped.
std_df =  att_dfs["harm of hate speech"].groupby("persona_cat").std().iloc[:,:-1]
# Spread of each category relative to the control personas. The baseline is
# selected by label: `iloc[0]` is the control row only if "control" happens to
# sort first among the category names.
std_df_ratios = std_df.drop(index="control") / std_df.loc["control"]
std_df_ratios["avg"] = std_df_ratios.mean(1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(0)
ratios_dfs["harm of hate speech"] = std_df_ratios
std_df_ratios

In [None]:
std_dfs.append(std_df)

In [None]:
att_stds =  pd.concat(std_dfs, axis=0).groupby("persona_cat").mean().rename(columns=rename_model)

In [None]:
# One strip plot per attitude showing, for every model, the ratio between each
# persona category's standard deviation and the control baseline. The dashed
# line at 1 marks "same spread as the control personas".
fig, axs = plt.subplots(figsize=(16, 8), nrows=2, ncols=4,sharex=True, sharey=True)

for idx, att in enumerate(ratios_dfs.keys()):
    ax = axs[idx//4, idx%4]
    ax.set_title(att, weight='bold', size='medium', position=(0.5, 1.1),
                 horizontalalignment='center', verticalalignment='center')
    # Drop the summary "avg" row and column, then reshape to long format.
    # Deliberately not named `data`: that name holds the attitude questionnaire
    # read by `answers_to_scores`.
    ratio_long = ratios_dfs[att].iloc[:-1,:-1].rename(columns=rename_model).stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Std. Dev. Ratio'})
    ax1 = sns.stripplot(data=ratio_long, x="Model", y="Std. Dev. Ratio", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"), ax=ax, log_scale=True)
    # Per-panel legends are hidden; a single shared legend is added below.
    ax.get_legend().set_visible(False)
    ax.axhline(1, linestyle='--')
    ax.set_yscale('symlog')
    ax.set_xlabel("")
    ax.tick_params(labelrotation=45)
# Seven attitudes on a 2x4 grid: hide the unused last panel.
axs[1,3].axis("off")
handles, labels = ax1.get_legend_handles_labels()
fig.legend(handles, labels,  loc='upper left', bbox_to_anchor=(0.75, 0.45))
plt.show()

In [None]:
fig.savefig("../persona-biases-paper/media/attitude_std.pdf", bbox_inches="tight")

### Refusal rates

In [None]:
all_att_refusals = get_results_df("attitude_refusals.csv")

In [None]:
all_att_refusals = all_att_refusals.rename(columns=rename_model)

In [None]:
all_att_refusals = all_att_refusals[list(rename_model.values()) + ["persona_cat"]]

In [None]:
all_att_refusals

In [None]:
all_att_refusals.loc["a_black_person"].iloc[:-1] / all_att_refusals.loc["an_african_american_person"].iloc[:-1] 

In [None]:
all_att_refusals.loc[["a_gay_person", "a_homosexual_person"]]

In [None]:
all_att_refusals.loc["a_homosexual_person"].iloc[:-1] / all_att_refusals.loc["a_gay_person"].iloc[:-1] 

In [None]:
all_att_refusals.sort_values("Mixtral")

In [None]:
all_att_refusals_stacked = all_att_refusals.iloc[:,:-1].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Refused Fraction'})

In [None]:
all_att_refusals_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in all_att_refusals_stacked["index"]]

In [None]:
all_refusals_stacked.append(all_att_refusals_stacked)

In [None]:
# Separate regular personas from the control and empty personas so each group gets its own marker.
persona_cats = all_att_refusals_stacked["persona_cat"]
no_control = all_att_refusals_stacked[~persona_cats.isin(["control", "empty"])]
only_control = all_att_refusals_stacked[persona_cats == "control"]
only_empty = all_att_refusals_stacked[persona_cats == "empty"]


def offset(p):
    """Return a horizontal shift of `p`/72 inches expressed in the current figure's DPI space."""
    return transforms.ScaledTranslation(p/72., 0, plt.gcf().dpi_scale_trans)


ax1 = sns.stripplot(data=no_control, x="Model", y="Refused Fraction", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
trans = plt.gca().transData
# Control and empty personas are drawn shifted to either side of the main strip.
ax2 = sns.stripplot(data=only_control, x="Model", y="Refused Fraction", color="black", marker="X", transform=trans+offset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Refused Fraction", color="red", marker="*", size=10, transform=trans+offset(-15))
plt.xticks(rotation=45)

# Extend the hue legend with hand-made entries for the two extra marker types.
control_label = "control"
empty_label = "empty"
control_handle = Line2D([], [], color="black", marker="X", label=control_label, linestyle="None")
empty_handle = Line2D([], [], color="red", marker="*", label=empty_label, linestyle="None")
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
all_att_refusals.corr(numeric_only=True, method="kendall")

In [None]:
all_att_refusals["avg"] = all_att_refusals.mean(1, numeric_only=True)

In [None]:
all_att_refusals.sort_values("avg")

In [None]:
# Persona with the highest refusal value per column. Only numeric columns are
# considered because `persona_cat` holds category labels, which this reduction rejects.
all_att_refusals.idxmax(numeric_only=True)

In [None]:
# Persona with the lowest refusal value per column. Only numeric columns are
# considered because `persona_cat` holds category labels, which this reduction rejects.
all_att_refusals.idxmin(numeric_only=True)

In [None]:
# Rank table: for each column except the last two, position 1 goes to the persona
# with the highest value in that column.
pos_df = pd.DataFrame(index=all_att_refusals.index)
for col in all_att_refusals.columns[:-2]:
    # Named `ranked_index` rather than `sorted` so the builtin stays usable in later cells.
    ranked_index = all_att_refusals.sort_values(col, ascending=False).index
    temp_df = pd.DataFrame({col: range(1, len(ranked_index) + 1)}, index=ranked_index)
    # Index-aligned merge puts the ranks back into the original persona order.
    pos_df = pos_df.merge(temp_df, left_index=True, right_index=True)

In [None]:
pos_df["avg"] = pos_df.mean(1)

In [None]:
pos_df.sort_values("avg")

### Persona categories vs control personas

In [None]:
# Per-category standard deviation of the attitude refusal values (last column left out).
std_df = all_att_refusals.groupby("persona_cat").std().iloc[:, :-1]
# Scale every remaining category by the first grouped row
# (presumably the control category -- confirm the group ordering).
std_df_ratios = std_df.iloc[1:].div(std_df.iloc[0], axis="columns")
# Row-wise mean as an extra column, then column-wise mean as an extra row.
std_df_ratios["avg"] = std_df_ratios.mean(axis=1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(axis=0)
std_df_ratios

In [None]:
std_df

In [None]:
# Long-form table of the ratios, without the trailing "avg" row and column.
data = (
    std_df_ratios.iloc[:-1, :-1]
    .rename(columns=rename_model)
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'index', 'level_1': 'Model', 0: 'Std. Dev. Ratio'})
)
a = sns.stripplot(data=data, x="Model", y="Std. Dev. Ratio", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
# Dashed reference line at a ratio of 1.
a.axhline(1, linestyle='--')
a.set_yscale('symlog')
plt.xticks(rotation=45)
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

In [None]:
refusal_std_dfs.append(std_df)

## Toxicity

### Offensiveness

#### Inter-persona variability

In [None]:
all_off = get_results_df("off_scores.csv")

In [None]:
data = pd.read_csv("./data/annWithAttitudes/largeScale.csv")

In [None]:
data[~(data.isAAE | data.targetsBlackPeople | data.vulgar)]

In [None]:
len(np.unique(data.annId))

In [None]:
krippendorff.alpha(value_counts=data.groupby(["tweet", "off_avg"]).size().unstack(fill_value=0).to_numpy(), level_of_measurement="interval")

In [None]:
# Krippendorff's alpha (interval level) across all personas, one value per column except the last.
inter_alphas = {
    model: krippendorff.alpha(
        reliability_data=np.stack(all_off[model].to_numpy()),
        level_of_measurement="interval",
    )
    for model in all_off.columns[:-1]
}

In [None]:
pd.Series(inter_alphas).sort_values()

In [None]:
# Same agreement measure as for all personas, restricted to rows whose category is not "control".
all_off_no_control = all_off[all_off["persona_cat"] != "control"]
inter_alphas_no_control = {
    model: krippendorff.alpha(
        reliability_data=np.stack(all_off_no_control[model].to_numpy()),
        level_of_measurement="interval",
    )
    for model in all_off_no_control.columns[:-1]
}

In [None]:
pd.Series(inter_alphas_no_control).sort_values()

In [None]:
all_off_means = all_off.iloc[:,:-1].map(np.mean)

In [None]:
all_off_means = all_off_means[[x for x in rename_model.values()]].copy()

In [None]:
all_off_means["persona_cat"] = all_off["persona_cat"]

In [None]:
all_off_means_stacked = all_off_means[all_off_means.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Avg. Off.'})

In [None]:
all_off_means_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in all_off_means_stacked["index"]]

In [None]:
# Separate regular personas from the control and empty personas so each group gets its own marker.
persona_cats = all_off_means_stacked["persona_cat"]
no_control = all_off_means_stacked[~persona_cats.isin(["control", "empty"])]
only_control = all_off_means_stacked[persona_cats == "control"]
only_empty = all_off_means_stacked[persona_cats == "empty"]


def offset(p):
    """Return a horizontal shift of `p`/72 inches expressed in the current figure's DPI space."""
    return transforms.ScaledTranslation(p/72., 0, plt.gcf().dpi_scale_trans)


ax1 = sns.stripplot(data=no_control, x="Model", y="Avg. Off.", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
trans = plt.gca().transData
# Control and empty personas are drawn shifted to either side of the main strip.
ax2 = sns.stripplot(data=only_control, x="Model", y="Avg. Off.", color="black", marker="X", transform=trans+offset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Avg. Off.", color="red", marker="*", size=10, transform=trans+offset(-15))
plt.xticks(rotation=45)

# Extend the hue legend with hand-made entries for the two extra marker types.
control_label = "control"
empty_label = "empty"
control_handle = Line2D([], [], color="black", marker="X", label=control_label, linestyle="None")
empty_handle = Line2D([], [], color="red", marker="*", label=empty_label, linestyle="None")
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
all_off_means.boxplot(by="persona_cat", rot=90, figsize=(15,15))

In [None]:
# For each column except the last: stack the per-persona arrays, take the standard
# deviation along the first axis, and average the result.
inter_std = {
    col: np.stack(all_off[col].to_numpy()).std(axis=0).mean()
    for col in all_off.columns[:-1]
}
    

In [None]:
pd.Series(inter_std).sort_values()

In [None]:
all_off_means.corr(numeric_only=True, method="kendall")

In [None]:
all_off_means["avg"] = all_off_means.mean(1, numeric_only=True)

In [None]:
all_off_means.sort_values("avg")

In [None]:
# Persona with the highest mean offensiveness per column. Only numeric columns are
# considered because `persona_cat` holds category labels, which this reduction rejects.
all_off_means.idxmax(numeric_only=True)

In [None]:
# Persona with the lowest mean offensiveness per column. Only numeric columns are
# considered because `persona_cat` holds category labels, which this reduction rejects.
all_off_means.idxmin(numeric_only=True)

In [None]:
# Rank table: for each column except the last two, position 1 goes to the persona
# with the highest value in that column.
pos_df = pd.DataFrame(index=all_off_means.index)
for col in all_off_means.columns[:-2]:
    # Named `ranked_index` rather than `sorted` so the builtin stays usable in later cells.
    ranked_index = all_off_means.sort_values(col, ascending=False).index
    temp_df = pd.DataFrame({col: range(1, len(ranked_index) + 1)}, index=ranked_index)
    # Index-aligned merge puts the ranks back into the original persona order.
    pos_df = pos_df.merge(temp_df, left_index=True, right_index=True)

In [None]:
pos_df["avg"] = pos_df.mean(1)

In [None]:
pos_df.sort_values("avg")

### Persona categories vs control personas

In [None]:
# Per-category standard deviation of the mean offensiveness ratings (last column left out).
std_df = all_off_means.groupby("persona_cat").std().iloc[:, :-1]
# Scale every remaining category by the first grouped row
# (presumably the control category -- confirm the group ordering).
std_df_ratios = std_df.iloc[1:].div(std_df.iloc[0], axis="columns")
# Row-wise mean as an extra column, then column-wise mean as an extra row.
std_df_ratios["avg"] = std_df_ratios.mean(axis=1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(axis=0)
std_df_ratios

In [None]:
# Long-form table of the ratios, without the trailing "avg" row and column.
df = (
    std_df_ratios.iloc[:-1, :-1]
    .rename(columns=rename_model)
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'index', 'level_1': 'Model', 0: 'Std. Dev. Ratio'})
)
a = sns.stripplot(data=df, x="Model", y="Std. Dev. Ratio", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
# Dashed reference line at a ratio of 1.
a.axhline(1, linestyle='--')
a.set_yscale('symlog')
plt.xticks(rotation=45)
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

In [None]:
off_means_std = df

#### Agreement with human judgements

In [None]:
# One row per distinct tweet (first occurrence kept), with `off_avg` and `racist`
# replaced by the mean over all annotation rows of that tweet.
# A single groupby replaces the per-row boolean filtering of `data`, which re-scanned
# the whole frame once for every tweet.
human_agg = data.drop_duplicates("tweet").copy()
tweet_means = data.groupby("tweet")[["off_avg", "racist"]].mean()
human_agg["off_avg"] = human_agg["tweet"].map(tweet_means["off_avg"])
human_agg["racist"] = human_agg["tweet"].map(tweet_means["racist"])

In [None]:
def _alpha_vs_human_off(persona_ratings):
    """Krippendorff's alpha (interval) between one persona's ratings and `human_agg["off_avg"]`."""
    return krippendorff.alpha(reliability_data=[persona_ratings, human_agg["off_avg"]], level_of_measurement="interval")


# One Series of per-persona agreement values for every column except the last.
off_aggs = {model: all_off[model].apply(_alpha_vs_human_off) for model in all_off.columns[:-1]}

In [None]:
aggs_df = pd.DataFrame.from_dict(off_aggs)

In [None]:
aggs_df["persona_cat"] = all_off["persona_cat"]

In [None]:
aggs_df = aggs_df[list(rename_model.values())+["persona_cat"]]

In [None]:
off_aggs = aggs_df

In [None]:
aggs_df.sort_values("GPT-4")

In [None]:
off_aggs_stacked = aggs_df.iloc[:,:-1].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Agreement with humans'})

In [None]:
off_aggs_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in off_aggs_stacked["index"]]

In [None]:
# Separate regular personas from the control and empty personas so each group gets its own marker.
persona_cats = off_aggs_stacked["persona_cat"]
no_control = off_aggs_stacked[~persona_cats.isin(["control", "empty"])]
only_control = off_aggs_stacked[persona_cats == "control"]
only_empty = off_aggs_stacked[persona_cats == "empty"]


def offset(p):
    """Return a horizontal shift of `p`/72 inches expressed in the current figure's DPI space."""
    return transforms.ScaledTranslation(p/72., 0, plt.gcf().dpi_scale_trans)


ax1 = sns.stripplot(data=no_control, x="Model", y="Agreement with humans", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
trans = plt.gca().transData
# Control and empty personas are drawn shifted to either side of the main strip.
ax2 = sns.stripplot(data=only_control, x="Model", y="Agreement with humans", color="black", marker="X", transform=trans+offset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Agreement with humans", color="red", marker="*", size=10, transform=trans+offset(-15))
plt.xticks(rotation=45)

# Extend the hue legend with hand-made entries for the two extra marker types.
control_label = "control"
empty_label = "empty"
control_handle = Line2D([], [], color="black", marker="X", label=control_label, linestyle="None")
empty_handle = Line2D([], [], color="red", marker="*", label=empty_label, linestyle="None")
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
aggs_df.corr(numeric_only=True, method="kendall")

In [None]:
aggs_df["avg"] = aggs_df.mean(1, numeric_only=True)

In [None]:
aggs_df.sort_values("avg")

In [None]:
# Persona with the highest agreement per column. Only numeric columns are
# considered because `persona_cat` holds category labels, which this reduction rejects.
aggs_df.idxmax(numeric_only=True)

In [None]:
# Persona with the lowest agreement per column. Only numeric columns are
# considered because `persona_cat` holds category labels, which this reduction rejects.
aggs_df.idxmin(numeric_only=True)

In [None]:
# Rank table: for each column except the last two, position 1 goes to the persona
# with the highest value in that column.
pos_df = pd.DataFrame(index=aggs_df.index)
for col in aggs_df.columns[:-2]:
    # Named `ranked_index` rather than `sorted` so the builtin stays usable in later cells.
    ranked_index = aggs_df.sort_values(col, ascending=False).index
    temp_df = pd.DataFrame({col: range(1, len(ranked_index) + 1)}, index=ranked_index)
    # Index-aligned merge puts the ranks back into the original persona order.
    pos_df = pos_df.merge(temp_df, left_index=True, right_index=True)

In [None]:
pos_df["avg"] = pos_df.mean(1)

In [None]:
pos_df.sort_values("avg")

### Persona categories vs control personas

In [None]:
# Per-category standard deviation of the agreement values (last column left out).
std_df = aggs_df.groupby("persona_cat").std().iloc[:, :-1]
# Scale every remaining category by the first grouped row
# (presumably the control category -- confirm the group ordering).
std_df_ratios = std_df.iloc[1:].div(std_df.iloc[0], axis="columns")
# Row-wise mean as an extra column, then column-wise mean as an extra row.
std_df_ratios["avg"] = std_df_ratios.mean(axis=1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(axis=0)
std_df_ratios

In [None]:
# Long-form table of the ratios, without the trailing "avg" row and column.
df = (
    std_df_ratios.iloc[:-1, :-1]
    .rename(columns=rename_model)
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'index', 'level_1': 'Model', 0: 'Std. Dev. Ratio'})
)
a = sns.stripplot(data=df, x="Model", y="Std. Dev. Ratio", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"), log_scale=True)
# Dashed reference line at a ratio of 1.
a.axhline(1, linestyle='--')
a.set_yscale('symlog')
plt.xticks(rotation=45)
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

In [None]:
off_aggs_stds = std_df

In [None]:
off_aggs_std = df

#### Refusal rates

In [None]:
all_off_refusals = get_results_df("off_refusals.csv")

In [None]:
all_off_refusals = all_off_refusals.rename(columns=rename_model)

In [None]:
all_off_refusals = all_off_refusals[list(rename_model.values()) + ["persona_cat"]]

In [None]:
all_off_refusals_mean = all_off_refusals.iloc[:,:-1].map(np.mean)

In [None]:
all_off_refusals_mean.sort_values("GPT-4")

In [None]:
# Long-form refusal table with one row per (persona, model).
# Columns are selected by model name: `all_off_refusals_mean` was built without
# `persona_cat`, so slicing off the last column would drop a model instead.
all_off_refusals_stacked = all_off_refusals_mean[list(rename_model.values())].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Refused Fraction'})

In [None]:
all_off_refusals_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in all_off_refusals_stacked["index"]]

In [None]:
all_refusals_stacked.append(all_off_refusals_stacked)

In [None]:
# Separate regular personas from the control and empty personas so each group gets its own marker.
persona_cats = all_off_refusals_stacked["persona_cat"]
no_control = all_off_refusals_stacked[~persona_cats.isin(["control", "empty"])]
only_control = all_off_refusals_stacked[persona_cats == "control"]
only_empty = all_off_refusals_stacked[persona_cats == "empty"]


def offset(p):
    """Return a horizontal shift of `p`/72 inches expressed in the current figure's DPI space."""
    return transforms.ScaledTranslation(p/72., 0, plt.gcf().dpi_scale_trans)


ax1 = sns.stripplot(data=no_control, x="Model", y="Refused Fraction", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
trans = plt.gca().transData
# Control and empty personas are drawn shifted to either side of the main strip.
ax2 = sns.stripplot(data=only_control, x="Model", y="Refused Fraction", color="black", marker="X", transform=trans+offset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Refused Fraction", color="red", marker="*", size=10, transform=trans+offset(-15))
plt.xticks(rotation=45)

# Extend the hue legend with hand-made entries for the two extra marker types.
control_label = "control"
empty_label = "empty"
control_handle = Line2D([], [], color="black", marker="X", label=control_label, linestyle="None")
empty_handle = Line2D([], [], color="red", marker="*", label=empty_label, linestyle="None")
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
all_off_refusals_mean.corr(numeric_only=True, method="kendall")

In [None]:
all_off_refusals_mean["avg"] = all_off_refusals_mean.mean(1, numeric_only=True)

In [None]:
all_off_refusals_mean.sort_values("avg")

In [None]:
all_off_refusals_mean.idxmax()

In [None]:
all_off_refusals_mean.idxmin()

In [None]:
pos_df = pd.DataFrame(index=all_off_refusals_mean.index)
for col in all_off_refusals_mean.columns[:-2]:
    temp_df = pd.DataFrame()
    sorted = all_off_refusals_mean.sort_values(col, ascending=False).index
    temp_df = temp_df.reindex(index=sorted)
    temp_df[col] = list(range(1, len(all_off_refusals_mean)+1))
    pos_df = pos_df.merge(temp_df, left_index=True, right_index=True)

In [None]:
pos_df["avg"] = pos_df.mean(1)

In [None]:
pos_df

### Persona categories vs control personas

In [None]:
all_off_refusals_mean["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in all_off_refusals_mean.index]

In [None]:
# NOTE(review): `std_df` is only recomputed from the offensiveness refusals in the
# following cell, so on a top-to-bottom run this shows the frame left over from the
# previous section.
std_df

In [None]:
# Per-category standard deviation of the mean offensiveness refusals (last column left out).
std_df = all_off_refusals_mean.groupby("persona_cat").std().iloc[:, :-1]
# Scale every remaining category by the first grouped row
# (presumably the control category -- confirm the group ordering).
std_df_ratios = std_df.iloc[1:].div(std_df.iloc[0], axis="columns")
# Row-wise mean as an extra column, then column-wise mean as an extra row.
std_df_ratios["avg"] = std_df_ratios.mean(axis=1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(axis=0)
std_df_ratios

In [None]:
# Long-form table of the ratios, without the trailing "avg" row and column.
df = (
    std_df_ratios.iloc[:-1, :-1]
    .rename(columns=rename_model)
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'index', 'level_1': 'Model', 0: 'Std. Dev. Ratio'})
)
a = sns.stripplot(data=df, x="Model", y="Std. Dev. Ratio", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
# Dashed reference line at a ratio of 1.
a.axhline(1, linestyle='--')
a.set_yscale('symlog')
plt.xticks(rotation=45)
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

In [None]:
refusal_std_dfs.append(std_df)

### Racism

#### Inter-persona variability

In [None]:
all_rac = get_results_df("rac_scores.csv")

In [None]:
data = pd.read_csv("./data/annWithAttitudes/largeScale.csv")

In [None]:
krippendorff.alpha(value_counts=data.groupby(["tweet", "racist"]).size().unstack(fill_value=0).to_numpy(), level_of_measurement="interval")

In [None]:
# Krippendorff's alpha (interval level) across all personas, one value per column except the last.
inter_alphas = {
    model: krippendorff.alpha(
        reliability_data=np.stack(all_rac[model].to_numpy()),
        level_of_measurement="interval",
    )
    for model in all_rac.columns[:-1]
}

In [None]:
pd.Series(inter_alphas).sort_values()

In [None]:
# Same agreement measure as for all personas, restricted to rows whose category is not "control".
all_rac_no_control = all_rac[all_rac["persona_cat"] != "control"]
inter_alphas_no_control = {
    model: krippendorff.alpha(
        reliability_data=np.stack(all_rac_no_control[model].to_numpy()),
        level_of_measurement="interval",
    )
    for model in all_rac_no_control.columns[:-1]
}

In [None]:
pd.Series(inter_alphas_no_control).sort_values()

In [None]:
all_rac_means = all_rac.iloc[:,:-1].map(np.mean)

In [None]:
all_rac_means = all_rac_means[[x for x in rename_model.values()]].copy()

In [None]:
all_rac_means["persona_cat"] = all_rac["persona_cat"]

In [None]:
all_rac_means_stacked = all_rac_means[all_rac_means.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Avg. rac.'})

In [None]:
all_rac_means_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in all_rac_means_stacked["index"]]

In [None]:
# Strip plot of mean racism ratings per model: regular personas coloured by category,
# control and empty personas overlaid with their own markers.
no_control = all_rac_means_stacked[~all_rac_means_stacked["persona_cat"].isin(["control", "empty"])]
only_control =  all_rac_means_stacked[all_rac_means_stacked["persona_cat"] == "control"]
only_empty = all_rac_means_stacked[all_rac_means_stacked["persona_cat"] == "empty"]
ax1 = sns.stripplot(data=no_control, x="Model", y="Avg. rac.", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
# Horizontal shift of p/72 inches in the current figure's DPI space.
# The helper is named `offset` for both overlays; it used to be defined as `racset`
# while the empty-persona overlay called `offset`, which only existed if an earlier
# plotting cell had already run.
offset = lambda p: transforms.ScaledTranslation(p/72.,0, plt.gcf().dpi_scale_trans)
trans = plt.gca().transData
ax2 = sns.stripplot(data=only_control, x="Model", y="Avg. rac.", color="black", marker="X", transform=trans+offset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Avg. rac.", color="red", marker="*", size=10, transform=trans+offset(-15))
plt.xticks(rotation=45)
# Hand-made legend entries for the two overlay marker types.
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
control_label = "control"
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
empty_label = "empty"
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
all_rac_means.boxplot(by="persona_cat", rot=90, figsize=(15,15))

In [None]:
# For each column except the last: stack the per-persona arrays, take the standard
# deviation along the first axis, and average the result.
inter_std = {
    col: np.stack(all_rac[col].to_numpy()).std(axis=0).mean()
    for col in all_rac.columns[:-1]
}
    

In [None]:
pd.Series(inter_std).sort_values()

In [None]:
all_rac_means.corr(numeric_only=True, method="kendall")

In [None]:
all_rac_means["avg"] = all_rac_means.mean(1, numeric_only=True)

In [None]:
all_rac_means.sort_values("avg")

In [None]:
# Persona with the highest mean racism rating per column. Only numeric columns are
# considered because `persona_cat` holds category labels, which this reduction rejects.
all_rac_means.idxmax(numeric_only=True)

In [None]:
# Persona with the lowest mean racism rating per column. Only numeric columns are
# considered because `persona_cat` holds category labels, which this reduction rejects.
all_rac_means.idxmin(numeric_only=True)

In [None]:
# Rank table: for each column except the last two, position 1 goes to the persona
# with the highest value in that column.
pos_df = pd.DataFrame(index=all_rac_means.index)
for col in all_rac_means.columns[:-2]:
    # Named `ranked_index` rather than `sorted` so the builtin stays usable in later cells.
    ranked_index = all_rac_means.sort_values(col, ascending=False).index
    temp_df = pd.DataFrame({col: range(1, len(ranked_index) + 1)}, index=ranked_index)
    # Index-aligned merge puts the ranks back into the original persona order.
    pos_df = pos_df.merge(temp_df, left_index=True, right_index=True)

In [None]:
pos_df["avg"] = pos_df.mean(1)

In [None]:
pos_df.sort_values("avg")

#### Persona cat vs control

In [None]:
# Per-category standard deviation of the mean racism ratings (last column left out).
std_df = all_rac_means.groupby("persona_cat").std().iloc[:, :-1]
# Scale every remaining category by the first grouped row
# (presumably the control category -- confirm the group ordering).
std_df_ratios = std_df.iloc[1:].div(std_df.iloc[0], axis="columns")
# Row-wise mean as an extra column, then column-wise mean as an extra row.
std_df_ratios["avg"] = std_df_ratios.mean(axis=1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(axis=0)
std_df_ratios

In [None]:
# Long-form table of the ratios, without the trailing "avg" row and column.
df = (
    std_df_ratios.iloc[:-1, :-1]
    .rename(columns=rename_model)
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'index', 'level_1': 'Model', 0: 'Std. Dev. Ratio'})
)
a = sns.stripplot(data=df, x="Model", y="Std. Dev. Ratio", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
# Dashed reference line at a ratio of 1.
a.axhline(1, linestyle='--')
a.set_yscale('symlog')
plt.xticks(rotation=45)
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

In [None]:
rac_means_std = df

#### Agreement with human judgements

In [None]:
def _alpha_vs_human_rac(persona_ratings):
    """Krippendorff's alpha (interval) between one persona's ratings and `human_agg["racist"]`."""
    return krippendorff.alpha(reliability_data=[persona_ratings, human_agg["racist"]], level_of_measurement="interval")


# One Series of per-persona agreement values for every column except the last.
rac_aggs = {model: all_rac[model].apply(_alpha_vs_human_rac) for model in all_rac.columns[:-1]}

In [None]:
aggs_df = pd.DataFrame.from_dict(rac_aggs)

In [None]:
aggs_df["persona_cat"] = all_rac["persona_cat"]

In [None]:
aggs_df = aggs_df[list(rename_model.values())+["persona_cat"]]

In [None]:
rac_aggs = aggs_df

In [None]:
aggs_df.sort_values("Gemma-2b-inst")

In [None]:
rac_aggs_stacked = aggs_df.iloc[:,:-1].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Agreement with humans'})

In [None]:
rac_aggs_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in rac_aggs_stacked["index"]]

In [None]:
# Separate regular personas from the control and empty personas so each group gets its own marker.
persona_cats = rac_aggs_stacked["persona_cat"]
no_control = rac_aggs_stacked[~persona_cats.isin(["control", "empty"])]
only_control = rac_aggs_stacked[persona_cats == "control"]
only_empty = rac_aggs_stacked[persona_cats == "empty"]


def offset(p):
    """Return a horizontal shift of `p`/72 inches expressed in the current figure's DPI space."""
    return transforms.ScaledTranslation(p/72., 0, plt.gcf().dpi_scale_trans)


ax1 = sns.stripplot(data=no_control, x="Model", y="Agreement with humans", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
trans = plt.gca().transData
# Control and empty personas are drawn shifted to either side of the main strip.
ax2 = sns.stripplot(data=only_control, x="Model", y="Agreement with humans", color="black", marker="X", transform=trans+offset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Agreement with humans", color="red", marker="*", size=10, transform=trans+offset(-15))
plt.xticks(rotation=45)

# Extend the hue legend with hand-made entries for the two extra marker types.
control_label = "control"
empty_label = "empty"
control_handle = Line2D([], [], color="black", marker="X", label=control_label, linestyle="None")
empty_handle = Line2D([], [], color="red", marker="*", label=empty_label, linestyle="None")
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
aggs_df.corr(numeric_only=True, method="kendall")

In [None]:
aggs_df["avg"] = aggs_df.mean(1, numeric_only=True)

In [None]:
aggs_df.sort_values("avg")

In [None]:
# Persona with the highest agreement per column. Only numeric columns are
# considered because `persona_cat` holds category labels, which this reduction rejects.
aggs_df.idxmax(numeric_only=True)

In [None]:
# Persona with the lowest agreement per column. Only numeric columns are
# considered because `persona_cat` holds category labels, which this reduction rejects.
aggs_df.idxmin(numeric_only=True)

In [None]:
# Rank table: for each column except the last two, position 1 goes to the persona
# with the highest value in that column.
pos_df = pd.DataFrame(index=aggs_df.index)
for col in aggs_df.columns[:-2]:
    # Named `ranked_index` rather than `sorted` so the builtin stays usable in later cells.
    ranked_index = aggs_df.sort_values(col, ascending=False).index
    temp_df = pd.DataFrame({col: range(1, len(ranked_index) + 1)}, index=ranked_index)
    # Index-aligned merge puts the ranks back into the original persona order.
    pos_df = pos_df.merge(temp_df, left_index=True, right_index=True)

In [None]:
pos_df["avg"] = pos_df.mean(1)

In [None]:
pos_df.sort_values("avg")

#### Persona cat vs control

In [None]:
# Per-category standard deviation of the agreement values (last column left out).
std_df = aggs_df.groupby("persona_cat").std().iloc[:, :-1]
# Scale every remaining category by the first grouped row
# (presumably the control category -- confirm the group ordering).
std_df_ratios = std_df.iloc[1:].div(std_df.iloc[0], axis="columns")
# Row-wise mean as an extra column, then column-wise mean as an extra row.
std_df_ratios["avg"] = std_df_ratios.mean(axis=1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(axis=0)
std_df_ratios

In [None]:
# Long-form table of the ratios, without the trailing "avg" row and column.
df = (
    std_df_ratios.iloc[:-1, :-1]
    .rename(columns=rename_model)
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'index', 'level_1': 'Model', 0: 'Std. Dev. Ratio'})
)
a = sns.stripplot(data=df, x="Model", y="Std. Dev. Ratio", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
# Dashed reference line at a ratio of 1.
a.axhline(1, linestyle='--')
a.set_yscale('symlog')
plt.xticks(rotation=45)
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

In [None]:
rac_aggs_std = df

In [None]:
rac_aggs_stds = std_df

#### Refusal rates

In [None]:
all_rac_refusals = get_results_df("rac_refusals.csv")

In [None]:
all_rac_refusals = all_rac_refusals.rename(columns=rename_model)

In [None]:
all_rac_refusals = all_rac_refusals[list(rename_model.values()) + ["persona_cat"]]

In [None]:
all_rac_refusals_mean = all_rac_refusals.iloc[:,:-1].map(np.mean)

In [None]:
all_rac_refusals_mean.sort_values("GPT-4")

In [None]:
# Long-form refusal table with one row per (persona, model).
# Columns are selected by model name: `all_rac_refusals_mean` was built without
# `persona_cat`, so slicing off the last column would drop a model instead.
all_rac_refusals_stacked = all_rac_refusals_mean[list(rename_model.values())].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Refused Fraction'})

In [None]:
all_rac_refusals_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in all_rac_refusals_stacked["index"]]

In [None]:
all_refusals_stacked.append(all_rac_refusals_stacked)

In [None]:
# Separate regular personas from the control and empty personas so each group gets its own marker.
persona_cats = all_rac_refusals_stacked["persona_cat"]
no_control = all_rac_refusals_stacked[~persona_cats.isin(["control", "empty"])]
only_control = all_rac_refusals_stacked[persona_cats == "control"]
only_empty = all_rac_refusals_stacked[persona_cats == "empty"]


def offset(p):
    """Return a horizontal shift of `p`/72 inches expressed in the current figure's DPI space."""
    return transforms.ScaledTranslation(p/72., 0, plt.gcf().dpi_scale_trans)


ax1 = sns.stripplot(data=no_control, x="Model", y="Refused Fraction", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
trans = plt.gca().transData
# Control and empty personas are drawn shifted to either side of the main strip.
ax2 = sns.stripplot(data=only_control, x="Model", y="Refused Fraction", color="black", marker="X", transform=trans+offset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Refused Fraction", color="red", marker="*", size=10, transform=trans+offset(-15))
plt.xticks(rotation=45)

# Extend the hue legend with hand-made entries for the two extra marker types.
control_label = "control"
empty_label = "empty"
control_handle = Line2D([], [], color="black", marker="X", label=control_label, linestyle="None")
empty_handle = Line2D([], [], color="red", marker="*", label=empty_label, linestyle="None")
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
all_rac_refusals_mean.corr(numeric_only=True, method="kendall")

In [None]:
all_rac_refusals_mean["avg"] = all_rac_refusals_mean.mean(1, numeric_only=True)

In [None]:
all_rac_refusals_mean.sort_values("avg")

In [None]:
all_rac_refusals_mean.idxmax()

In [None]:
all_rac_refusals_mean.idxmin()

In [None]:
pos_df = pd.DataFrame(index=all_rac_refusals_mean.index)
for col in all_rac_refusals_mean.columns[:-2]:
    temp_df = pd.DataFrame()
    sorted = all_rac_refusals_mean.sort_values(col, ascending=False).index
    temp_df = temp_df.reindex(index=sorted)
    temp_df[col] = list(range(1, len(all_rac_refusals_mean)+1))
    pos_df = pos_df.merge(temp_df, left_index=True, right_index=True)

In [None]:
pos_df["avg"] = pos_df.mean(1)

In [None]:
pos_df.sort_values("avg")

#### Persona cat vs control

In [None]:
all_rac_refusals_mean["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in all_rac_refusals_mean.index]

In [None]:
# Per-category standard deviation of the mean racism refusals (last column left out).
std_df = all_rac_refusals_mean.groupby("persona_cat").std().iloc[:, :-1]
# Scale every remaining category by the first grouped row
# (presumably the control category -- confirm the group ordering).
std_df_ratios = std_df.iloc[1:].div(std_df.iloc[0], axis="columns")
# Row-wise mean as an extra column, then column-wise mean as an extra row.
std_df_ratios["avg"] = std_df_ratios.mean(axis=1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(axis=0)
std_df_ratios

In [None]:
std_df

In [None]:
# Long-form table of the ratios, without the trailing "avg" row and column.
df = (
    std_df_ratios.iloc[:-1, :-1]
    .rename(columns=rename_model)
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'index', 'level_1': 'Model', 0: 'Std. Dev. Ratio'})
)
a = sns.stripplot(data=df, x="Model", y="Std. Dev. Ratio", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
# Dashed reference line at a ratio of 1.
a.axhline(1, linestyle='--')
a.set_yscale('symlog')
plt.xticks(rotation=45)
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

In [None]:
refusal_std_dfs.append(std_df)

### Offensive and racism figures

In [None]:
dfs = [all_off_means_stacked, all_rac_means_stacked, off_aggs_stacked, rac_aggs_stacked]

In [None]:
y_titles=[df.columns[2] for df in dfs]
x_titles = ["Avg. offensiveness", "Avg. racism", "Agreement with human (off.)", "Agreement with human (rac.)"]

In [None]:
# Axis titles for the 2x2 toxicity figure: x titles per column, y titles per row.
plot_x_titles = ["Offensiveness", "Racism"]
# Spelling of "Agreement" corrected; this string is rendered as an axis label in the saved figure.
plot_y_titles = ["Avg. rating", "Agreement with humans"]

In [None]:
# 2x2 grid of strip plots, one panel per frame in `dfs`; y axes are shared within a row.
fig, axes = plt.subplots(2, 2, figsize=(8, 8), sharex=True, sharey="row")
plt.subplots_adjust(wspace=.05, hspace=.05)


def offset(p):
    """Return a horizontal shift of `p`/72 inches expressed in the current figure's DPI space."""
    return transforms.ScaledTranslation(p/72., 0, plt.gcf().dpi_scale_trans)


for idx, df in enumerate(dfs):
    row_idx, col_idx = divmod(idx, 2)
    ax = axes[row_idx, col_idx]
    # Regular personas are coloured by category; control and empty personas get their own markers.
    persona_cats = df["persona_cat"]
    no_control = df[~persona_cats.isin(["control", "empty"])]
    only_control = df[persona_cats == "control"]
    only_empty = df[persona_cats == "empty"]
    ax1 = sns.stripplot(data=no_control, x="Model", y=y_titles[idx], hue="persona_cat", marker="o", palette=sns.color_palette("Paired"), ax=ax)
    trans = ax.transData
    ax2 = sns.stripplot(data=only_control, x="Model", y=y_titles[idx], color="black", marker="X", transform=trans+offset(10), ax=ax)
    ax3 = sns.stripplot(data=only_empty, x="Model", y=y_titles[idx], color="red", marker="*", size=10, transform=trans+offset(-10), ax=ax)
    control_label = "control"
    empty_label = "empty"
    control_handle = Line2D([], [], color="black", marker="X", label=control_label, linestyle="None")
    empty_handle = Line2D([], [], color="red", marker="*", label=empty_label, linestyle="None")
    # Per-panel legends are hidden in favour of one figure-level legend.
    ax.get_legend().set_visible(False)
    # Only the bottom row gets an x title and only the left column a y title.
    if row_idx == 1:
        ax.set_xlabel(plot_x_titles[col_idx])
    if col_idx == 0:
        ax.set_ylabel(plot_y_titles[row_idx])
    else:
        ax.set_ylabel("")
    ax.tick_params(axis='x', labelrotation=45)

# Shared legend built from the last panel's hue entries plus the two overlay markers.
handles, labels = ax1.get_legend_handles_labels()
handles = handles + [control_handle, empty_handle]
labels = labels + [control_label, empty_label]
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, +1.01),
          fancybox=True, shadow=True, ncol=4)

In [None]:
fig.savefig("../persona-biases-paper/media/toxicity.pdf",bbox_inches="tight")

In [None]:
dfs = [off_means_std, rac_means_std, off_aggs_std, rac_aggs_std]

In [None]:
y_titles=[df.columns[2] for df in dfs]

In [None]:
x_titles = ["Avg. offensiveness", "Avg. racism", "Agreement with human (off.)", "Agreement with human (rac.)"]

In [None]:
fig, axes = plt.subplots(1, 4, figsize=(16, 4), sharex=True, sharey=True)
plt.subplots_adjust(wspace=.15, hspace=.1)
# Arguments shared by every panel.
strip_kwargs = dict(x="Model", y="Std. Dev. Ratio", hue="persona_cat", marker="o", log_scale=True)
for idx, df in enumerate(dfs):
    ax = axes[idx]
    ax1 = sns.stripplot(data=df, palette=sns.color_palette("Paired"), ax=ax, **strip_kwargs)
    ax.get_legend().set_visible(False)  # replaced by the figure-level legend below
    ax.set_xlabel(x_titles[idx])
    ax.axhline(1, linestyle='--')
    ax.set_yscale('symlog')
    ax.tick_params(axis='x', labelrotation=45)

handles, labels = ax1.get_legend_handles_labels()
fig.legend(
    handles,
    labels,
    loc='upper center',
    bbox_to_anchor=(0.5, +1.05),
    fancybox=True,
    shadow=True,
    ncol=7,
)

In [None]:
fig.savefig("../persona-biases-paper/media/toxicity_stds.pdf",bbox_inches="tight")

### Annotators with attitudes replication

#### Correlation between attitudes (humans vs personas)

In [None]:
annotators = data.drop_duplicates("annId")

In [None]:
annotators

In [None]:
rename_attitudes = {
    "freeSpeech": "freedom of speech",
    "harmHateSpeech": "harm of hate speech",
    "racism": "racist beliefs",
    "traditionalism": "traditionalism",
    "lingPurism": "language purism",
    "empathy": "empathy",
    "altruism": "altruism"
}

In [None]:
annotators_attitudes = annotators[list(rename_attitudes.keys())].rename(columns=rename_attitudes)

In [None]:
human_corr = get_correlations(annotators_attitudes); human_corr

In [None]:
# Per model: rows of results_path's attitude_scores.csv followed by the rows of the
# control_results counterpart.
att_scores_control = {}
att_scores = {}
for model in models:
    persona_scores = pd.read_csv(results_path/model/"attitude_scores.csv", index_col=0)
    att_scores_control[model] = pd.read_csv(control_results/model/"attitude_scores.csv", index_col=0)
    att_scores[model] = pd.concat([persona_scores, att_scores_control[model]], axis=0)

In [None]:
corrs = {}
for model, scores in att_scores.items():
    # The last column is excluded from the correlation (presumably persona_cat — confirm).
    corrs[model] = get_correlations(scores.iloc[:, :-1])

In [None]:
corrs["gpt-4-0125-preview"]

In [None]:
corrs["gpt-3.5-turbo-0125"]

In [None]:
corrs["Mixtral-8x7B-Instruct-v0.1"]

In [None]:
corrs["zephyr-7b-beta"]

In [None]:
corrs["gemma-7b-it"]

In [None]:
corrs["gemma-2b-it"]

In [None]:
corrs.keys()

In [None]:
dfs = [human_corr] + [corrs[model] for model in rename_model.keys()]

In [None]:
dfs = [x.map(lambda y: float(y.replace("*", ""))) for x in dfs]

In [None]:
x_titles = ["Human"] + list(rename_model.values())

In [None]:
shape = att_scores["Mistral-7B-Instruct-v0.2"].iloc[:,:-1].to_numpy().shape

In [None]:
initialize_seeds()
random_atts = np.random.uniform(1, 5, size=shape, )

In [None]:
fig, axes = plt.subplots(2, 4, figsize=(16, 8), sharex=True, sharey=True)
plt.subplots_adjust(wspace=.1, hspace=.13)
# One colour bar on the right edge, shared by every panel.
cbar_ax = fig.add_axes([.91, .11, .01, .8])
for idx, df in enumerate(dfs):
    grid_row, grid_col = divmod(idx, 4)
    ax = axes[grid_row, grid_col]
    # Hide the strictly-upper triangle; the diagonal stays visible.
    mask = np.triu(np.ones_like(df, dtype=bool), k=1)
    sns.heatmap(df, ax=ax, mask=mask, annot=True, cmap="vlag",
                vmin=-1, vmax=1, center=0, cbar_ax=cbar_ax)
    ax.set_title(x_titles[idx])
    ax.set_ylabel("")
    ax.tick_params(axis='x', labelrotation=45)

In [None]:
fig.savefig("../persona-biases-paper/media/attitude_correlations.pdf",bbox_inches="tight")

In [None]:
def upper_triangular_to_vector(df, ternary=False):
    if ternary:
        df = df.map(lambda x: "0" if "*" not in x else x)
    df = df.map(lambda x: float(x.rstrip("*")))
    if ternary:
        df = df.map(lambda x: 1 if x >0 else (-1 if x < 0 else float(x)))
        return df.to_numpy()[np.triu_indices(len(df), k = 1)].reshape((-1))
    else:
        return df.to_numpy()[np.triu_indices(len(df), k = 1)].reshape((1,-1))

In [None]:
# Cosine similarity between the human correlation pattern and each model's pattern.
human_vector = upper_triangular_to_vector(human_corr)
similarities = {}
for model, corr in corrs.items():
    model_vector = upper_triangular_to_vector(corr)
    similarities[model] = cosine_similarity(human_vector, model_vector)[0]

In [None]:
similarities = pd.DataFrame.from_dict(similarities).rename(columns=rename_model)

In [None]:
similarities = similarities[[x for x in rename_model.values()]]

In [None]:
random_corr = get_correlations(pd.DataFrame(random_atts))

In [None]:
random_sim =cosine_similarity(upper_triangular_to_vector(human_corr), upper_triangular_to_vector(random_corr))[0]

In [None]:
similarities

In [None]:
random_sim

In [None]:
sns.barplot(similarities)
plt.axhline(random_sim)
plt.xticks(rotation=45)

In [None]:
similarity_attitudes = similarities

In [None]:
random_attitudes = random_sim

In [None]:
# correlations = {}
# for model, corr in corrs.items():
#     correlations[model] = pearsonr(upper_triangular_to_vector(human_corr).reshape((-1)), upper_triangular_to_vector(corr).reshape((-1))).statistic

In [None]:
# correlations = pd.DataFrame.from_dict(correlations, orient="index").T.rename(columns=rename_model)

In [None]:
# sns.barplot(correlations)
# plt.xticks(rotation=45)

In [None]:
# similarities_ternary = {}
# for model, corr in corrs.items():
#     similarities_ternary[model] = accuracy_score(upper_triangular_to_vector(human_corr, ternary=True), upper_triangular_to_vector(corr, ternary=True))

In [None]:
# similarities_ternary = pd.DataFrame.from_dict(similarities_ternary, orient="index").T.rename(columns=rename_model)

In [None]:
# sns.barplot(similarities_ternary)
# plt.xticks(rotation=45)

#### Correlation between attitudes and annotations (humans vs personas)

In [None]:
data = data.rename(columns=rename_attitudes)

In [None]:
human_attitudeXannotation = pd.DataFrame(index=annotators["annId"])

In [None]:
human_attitudeXannotation["off_antiBlack"] =  data[data.targetsBlackPeople].groupby("annId")["off_avg"].mean()

In [None]:
human_attitudeXannotation["rac_antiBlack"] = data[data.targetsBlackPeople].groupby("annId")["racist"].mean()

In [None]:
human_attitudeXannotation["off_aae"] =  data[data.isAAE].groupby("annId")["off_avg"].mean()

In [None]:
human_attitudeXannotation["rac_aae"] = data[data.isAAE].groupby("annId")["racist"].mean()

In [None]:
human_attitudeXannotation["off_vulgar"] =  data[data.vulgar].groupby("annId")["off_avg"].mean()

In [None]:
human_attitudeXannotation["rac_vulgar"] = data[data.vulgar].groupby("annId")["racist"].mean()

In [None]:
human_attitudeXannotation = human_attitudeXannotation.merge(data.drop_duplicates("annId")[[x for x in rename_attitudes.values()] + ["annId"]], on="annId")

In [None]:
human_corrs = get_correlations(human_attitudeXannotation).iloc[7:,1:7]

In [None]:
human_corrs

In [None]:
merged_df = pd.concat([att_scores["gpt-4-0125-preview"], all_off["GPT-4"]], axis=1)

In [None]:
# One row per distinct tweet. Its boolean flags select columns of the prediction
# matrices; computed once here instead of once per flag and per model.
# NOTE(review): this assumes prediction columns follow the order of the de-duplicated tweets — confirm.
unique_tweets = data.drop_duplicates("tweet")
tweet_subsets = {
    "antiBlack": unique_tweets.targetsBlackPeople,
    "aae": unique_tweets.isAAE,
    "vulgar": unique_tweets.vulgar,
}

corrs = {}
for model in att_scores.keys():
    off_preds = np.vstack(all_off[rename_model[model]].to_numpy())
    rac_preds = np.vstack(all_rac[rename_model[model]].to_numpy())
    attitudes = att_scores[model].drop(columns="persona_cat")
    # Mean rating per persona over each tweet subset; offensiveness columns first,
    # then racism, each in antiBlack / aae / vulgar order.
    for prefix, preds in (("off", off_preds), ("rac", rac_preds)):
        for subset_name, flag in tweet_subsets.items():
            attitudes[f"{prefix}_{subset_name}"] = preds[:, flag].mean(1)
    # Keep the same rows and columns as the human table so the two are comparable.
    corrs[model] = get_correlations(attitudes).T.loc[human_corrs.index][human_corrs.columns]

In [None]:
random_atts = pd.DataFrame(random_atts, index=attitudes.index, columns=attitudes.columns[:7])

In [None]:
shape = rac_preds[:,data.drop_duplicates("tweet").isAAE].mean(1).shape

In [None]:
initialize_seeds()
# NOTE(review): initialize_seeds() is also called before the earlier random attitude
# draw; if it resets to a fixed seed, these columns reuse the start of the same random
# stream — confirm this is intended.
# The draw order below is part of the result and must not be changed.
random_annotation_columns = (
    "off_antiBlack",
    "off_aae",
    "off_vulgar",
    "rac_antiBlack",
    "rac_aae",
    "rac_vulgar",
)
for column in random_annotation_columns:
    random_atts[column] = np.random.uniform(1, 5, size=shape)

In [None]:
corrs["gpt-4-0125-preview"]

In [None]:
corrs["gpt-3.5-turbo-0125"]

In [None]:
corrs["Mixtral-8x7B-Instruct-v0.1"]

In [None]:
corrs["zephyr-7b-beta"]

In [None]:
corrs["gemma-7b-it"]

In [None]:
corrs["gemma-2b-it"]

In [None]:
dfs = [human_corrs] + [corrs[model] for model in rename_model.keys()]

In [None]:
dfs = [x.map(lambda y: float(y.replace("*", ""))) for x in dfs]

In [None]:
x_titles = ["Human"] + list(rename_model.values())

In [None]:
fig, axes = plt.subplots(2, 4, figsize=(16, 8), sharex=True, sharey=True)
plt.subplots_adjust(wspace=.1, hspace=.13)
# One colour bar on the right edge, shared by every panel.
cbar_ax = fig.add_axes([.91, .11, .01, .8])
heatmap_kwargs = dict(vmin=-1, vmax=1, center=0, annot=True, cmap="vlag")
for idx, df in enumerate(dfs):
    grid_row, grid_col = divmod(idx, 4)
    ax = axes[grid_row, grid_col]
    sns.heatmap(df, ax=ax, cbar_ax=cbar_ax, **heatmap_kwargs)
    ax.set_title(x_titles[idx])
    ax.set_ylabel("")
    ax.tick_params(axis='x', labelrotation=45)

In [None]:
fig.savefig("../persona-biases-paper/media/attitudeXannotation_correlations.pdf",bbox_inches="tight")

In [None]:
def matrix_to_vector(df, ternary=False):
    if ternary:
        df = df.map(lambda x: "0" if "*" not in x else x)
    df = df.map(lambda x: float(x.rstrip("*")))
    if ternary:
        df = df.map(lambda x: 1 if x >0 else (-1 if x < 0 else float(x)))
        return df.to_numpy().flatten()
    else:
        return df.to_numpy().flatten()

In [None]:
# Cosine similarity between the flattened human table and each model's table.
human_vector = [matrix_to_vector(human_corrs)]
similarities = {}
for model, corr in corrs.items():
    model_vector = [matrix_to_vector(corr)]
    similarities[model] = cosine_similarity(human_vector, model_vector)[0]

In [None]:
similarities = pd.DataFrame.from_dict(similarities).rename(columns=rename_model)

In [None]:
similarities = similarities[[x for x in rename_model.values()]]

In [None]:
random_corr = get_correlations(random_atts).T.loc[human_corrs.index][human_corrs.columns]

In [None]:
random_sim = cosine_similarity([matrix_to_vector(human_corrs)], [matrix_to_vector(random_corr)])[0]

In [None]:
similarities

In [None]:
sns.barplot(similarities)
plt.axhline(random_sim)
plt.xticks(rotation=45)

In [None]:
similarity_attitudesXannotation = similarities

In [None]:
random_attitudesXannotation = random_sim

In [None]:
dfs = [similarity_attitudes, similarity_attitudesXannotation]

In [None]:
randoms = [random_attitudes, random_attitudesXannotation]

In [None]:
x_titles = ["Corr. between attitudes", "Corr. between attitudes and annotations"]

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(8, 4), sharex=True, sharey=True)
plt.subplots_adjust(wspace=.1, hspace=.13)
for idx, df in enumerate(dfs):
    ax = axes[idx]
    panel_title = x_titles[idx]
    random_baseline = randoms[idx]
    sns.barplot(df, ax=ax)
    # Black horizontal line: similarity obtained with the random scores.
    ax.axhline(random_baseline, color="black")
    ax.set_title(panel_title)
    ax.set_ylabel("")
    ax.tick_params(axis='x', labelrotation=45)

In [None]:
fig.savefig("../persona-biases-paper/media/modelHumanSimilarity.pdf",bbox_inches="tight")

In [None]:
# correlations = {}
# for model, corr in corrs.items():
#     correlations[model] = pearsonr(matrix_to_vector(human_corrs), matrix_to_vector(corr)).statistic

In [None]:
# correlations = pd.DataFrame.from_dict(correlations, orient="index").T.rename(columns=rename_model)

In [None]:
# sns.barplot(correlations)
# plt.xticks(rotation=45)

In [None]:
# similarities_ternary = {}
# for model, corr in corrs.items():
#     similarities_ternary[model] = accuracy_score(matrix_to_vector(human_corrs, ternary=True), matrix_to_vector(corr, ternary=True))

In [None]:
# similarities_ternary = pd.DataFrame.from_dict(similarities_ternary, orient="index").T.rename(columns=rename_model)

In [None]:
# sns.barplot(similarities_ternary)
# plt.xticks(rotation=45)

# TruthfulQA

## Persona variability

In [None]:
hits = get_results_df("truthfulqa_hits.csv")

In [None]:
hits_means = hits.iloc[:,:-1].map(np.mean)

In [None]:
hits_means = hits_means[[x for x in rename_model.values()]].copy()

In [None]:
hits_means["persona_cat"] = hits["persona_cat"]

In [None]:
hits_means_stacked = hits_means[hits_means.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Accuracy'})

In [None]:
# Personas that are not listed in persona_cat_dict are labelled "control".
hits_means_stacked["persona_cat"] = [
    persona_cat_dict.get(persona, "control") for persona in hits_means_stacked["index"]
]

In [None]:
all_hits_stacked.append(hits_means_stacked)

In [None]:
# Split the long-format accuracies so the control and empty personas get their own markers.
persona_cats = hits_means_stacked["persona_cat"]
no_control = hits_means_stacked[~persona_cats.isin(["control", "empty"])]
only_control = hits_means_stacked[persona_cats == "control"]
only_empty = hits_means_stacked[persona_cats == "empty"]

ax1 = sns.stripplot(data=no_control, x="Model", y="Accuracy", hue="persona_cat", marker="o",
                    palette=sns.color_palette("Paired"))


# Named `offset` like the sibling refusal cells.
def offset(p):
    """Horizontal translation by `p` points (1/72 inch) in the current figure's dpi units."""
    return transforms.ScaledTranslation(p/72., 0, plt.gcf().dpi_scale_trans)


trans = plt.gca().transData
# Control markers are nudged 15 pt to the right, the empty persona 15 pt to the left.
ax2 = sns.stripplot(data=only_control, x="Model", y="Accuracy", color="black", marker="X",
                    transform=trans+offset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Accuracy", color="red", marker="*", size=10,
                    transform=trans+offset(-15))
plt.xticks(rotation=45)

# Proxy legend entries for the two special personas, which are drawn without a hue.
control_label, empty_label = "control", "empty"
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
hits_means.boxplot(by="persona_cat", rot=90, figsize=(15,15))

In [None]:
# For every column except the last: std along axis 0 of the stacked per-row arrays,
# then averaged over the remaining axis.
inter_std = {}
for col in hits.columns[:-1]:
    stacked_hits = np.stack(hits[col].to_numpy())
    inter_std[col] = stacked_hits.std(0).mean()
    

In [None]:
pd.Series(inter_std).sort_values()

In [None]:
hits_means.corr(numeric_only=True, method="kendall")

In [None]:
hits_means["avg"] = hits_means.mean(1, numeric_only=True)

In [None]:
hits_means.sort_values("avg")

In [None]:
hits_means.idxmax()

In [None]:
hits_means.idxmin()

In [None]:
# Rank of every persona per model (1 = highest accuracy). Ties are broken by order of
# appearance and missing values are ranked last.
# Model columns are selected by name rather than by a positional slice, because the
# number of trailing helper columns ("persona_cat", "avg") depends on which cells ran.
# No global named `sorted` is created, so the builtin stays usable in later cells.
pos_df = pd.DataFrame(index=hits_means.index)
for col in rename_model.values():
    pos_df[col] = (
        hits_means[col]
        .rank(ascending=False, method="first", na_option="bottom")
        .astype(int)
    )

In [None]:
pos_df.loc["a_person_of_atheism_belief"]

In [None]:
pos_df.loc["a_asexual_person"]

In [None]:
pos_df.loc["a_statistician"]

In [None]:
pos_df.loc["Adolf_Hitler"]

In [None]:
pos_df.loc["Jorg_Haider"]

In [None]:
pos_df.loc["a_person_with_fascism_ideology"]

#### Persona cat vs control

In [None]:
# Spread (std) of per-persona accuracy inside every persona category; the trailing
# "avg" column is dropped.
std_df = hits_means.groupby("persona_cat").std().iloc[:, :-1]
# Express each category's spread relative to the control personas. The baseline is
# selected by label instead of by position, so a category that sorts before
# "control" can never silently become the denominator.
std_df_ratios = std_df.drop(index="control") / std_df.loc["control"]
std_df_ratios["avg"] = std_df_ratios.mean(axis=1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(axis=0)
std_df_ratios

In [None]:
# Long-format table of the ratios; the trailing "avg" row and column are left out.
ratio_columns = {'level_0': 'index', 'level_1': 'Model', 0: 'Std. Dev. Ratio'}
df = (
    std_df_ratios.iloc[:-1, :-1]
    .rename(columns=rename_model)
    .stack()
    .reset_index()
    .rename(columns=ratio_columns)
)
a = sns.stripplot(
    data=df,
    x="Model",
    y="Std. Dev. Ratio",
    hue="persona_cat",
    marker="o",
    palette=sns.color_palette("Paired"),
)
a.axhline(1, linestyle='--')  # a ratio of 1 means the same spread as the baseline group
a.set_yscale('symlog')
plt.xticks(rotation=45)
# Put the legend outside the axes, anchored to the top-right corner of the plot.
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

In [None]:
performance_std_dfs.append(std_df)

## Refusal

In [None]:
truthfulqa_refusals = get_results_df("truthfulqa_refusals.csv")

In [None]:
truthfulqa_refusals = truthfulqa_refusals.rename(columns=rename_model)

In [None]:
truthfulqa_refusals = truthfulqa_refusals[list(rename_model.values()) + ["persona_cat"]]

In [None]:
truthfulqa_refusals_mean = truthfulqa_refusals.iloc[:,:-1].map(np.mean)

In [None]:
truthfulqa_refusals_mean.sort_values("Gemma-2b-inst")

In [None]:
truthfulqa_refusals_stacked = truthfulqa_refusals_mean.stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Refused Fraction'})

In [None]:
# Personas that are not listed in persona_cat_dict are labelled "control".
truthfulqa_refusals_stacked["persona_cat"] = [
    persona_cat_dict.get(persona, "control") for persona in truthfulqa_refusals_stacked["index"]
]

In [None]:
all_refusals_stacked.append(truthfulqa_refusals_stacked)

In [None]:
# Split the long-format refusal rates so the control and empty personas get their own markers.
persona_cats = truthfulqa_refusals_stacked["persona_cat"]
no_control = truthfulqa_refusals_stacked[~persona_cats.isin(["control", "empty"])]
only_control = truthfulqa_refusals_stacked[persona_cats == "control"]
only_empty = truthfulqa_refusals_stacked[persona_cats == "empty"]

ax1 = sns.stripplot(data=no_control, x="Model", y="Refused Fraction", hue="persona_cat", marker="o",
                    palette=sns.color_palette("Paired"))


def offset(p):
    """Horizontal translation by `p` points (1/72 inch) in the current figure's dpi units."""
    return transforms.ScaledTranslation(p/72., 0, plt.gcf().dpi_scale_trans)


trans = plt.gca().transData
# Control markers are nudged 15 pt to the right, the empty persona 15 pt to the left.
ax2 = sns.stripplot(data=only_control, x="Model", y="Refused Fraction", color="black", marker="X",
                    transform=trans+offset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Refused Fraction", color="red", marker="*", size=10,
                    transform=trans+offset(-15))
plt.xticks(rotation=45)

# Proxy legend entries for the two special personas, which are drawn without a hue.
control_label, empty_label = "control", "empty"
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
truthfulqa_refusals_mean.corr(numeric_only=True, method="kendall")

In [None]:
truthfulqa_refusals_mean["avg"] = truthfulqa_refusals_mean.mean(1, numeric_only=True)

In [None]:
truthfulqa_refusals_mean.sort_values("avg")

In [None]:
truthfulqa_refusals_mean.idxmax()

In [None]:
truthfulqa_refusals_mean.idxmin()

In [None]:
# Rank of every persona per model (1 = largest refused fraction). Ties are broken by
# order of appearance and missing values are ranked last.
# Model columns are selected by name rather than by a positional slice: at this point
# only "avg" has been appended, so slicing off two trailing columns would also drop
# the last model.
# No global named `sorted` is created, so the builtin stays usable in later cells.
pos_df = pd.DataFrame(index=truthfulqa_refusals_mean.index)
for col in rename_model.values():
    pos_df[col] = (
        truthfulqa_refusals_mean[col]
        .rank(ascending=False, method="first", na_option="bottom")
        .astype(int)
    )

In [None]:
pos_df["avg"] = pos_df.mean(1)

In [None]:
pos_df.sort_values("avg")

#### Persona cat vs control

In [None]:
# Personas that are not listed in persona_cat_dict are labelled "control".
truthfulqa_refusals_mean["persona_cat"] = [
    persona_cat_dict.get(persona, "control") for persona in truthfulqa_refusals_mean.index
]

In [None]:
# Spread (std) of the per-persona refused fraction inside every persona category; the
# trailing "avg" column is dropped.
std_df = truthfulqa_refusals_mean.groupby("persona_cat").std().iloc[:, :-1]
# Express each category's spread relative to the control personas. The baseline is
# selected by label instead of by position, so a category that sorts before
# "control" can never silently become the denominator.
std_df_ratios = std_df.drop(index="control") / std_df.loc["control"]
std_df_ratios["avg"] = std_df_ratios.mean(axis=1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(axis=0)
std_df_ratios

In [None]:
std_df

In [None]:
# Long-format table of the ratios; the trailing "avg" row and column are left out.
ratio_columns = {'level_0': 'index', 'level_1': 'Model', 0: 'Std. Dev. Ratio'}
df = (
    std_df_ratios.iloc[:-1, :-1]
    .rename(columns=rename_model)
    .stack()
    .reset_index()
    .rename(columns=ratio_columns)
)
a = sns.stripplot(
    data=df,
    x="Model",
    y="Std. Dev. Ratio",
    hue="persona_cat",
    marker="o",
    palette=sns.color_palette("Paired"),
    log_scale=True,
)
a.axhline(1, linestyle='--')  # a ratio of 1 means the same spread as the baseline group
a.set_yscale('symlog')
plt.xticks(rotation=45)
# Put the legend outside the axes, anchored to the top-right corner of the plot.
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

In [None]:
refusal_std_dfs.append(std_df)

# MMLU

## Persona variability

In [None]:
hits = get_results_df("mmlu_hits.csv")

In [None]:
full_data =  load_dataset("cais/mmlu", "all")["test"]

In [None]:
# Row ids of the evaluated MMLU subset. The file is opened in a context manager so the
# handle is closed as soon as the ids are parsed.
with open("./data/mmlu/sampled_ids.json", "r") as ids_file:
    sampled_ids = json.load(ids_file)
sampled_data = full_data.select(sampled_ids)

In [None]:
def mmlu_score(hits):
    """Return the macro-averaged hit rate: the mean of the per-subject mean of ``hits``.

    ``hits`` is paired position by position with the ``subject`` column of ``full_data``
    when the lengths match, otherwise with that of ``sampled_data``.
    """
    reference = sampled_data
    if len(hits) == len(full_data):
        reference = full_data
    per_item = pd.DataFrame.from_dict({"hits": hits, "subject": reference["subject"]})
    subject_means = per_item.groupby("subject")["hits"].mean()
    return subject_means.mean()

In [None]:
def mmlu_diff(hits):
    """Return the gap between the best and the worst per-subject mean of ``hits``.

    ``hits`` is paired position by position with the ``subject`` column of ``full_data``
    when the lengths match, otherwise with that of ``sampled_data``.
    """
    reference = sampled_data
    if len(hits) == len(full_data):
        reference = full_data
    per_item = pd.DataFrame.from_dict({"hits": hits, "subject": reference["subject"]})
    subject_means = per_item.groupby("subject")["hits"].mean()
    return subject_means.max() - subject_means.min()

In [None]:
def mmlu_oracle(hits):
    """Return the gap between the best and the worst per-subject mean of ``hits``.

    NOTE(review): the body is identical to ``mmlu_diff``. If an "oracle" score was
    meant to be a different quantity, this looks like an unfinished copy — confirm.
    """
    # Pair the hits with the subject column of whichever reference set has a matching length.
    data = full_data if len(hits) == len(full_data) else sampled_data
    temp_df = pd.DataFrame.from_dict({"hits": hits, "subject": data["subject"]})
    return temp_df.groupby("subject")["hits"].mean().max() - temp_df.groupby("subject")["hits"].mean().min()

### Performance

In [None]:
hits_means = hits.iloc[:,:-1].map(mmlu_score)

In [None]:
hits_means = hits_means[[x for x in rename_model.values()]].copy()

In [None]:
hits_means["persona_cat"] = hits["persona_cat"]

In [None]:
hits_means_stacked = hits_means[hits_means.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Accuracy'})

In [None]:
# Personas that are not listed in persona_cat_dict are labelled "control".
hits_means_stacked["persona_cat"] = [
    persona_cat_dict.get(persona, "control") for persona in hits_means_stacked["index"]
]

In [None]:
all_hits_stacked.append(hits_means_stacked)

In [None]:
# Split the long-format accuracies so the control and empty personas get their own markers.
persona_cats = hits_means_stacked["persona_cat"]
no_control = hits_means_stacked[~persona_cats.isin(["control", "empty"])]
only_control = hits_means_stacked[persona_cats == "control"]
only_empty = hits_means_stacked[persona_cats == "empty"]

ax1 = sns.stripplot(data=no_control, x="Model", y="Accuracy", hue="persona_cat", marker="o",
                    palette=sns.color_palette("Paired"))


# Named `offset` like the sibling refusal cells.
def offset(p):
    """Horizontal translation by `p` points (1/72 inch) in the current figure's dpi units."""
    return transforms.ScaledTranslation(p/72., 0, plt.gcf().dpi_scale_trans)


trans = plt.gca().transData
# Control markers are nudged 15 pt to the right, the empty persona 15 pt to the left.
ax2 = sns.stripplot(data=only_control, x="Model", y="Accuracy", color="black", marker="X",
                    transform=trans+offset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Accuracy", color="red", marker="*", size=10,
                    transform=trans+offset(-15))
plt.xticks(rotation=45)

# Proxy legend entries for the two special personas, which are drawn without a hue.
control_label, empty_label = "control", "empty"
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
hits_means.boxplot(by="persona_cat", rot=90, figsize=(15,15))

In [None]:
# For every column except the last: std along axis 0 of the stacked per-row arrays,
# then averaged over the remaining axis.
inter_std = {}
for col in hits.columns[:-1]:
    stacked_hits = np.stack(hits[col].to_numpy())
    inter_std[col] = stacked_hits.std(0).mean()
    

In [None]:
pd.Series(inter_std).sort_values()

In [None]:
hits_means.corr(numeric_only=True, method="kendall")

In [None]:
hits_means["avg"] = hits_means.mean(1, numeric_only=True)

In [None]:
hits_means.sort_values("avg")

In [None]:
hits_means.idxmax()

In [None]:
hits_means.idxmin()

In [None]:
# Rank of every persona per model (1 = highest accuracy). Ties are broken by order of
# appearance and missing values are ranked last.
# Model columns are selected by name rather than by a positional slice, because the
# number of trailing helper columns ("persona_cat", "avg") depends on which cells ran.
# No global named `sorted` is created, so the builtin stays usable in later cells.
pos_df = pd.DataFrame(index=hits_means.index)
for col in rename_model.values():
    pos_df[col] = (
        hits_means[col]
        .rank(ascending=False, method="first", na_option="bottom")
        .astype(int)
    )

In [None]:
pos_df.loc["empty"]

In [None]:
pos_df.loc["a_person_with_fascism_ideology"]

#### Persona cat vs control

In [None]:
# Spread (std) of per-persona accuracy inside every persona category; the trailing
# "avg" column is dropped.
std_df = hits_means.groupby("persona_cat").std().iloc[:, :-1]
# Express each category's spread relative to the control personas. The baseline is
# selected by label instead of by position, so a category that sorts before
# "control" can never silently become the denominator.
std_df_ratios = std_df.drop(index="control") / std_df.loc["control"]
std_df_ratios["avg"] = std_df_ratios.mean(axis=1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(axis=0)
std_df_ratios

In [None]:
# Long-format table of the ratios; the trailing "avg" row and column are left out.
ratio_columns = {'level_0': 'index', 'level_1': 'Model', 0: 'Std. Dev. Ratio'}
df = (
    std_df_ratios.iloc[:-1, :-1]
    .rename(columns=rename_model)
    .stack()
    .reset_index()
    .rename(columns=ratio_columns)
)
a = sns.stripplot(
    data=df,
    x="Model",
    y="Std. Dev. Ratio",
    hue="persona_cat",
    marker="o",
    palette=sns.color_palette("Paired"),
)
a.axhline(1, linestyle='--')  # a ratio of 1 means the same spread as the baseline group
a.set_yscale('symlog')
plt.xticks(rotation=45)
# Put the legend outside the axes, anchored to the top-right corner of the plot.
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

In [None]:
performance_std_dfs.append(std_df)

### Performance discrepancy (inter-subject)

In [None]:
hits_diffs = hits.iloc[:,:-1].map(mmlu_diff)

In [None]:
hits_diffs = hits_diffs[[x for x in rename_model.values()]].copy()

In [None]:
hits_diffs["persona_cat"] = hits["persona_cat"]

In [None]:
hits_diffs.sort_values("GPT-3.5")

In [None]:
hits_diffs_stacked = hits_diffs[hits_diffs.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Diff. in Accuracy'})

In [None]:
# Personas that are not listed in persona_cat_dict are labelled "control".
hits_diffs_stacked["persona_cat"] = [
    persona_cat_dict.get(persona, "control") for persona in hits_diffs_stacked["index"]
]

In [None]:
# Split the long-format accuracy gaps so the control and empty personas get their own markers.
persona_cats = hits_diffs_stacked["persona_cat"]
no_control = hits_diffs_stacked[~persona_cats.isin(["control", "empty"])]
only_control = hits_diffs_stacked[persona_cats == "control"]
only_empty = hits_diffs_stacked[persona_cats == "empty"]

ax1 = sns.stripplot(data=no_control, x="Model", y="Diff. in Accuracy", hue="persona_cat", marker="o",
                    palette=sns.color_palette("Paired"))


# Named `offset` like the sibling refusal cells.
def offset(p):
    """Horizontal translation by `p` points (1/72 inch) in the current figure's dpi units."""
    return transforms.ScaledTranslation(p/72., 0, plt.gcf().dpi_scale_trans)


trans = plt.gca().transData
# Control markers are nudged 15 pt to the right, the empty persona 15 pt to the left.
ax2 = sns.stripplot(data=only_control, x="Model", y="Diff. in Accuracy", color="black", marker="X",
                    transform=trans+offset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Diff. in Accuracy", color="red", marker="*", size=10,
                    transform=trans+offset(-15))
plt.xticks(rotation=45)

# Proxy legend entries for the two special personas, which are drawn without a hue.
control_label, empty_label = "control", "empty"
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
hits_diffs.boxplot(by="persona_cat", rot=90, figsize=(15,15))

In [None]:
hits_diffs.corr(numeric_only=True, method="kendall")

#### Persona cat vs control

In [None]:
# Spread (std) of the per-persona accuracy gap inside every persona category.
# hits_diffs holds only the model columns plus "persona_cat" (no "avg" column), so the
# model columns are selected by name; a positional `[:, :-1]` slice would discard the
# last model instead of a helper column.
model_columns = list(rename_model.values())
std_df = hits_diffs.groupby("persona_cat")[model_columns].std()
# Express each category's spread relative to the control personas. The baseline is
# selected by label instead of by position, so a category that sorts before
# "control" can never silently become the denominator.
std_df_ratios = std_df.drop(index="control") / std_df.loc["control"]
std_df_ratios["avg"] = std_df_ratios.mean(axis=1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(axis=0)
std_df_ratios

In [None]:
# Long-format table of the ratios; the trailing "avg" row and column are left out.
ratio_columns = {'level_0': 'index', 'level_1': 'Model', 0: 'Std. Dev. Ratio'}
df = (
    std_df_ratios.iloc[:-1, :-1]
    .rename(columns=rename_model)
    .stack()
    .reset_index()
    .rename(columns=ratio_columns)
)
a = sns.stripplot(
    data=df,
    x="Model",
    y="Std. Dev. Ratio",
    hue="persona_cat",
    marker="o",
    palette=sns.color_palette("Paired"),
)
a.axhline(1, linestyle='--')  # a ratio of 1 means the same spread as the baseline group
a.set_yscale('symlog')
plt.xticks(rotation=45)
# Put the legend outside the axes, anchored to the top-right corner of the plot.
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

## Refusal

In [None]:
mmlu_refusals = get_results_df("mmlu_refusals.csv")

In [None]:
mmlu_refusals = mmlu_refusals.rename(columns=rename_model)

In [None]:
mmlu_refusals = mmlu_refusals[list(rename_model.values()) + ["persona_cat"]]

In [None]:
mmlu_refusals_mean = mmlu_refusals.iloc[:,:-1].map(np.mean)

In [None]:
mmlu_refusals_mean.sort_values("Gemma-7b-inst")

In [None]:
mmlu_refusals_stacked = mmlu_refusals_mean.stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Refused Fraction'})

In [None]:
# Personas that are not listed in persona_cat_dict are labelled "control".
mmlu_refusals_stacked["persona_cat"] = [
    persona_cat_dict.get(persona, "control") for persona in mmlu_refusals_stacked["index"]
]

In [None]:
all_refusals_stacked.append(mmlu_refusals_stacked)

In [None]:
# Split the long-format refusal rates so the control and empty personas get their own markers.
persona_cats = mmlu_refusals_stacked["persona_cat"]
no_control = mmlu_refusals_stacked[~persona_cats.isin(["control", "empty"])]
only_control = mmlu_refusals_stacked[persona_cats == "control"]
only_empty = mmlu_refusals_stacked[persona_cats == "empty"]

ax1 = sns.stripplot(data=no_control, x="Model", y="Refused Fraction", hue="persona_cat", marker="o",
                    palette=sns.color_palette("Paired"))


def offset(p):
    """Horizontal translation by `p` points (1/72 inch) in the current figure's dpi units."""
    return transforms.ScaledTranslation(p/72., 0, plt.gcf().dpi_scale_trans)


trans = plt.gca().transData
# Control markers are nudged 15 pt to the right, the empty persona 15 pt to the left.
ax2 = sns.stripplot(data=only_control, x="Model", y="Refused Fraction", color="black", marker="X",
                    transform=trans+offset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Refused Fraction", color="red", marker="*", size=10,
                    transform=trans+offset(-15))
plt.xticks(rotation=45)

# Proxy legend entries for the two special personas, which are drawn without a hue.
control_label, empty_label = "control", "empty"
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
mmlu_refusals_mean.corr(numeric_only=True, method="kendall")

In [None]:
mmlu_refusals_mean["avg"] = mmlu_refusals_mean.mean(1, numeric_only=True)

In [None]:
mmlu_refusals_mean.sort_values("avg")

In [None]:
mmlu_refusals_mean.idxmax()

In [None]:
mmlu_refusals_mean.idxmin()

In [None]:
# Rank of every persona per model (1 = largest refused fraction). Ties are broken by
# order of appearance and missing values are ranked last.
# Model columns are selected by name rather than by a positional slice: at this point
# only "avg" has been appended, so slicing off two trailing columns would also drop
# the last model.
# No global named `sorted` is created, so the builtin stays usable in later cells.
pos_df = pd.DataFrame(index=mmlu_refusals_mean.index)
for col in rename_model.values():
    pos_df[col] = (
        mmlu_refusals_mean[col]
        .rank(ascending=False, method="first", na_option="bottom")
        .astype(int)
    )

In [None]:
pos_df["avg"] = pos_df.mean(1)

In [None]:
pos_df.sort_values("avg")

#### Persona cat vs control

In [None]:
# Personas that are not listed in persona_cat_dict are labelled "control".
mmlu_refusals_mean["persona_cat"] = [
    persona_cat_dict.get(persona, "control") for persona in mmlu_refusals_mean.index
]

In [None]:
# Spread (std) of the per-persona refused fraction inside every persona category; the
# trailing "avg" column is dropped.
std_df = mmlu_refusals_mean.groupby("persona_cat").std().iloc[:, :-1]
# Express each category's spread relative to the control personas. The baseline is
# selected by label instead of by position, so a category that sorts before
# "control" can never silently become the denominator.
std_df_ratios = std_df.drop(index="control") / std_df.loc["control"]
std_df_ratios["avg"] = std_df_ratios.mean(axis=1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(axis=0)
std_df_ratios

In [None]:
# Long-format table of the ratios; the trailing "avg" row and column are left out.
ratio_columns = {'level_0': 'index', 'level_1': 'Model', 0: 'Std. Dev. Ratio'}
df = (
    std_df_ratios.iloc[:-1, :-1]
    .rename(columns=rename_model)
    .stack()
    .reset_index()
    .rename(columns=ratio_columns)
)
a = sns.stripplot(
    data=df,
    x="Model",
    y="Std. Dev. Ratio",
    hue="persona_cat",
    marker="o",
    palette=sns.color_palette("Paired"),
    log_scale=True,
)
a.axhline(1, linestyle='--')  # a ratio of 1 means the same spread as the baseline group
a.set_yscale('symlog')
plt.xticks(rotation=45)
# Put the legend outside the axes, anchored to the top-right corner of the plot.
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

In [None]:
refusal_std_dfs.append(std_df)

# BBQ

## Persona variability

In [None]:
hits = get_results_df("bbq_hits.csv")

In [None]:
biased = get_results_df("bbq_biased.csv")

In [None]:
unknown = get_results_df("bbq_unknown.csv")

In [None]:
subsets = [
        "Age",
        "Disability_status",
        "Gender_identity",
        "Nationality",
        "Physical_appearance",
        "Race_ethnicity",
        "Race_x_gender",
        "Race_x_SES",
        "Religion",
        "SES",
        "Sexual_orientation",
    ]
BBQ  = {subset: load_dataset("heegyu/bbq", subset)["test"] for subset in subsets}

In [None]:
# Row ids of the evaluated BBQ subset, keyed by subset name. The file is opened in a
# context manager so the handle is closed as soon as the ids are parsed.
with open("./data/bbq/sampled_ids.json", "r") as ids_file:
    sampled_ids = json.load(ids_file)
# Same subset order for both lists, so they can be concatenated consistently afterwards.
sampled_data = [data.select(sampled_ids[subset]) for subset, data in BBQ.items()]
full_data = [data for data in BBQ.values()]

In [None]:
full_data = concatenate_datasets(full_data)
sampled_data = concatenate_datasets(sampled_data)

In [None]:
sampled_data[5577]

In [None]:
full_data = full_data.to_pandas()
sampled_data = sampled_data.to_pandas()

In [None]:
bbq_suppl_df = pd.read_csv("./data/bbq/additional_metadata.csv")

In [None]:
# Attach target_loc and Relevant_social_values from the supplementary metadata,
# matched on (category, example_id).
# NOTE(review): an inner merge drops rows without a metadata match and returns a fresh
# 0..n-1 index. Later code pairs result arrays with these frames by position and picks
# full_data vs sampled_data by length — confirm row count and order are preserved.
full_data = pd.merge(left=full_data, right=bbq_suppl_df[["category", "example_id", "target_loc", "Relevant_social_values"]].drop_duplicates(), on=["category", "example_id"], how="inner")
sampled_data = pd.merge(left=sampled_data, right=bbq_suppl_df[["category", "example_id", "target_loc", "Relevant_social_values"]].drop_duplicates(), on=["category", "example_id"], how="inner")

In [None]:
full_data["group"] = [", ".join(x["stereotyped_groups"]) for x in full_data["additional_metadata"].tolist()]
sampled_data["group"] = [", ".join(x["stereotyped_groups"]) for x in sampled_data["additional_metadata"].tolist()]

In [None]:
# Bundle the three result frames cell by cell: every cell becomes the 3-element list
# [hits, biased, unknown] (each value is wrapped in a list and the lists are concatenated with `+`).
# NOTE: this rebinds `hits`, so the cell is not idempotent — re-running it wraps the
# already bundled cells again. Reload hits/biased/unknown before re-running.
hits = (hits.map(lambda x: [x]) + biased.map(lambda x: [x]) + unknown.map(lambda x: [x]))

In [None]:
def bbq_score(hits, score="acc", grouping="category"):
    """Aggregate one (persona, model) cell of BBQ results into a single score.

    Parameters
    ----------
    hits : sequence of three equal-length sequences
        ``(hits, biased, unknown)`` per-question values. ``hits`` is averaged
        to obtain accuracy, ``biased`` is summed, and ``1 - unknown`` is summed
        to count the answered questions (presumably all three are 0/1
        indicators — confirm against the result CSVs).
    score : {"acc", "bias"}
        ``"acc"``: mean over groups of the per-group accuracy.
        ``"bias"``: mean over context conditions of the mean per-group bias.
    grouping : str
        Column of the BBQ metadata frame used to form the groups.

    Returns
    -------
    float
        The aggregated score.

    Raises
    ------
    ValueError
        If ``score`` is not one of the supported values.

    Notes
    -----
    The metadata frame is the module-level ``full_data`` when the number of
    questions matches its length, otherwise ``sampled_data``.
    """
    if score not in ("acc", "bias"):
        raise ValueError(f"Unknown score {score!r}; expected 'acc' or 'bias'.")

    hits, biased, unknown = [np.asarray(x) for x in hits]
    data = full_data if len(hits) == len(full_data) else sampled_data

    if score == "acc":
        per_question = pd.DataFrame({"hits": hits, grouping: data[grouping].to_numpy()})
        return per_question.groupby(grouping)["hits"].mean().mean()

    # Bias is only computed on questions with a known target location. The mask
    # is applied positionally to *all three* answer arrays so they stay aligned
    # with the metadata rows. (Indexing the full-length `biased` / `unknown`
    # arrays with positions taken after dropping rows would shift every
    # question that follows a dropped one.)
    valid = data["target_loc"].notna().to_numpy()
    answers = pd.DataFrame({
        "hits": hits[valid],
        "biased": biased[valid],
        "unknown": unknown[valid],
        grouping: data[grouping].to_numpy()[valid],
        "context_condition": data["context_condition"].to_numpy()[valid],
    })

    context_means = []
    for context, context_df in answers.groupby("context_condition"):
        biases_cat = []
        for _, group_df in context_df.groupby(grouping):
            answered = (1 - group_df["unknown"]).sum()
            # A group in which every question was answered "unknown" has no
            # answered questions; let the division produce nan/inf quietly.
            with np.errstate(divide="ignore", invalid="ignore"):
                bias = 2 * np.divide(group_df["biased"].sum(), answered) - 1
            accuracy = group_df["hits"].mean()
            if context == "disambig":
                biases_cat.append(bias)
            else:
                # Other contexts scale the bias by the error rate; a perfectly
                # accurate group contributes exactly 0 (this also avoids
                # propagating the nan from an empty denominator).
                biases_cat.append(0 if accuracy == 1 else (1 - accuracy) * bias)
        # NaN-skipping mean over the groups of this context.
        context_means.append(pd.Series(biases_cat, dtype=float).mean())
    return np.mean(context_means)

In [None]:
def bbq_diff(hits, score="acc", grouping="category"):
    """Spread (max - min) of a BBQ score across groups for one result cell.

    Parameters
    ----------
    hits : sequence of three equal-length sequences
        ``(hits, biased, unknown)`` per-question values. ``hits`` is averaged
        to obtain accuracy, ``biased`` is summed, and ``1 - unknown`` is summed
        to count the answered questions (presumably all three are 0/1
        indicators — confirm against the result CSVs).
    score : {"acc", "bias"}
        ``"acc"``: difference between the best and worst per-group accuracy.
        ``"bias"``: difference between the largest and smallest per-group
        bias, where each group's bias is averaged over context conditions.
    grouping : str
        Column of the BBQ metadata frame used to form the groups.

    Returns
    -------
    float
        ``max - min`` of the per-group score.

    Raises
    ------
    ValueError
        If ``score`` is not one of the supported values, or (for ``"bias"``)
        if the context conditions do not contain the same number of groups.

    Notes
    -----
    The metadata frame is the module-level ``full_data`` when the number of
    questions matches its length, otherwise ``sampled_data``.
    """
    if score not in ("acc", "bias"):
        raise ValueError(f"Unknown score {score!r}; expected 'acc' or 'bias'.")

    hits, biased, unknown = [np.asarray(x) for x in hits]
    data = full_data if len(hits) == len(full_data) else sampled_data

    if score == "acc":
        per_question = pd.DataFrame({"hits": hits, grouping: data[grouping].to_numpy()})
        group_acc = per_question.groupby(grouping)["hits"].mean()
        return group_acc.max() - group_acc.min()

    # Bias is only computed on questions with a known target location. The mask
    # is applied positionally to *all three* answer arrays so they stay aligned
    # with the metadata rows. (Indexing the full-length `biased` / `unknown`
    # arrays with positions taken after dropping rows would shift every
    # question that follows a dropped one.)
    valid = data["target_loc"].notna().to_numpy()
    answers = pd.DataFrame({
        "hits": hits[valid],
        "biased": biased[valid],
        "unknown": unknown[valid],
        grouping: data[grouping].to_numpy()[valid],
        "context_condition": data["context_condition"].to_numpy()[valid],
    })

    biases = {}
    for context, context_df in answers.groupby("context_condition"):
        biases_cat = []
        for _, group_df in context_df.groupby(grouping):
            answered = (1 - group_df["unknown"]).sum()
            # A group in which every question was answered "unknown" has no
            # answered questions; let the division produce nan/inf quietly.
            with np.errstate(divide="ignore", invalid="ignore"):
                bias = 2 * np.divide(group_df["biased"].sum(), answered) - 1
            accuracy = group_df["hits"].mean()
            if context == "disambig":
                biases_cat.append(bias)
            else:
                # Other contexts scale the bias by the error rate; a perfectly
                # accurate group contributes exactly 0.
                biases_cat.append(0 if accuracy == 1 else (1 - accuracy) * bias)
        biases[f"bias_{context}"] = biases_cat

    # Groups are matched across contexts by their (sorted) position, so every
    # context must contain the same groups; np.vstack raises otherwise.
    per_group = np.vstack(list(biases.values())).mean(0)
    return per_group.max() - per_group.min()

### Performance

#### Accuracies

In [None]:
hits_means = hits.iloc[:,:-1].map(bbq_score)

In [None]:
hits_means = hits_means[[x for x in rename_model.values()]].copy()

In [None]:
hits_means["persona_cat"] = hits["persona_cat"]

In [None]:
hits_means.sort_values("Gemma-7b-inst")

In [None]:
hits_means_stacked = hits_means[hits_means.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Accuracy'})

In [None]:
hits_means_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in hits_means_stacked["index"]]

In [None]:
all_hits_stacked.append(hits_means_stacked)

In [None]:
no_control = hits_means_stacked[~hits_means_stacked["persona_cat"].isin(["control", "empty"])]
only_control =  hits_means_stacked[hits_means_stacked["persona_cat"] == "control"]
only_empty = hits_means_stacked[hits_means_stacked["persona_cat"] == "empty"]
# ax, fix = plt.subplot(1, 1, figsize=(4,4))
ax1 = sns.stripplot(data=no_control, x="Model", y="Accuracy", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
racset = lambda p: transforms.ScaledTranslation(p/72.,0, plt.gcf().dpi_scale_trans)
trans = plt.gca().transData
ax2 = sns.stripplot(data=only_control, x="Model", y="Accuracy", color="black", marker="X", transform=trans+racset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Accuracy", color="red", marker="*", size=10, transform=trans+racset(-15))
plt.xticks(rotation=45)
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
control_label = "control"
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
empty_label = "empty"
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
# `hits["persona_cat"]` holds one list per persona (the same label repeated for
# the hits / biased / unknown tables); keep its first entry as a plain label.
# Reading from `hits` rather than from `hits_means` itself keeps this cell
# idempotent: re-running it no longer reduces the label to its first character.
hits_means["persona_cat"] = [x[0] for x in hits["persona_cat"]]

In [None]:
hits_means.boxplot(by="persona_cat", rot=90, figsize=(15,15))

In [None]:
hits_means.corr(numeric_only=True, method="kendall")

In [None]:
hits_means["avg"] = hits_means.mean(1, numeric_only=True)

In [None]:
hits_means.sort_values("avg")

In [None]:
# Persona with the highest accuracy per model. `hits_means` also carries the
# string column `persona_cat`, which arg-reductions reject, so restrict the
# reduction to the numeric columns.
hits_means.idxmax(numeric_only=True)

In [None]:
# Persona with the lowest accuracy per model. `hits_means` also carries the
# string column `persona_cat`, which arg-reductions reject, so restrict the
# reduction to the numeric columns.
hits_means.idxmin(numeric_only=True)

In [None]:
# Rank of every persona per model by BBQ accuracy (1 = highest accuracy).
# The last two columns of `hits_means` are `persona_cat` and `avg`, so
# `columns[:-2]` selects the model columns only.
pos_df = pd.DataFrame(index=hits_means.index)
for col in hits_means.columns[:-2]:
    # Personas ordered from best to worst for this model. The name avoids
    # shadowing the builtin `sorted` for the rest of the kernel session.
    ranked_personas = hits_means.sort_values(col, ascending=False).index
    # Assigning an indexed Series aligns the ranks back to `pos_df`'s row order.
    pos_df[col] = pd.Series(range(1, len(ranked_personas) + 1), index=ranked_personas)

In [None]:
pos_df.loc["a_nonbinary_person"]

In [None]:
pos_df.loc["a_person_with_fascism_ideology"]

#### Persona cat vs control

In [None]:
hits_means

In [None]:
std_df =  hits_means.groupby("persona_cat").std().iloc[:,:-1]
std_df_ratios = std_df.iloc[1:]/std_df.iloc[0]
std_df_ratios["avg"] = std_df_ratios.mean(1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(0)
std_df_ratios

In [None]:
df = std_df_ratios.iloc[:-1,:-1].rename(columns=rename_model).stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Std. Dev. Ratio'})
a = sns.stripplot(data=df, x="Model", y="Std. Dev. Ratio", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
a.axhline(1, linestyle='--')
a.set_yscale('symlog')
# ax.plot(theta, data.iloc[:,:-1].min(), color="green")
# ax.fill(theta, data.iloc[:,:-1].min(), alpha=0.25, label='_nolegend_', color="green")
# ax.plot(theta, data.iloc[:,:-1].max(), color="red")
# ax.fill(theta, data.iloc[:,:-1].max(), alpha=0.25, label='_nolegend_', color="red")
# add legend relative to top-left plot
plt.xticks(rotation=45)
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

In [None]:
performance_std_dfs.append(std_df)

#### Performance discrepancy (acc, inter-category)

In [None]:
hits_diffs = hits.iloc[:,:-1].map(bbq_diff)

In [None]:
hits_diffs = hits_diffs[[x for x in rename_model.values()]].copy()

In [None]:
hits_diffs["persona_cat"] = hits["persona_cat"]

In [None]:
hits_diffs.sort_values("GPT-3.5")

In [None]:
hits_diffs_stacked = hits_diffs[hits_diffs.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Diff. in Accuracy'})

In [None]:
hits_diffs_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in hits_diffs_stacked["index"]]

In [None]:
no_control = hits_diffs_stacked[~hits_diffs_stacked["persona_cat"].isin(["control", "empty"])]
only_control =  hits_diffs_stacked[hits_diffs_stacked["persona_cat"] == "control"]
only_empty = hits_diffs_stacked[hits_diffs_stacked["persona_cat"] == "empty"]
# ax, fix = plt.subplot(1, 1, figsize=(4,4))
ax1 = sns.stripplot(data=no_control, x="Model", y="Diff. in Accuracy", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
racset = lambda p: transforms.ScaledTranslation(p/72.,0, plt.gcf().dpi_scale_trans)
trans = plt.gca().transData
ax2 = sns.stripplot(data=only_control, x="Model", y="Diff. in Accuracy", color="black", marker="X", transform=trans+racset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Diff. in Accuracy", color="red", marker="*", size=10, transform=trans+racset(-15))
plt.xticks(rotation=45)
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
control_label = "control"
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
empty_label = "empty"
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
hits_diffs.corr(numeric_only=True, method="kendall")

#### Performance discrepancy (acc, inter-target_group)

In [None]:
hits_diffs = hits.iloc[:,:-1].map(lambda x: bbq_diff(x, grouping="group"))

In [None]:
hits_diffs = hits_diffs[[x for x in rename_model.values()]].copy()

In [None]:
hits_diffs["persona_cat"] = hits["persona_cat"]

In [None]:
hits_diffs.sort_values("GPT-3.5")

In [None]:
hits_diffs_stacked = hits_diffs[hits_diffs.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Diff. in Accuracy'})

In [None]:
hits_diffs_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in hits_diffs_stacked["index"]]

In [None]:
no_control = hits_diffs_stacked[~hits_diffs_stacked["persona_cat"].isin(["control", "empty"])]
only_control =  hits_diffs_stacked[hits_diffs_stacked["persona_cat"] == "control"]
only_empty = hits_diffs_stacked[hits_diffs_stacked["persona_cat"] == "empty"]
# ax, fix = plt.subplot(1, 1, figsize=(4,4))
ax1 = sns.stripplot(data=no_control, x="Model", y="Diff. in Accuracy", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
racset = lambda p: transforms.ScaledTranslation(p/72.,0, plt.gcf().dpi_scale_trans)
trans = plt.gca().transData
ax2 = sns.stripplot(data=only_control, x="Model", y="Diff. in Accuracy", color="black", marker="X", transform=trans+racset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Diff. in Accuracy", color="red", marker="*", size=10, transform=trans+racset(-15))
plt.xticks(rotation=45)
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
control_label = "control"
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
empty_label = "empty"
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
hits_diffs.corr(numeric_only=True, method="kendall")

#### Bias

In [None]:
tqdm.pandas()

In [None]:
bias_means = hits.iloc[:,:-1].progress_applymap(lambda x: bbq_score(x, score="bias"))

In [None]:
bias_means = bias_means[[x for x in rename_model.values()]].copy()

In [None]:
bias_means["persona_cat"] = [x[0] for x in hits["persona_cat"]]

In [None]:
bias_means.sort_values("GPT-4")

In [None]:
bias_means_stacked = bias_means[bias_means.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Bias'})

In [None]:
bias_means_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in bias_means_stacked["index"]]

In [None]:
no_control = bias_means_stacked[~bias_means_stacked["persona_cat"].isin(["control", "empty"])]
only_control =  bias_means_stacked[bias_means_stacked["persona_cat"] == "control"]
only_empty = bias_means_stacked[bias_means_stacked["persona_cat"] == "empty"]
# ax, fix = plt.subplot(1, 1, figsize=(4,4))
ax1 = sns.stripplot(data=no_control, x="Model", y="Bias", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
racset = lambda p: transforms.ScaledTranslation(p/72.,0, plt.gcf().dpi_scale_trans)
trans = plt.gca().transData
ax2 = sns.stripplot(data=only_control, x="Model", y="Bias", color="black", marker="X", transform=trans+racset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Bias", color="red", marker="*", size=10, transform=trans+racset(-15))
plt.xticks(rotation=45)
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
control_label = "control"
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
empty_label = "empty"
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
(bias_means.max(numeric_only=True) - bias_means.min(numeric_only=True)).sort_values()

In [None]:
bias_means.boxplot(by="persona_cat", rot=90, figsize=(15,15))

In [None]:
bias_means.corr(numeric_only=True, method="kendall")

In [None]:
bias_means["avg"] = bias_means.mean(1, numeric_only=True)

In [None]:
bias_means.sort_values("avg")

In [None]:
# Persona with the highest bias score per model. `bias_means` also carries the
# string column `persona_cat`, which arg-reductions reject, so restrict the
# reduction to the numeric columns.
bias_means.idxmax(numeric_only=True)

In [None]:
# Persona with the lowest bias score per model. `bias_means` also carries the
# string column `persona_cat`, which arg-reductions reject, so restrict the
# reduction to the numeric columns.
bias_means.idxmin(numeric_only=True)

In [None]:
# Rank of every persona per model by BBQ bias score (1 = highest bias).
# The last two columns of `bias_means` are `persona_cat` and `avg`, so
# `columns[:-2]` selects the model columns only.
pos_df = pd.DataFrame(index=bias_means.index)
for col in bias_means.columns[:-2]:
    # Personas ordered from highest to lowest bias for this model. The name
    # avoids shadowing the builtin `sorted` for the rest of the kernel session.
    ranked_personas = bias_means.sort_values(col, ascending=False).index
    # Assigning an indexed Series aligns the ranks back to `pos_df`'s row order.
    pos_df[col] = pd.Series(range(1, len(ranked_personas) + 1), index=ranked_personas)

In [None]:
pos_df.loc["a_nonbinary_person"]

In [None]:
pos_df.loc["a_person_with_conservatism_ideology"]

#### Persona cat vs control

In [None]:
# Per-category standard deviation of the *bias* scores. This section previously
# reused `hits_means` (accuracy), so the "bias" std-ratio plot and the
# `bias_std` table derived from it actually showed accuracy variability.
# `.iloc[:, :-1]` drops the trailing `avg` column.
std_df = bias_means.groupby("persona_cat").std().iloc[:, :-1]
# Ratio of every category's std to that of the first group.
# NOTE(review): assumes the first (alphabetically sorted) group is "control" — confirm.
std_df_ratios = std_df.iloc[1:] / std_df.iloc[0]
std_df_ratios["avg"] = std_df_ratios.mean(1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(0)
std_df_ratios

In [None]:
df = std_df_ratios.iloc[:-1,:-1].rename(columns=rename_model).stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Std. Dev. Ratio'})
a = sns.stripplot(data=df, x="Model", y="Std. Dev. Ratio", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
a.axhline(1, linestyle='--')
a.set_yscale('symlog')
# ax.plot(theta, data.iloc[:,:-1].min(), color="green")
# ax.fill(theta, data.iloc[:,:-1].min(), alpha=0.25, label='_nolegend_', color="green")
# ax.plot(theta, data.iloc[:,:-1].max(), color="red")
# ax.fill(theta, data.iloc[:,:-1].max(), alpha=0.25, label='_nolegend_', color="red")
# add legend relative to top-left plot
plt.xticks(rotation=45)
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

In [None]:
bias_std = df

#### Performance discrepancy (bias, inter-category)

In [None]:
hits_diffs = hits.iloc[:,:-1].progress_applymap(lambda x: bbq_diff(x, score="bias"))

In [None]:
hits_diffs = hits_diffs[[x for x in rename_model.values()]].copy()

In [None]:
hits_diffs["persona_cat"] = [x[0] for x in hits["persona_cat"]]

In [None]:
hits_diffs.sort_values("GPT-4")

In [None]:
hits_diffs_stacked = hits_diffs[hits_diffs.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Diff. in Bias'})

In [None]:
hits_diffs_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in hits_diffs_stacked["index"]]

In [None]:
no_control = hits_diffs_stacked[~hits_diffs_stacked["persona_cat"].isin(["control", "empty"])]
only_control =  hits_diffs_stacked[hits_diffs_stacked["persona_cat"] == "control"]
only_empty = hits_diffs_stacked[hits_diffs_stacked["persona_cat"] == "empty"]
# ax, fix = plt.subplot(1, 1, figsize=(4,4))
ax1 = sns.stripplot(data=no_control, x="Model", y="Diff. in Bias", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
racset = lambda p: transforms.ScaledTranslation(p/72.,0, plt.gcf().dpi_scale_trans)
trans = plt.gca().transData
ax2 = sns.stripplot(data=only_control, x="Model", y="Diff. in Bias", color="black", marker="X", transform=trans+racset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Diff. in Bias", color="red", marker="*", size=10, transform=trans+racset(-15))
plt.xticks(rotation=45)
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
control_label = "control"
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
empty_label = "empty"
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
hits_diffs.boxplot(by="persona_cat", rot=90, figsize=(15,15))

In [None]:
hits_diffs.corr(numeric_only=True, method="kendall")

#### Performance discrepancy (bias, inter-target_group)

In [None]:
hits_diffs = hits.iloc[:,:-1].progress_applymap(lambda x: bbq_diff(x, score="bias", grouping="group"))

In [None]:
hits_diffs = hits_diffs[[x for x in rename_model.values()]].copy()

In [None]:
hits_diffs["persona_cat"] = [x[0] for x in hits["persona_cat"]]

In [None]:
pd.set_option('display.max_rows', 200)
hits_diffs.sort_values("GPT-4")

In [None]:
hits_diffs_stacked = hits_diffs[hits_diffs.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Diff. in Bias'})

In [None]:
hits_diffs_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in hits_diffs_stacked["index"]]

In [None]:
no_control = hits_diffs_stacked[~hits_diffs_stacked["persona_cat"].isin(["control", "empty"])]
only_control =  hits_diffs_stacked[hits_diffs_stacked["persona_cat"] == "control"]
only_empty = hits_diffs_stacked[hits_diffs_stacked["persona_cat"] == "empty"]
# ax, fix = plt.subplot(1, 1, figsize=(4,4))
ax1 = sns.stripplot(data=no_control, x="Model", y="Diff. in Bias", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
racset = lambda p: transforms.ScaledTranslation(p/72.,0, plt.gcf().dpi_scale_trans)
trans = plt.gca().transData
ax2 = sns.stripplot(data=only_control, x="Model", y="Diff. in Bias", color="black", marker="X", transform=trans+racset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Diff. in Bias", color="red", marker="*", size=10, transform=trans+racset(-15))
plt.xticks(rotation=45)
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
control_label = "control"
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
empty_label = "empty"
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
hits_diffs.boxplot(by="persona_cat", rot=90, figsize=(15,15))

In [None]:
hits_diffs.corr(numeric_only=True, method="kendall")

## Refusal

In [None]:
bbq_refusals = get_results_df("bbq_refusals.csv")

In [None]:
bbq_refusals = bbq_refusals.rename(columns=rename_model)

In [None]:
bbq_refusals = bbq_refusals[list(rename_model.values()) + ["persona_cat"]]

In [None]:
bbq_refusals_mean = bbq_refusals.iloc[:,:-1].map(np.mean)

In [None]:
bbq_refusals_mean.sort_values("Gemma-7b-inst")

In [None]:
bbq_refusals_stacked = bbq_refusals_mean.stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Refused Fraction'})

In [None]:
bbq_refusals_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in bbq_refusals_stacked["index"]]

In [None]:
all_refusals_stacked.append(bbq_refusals_stacked)

In [None]:
no_control = bbq_refusals_stacked[~bbq_refusals_stacked["persona_cat"].isin(["control", "empty"])]
only_control =  bbq_refusals_stacked[bbq_refusals_stacked["persona_cat"] == "control"]
only_empty = bbq_refusals_stacked[bbq_refusals_stacked["persona_cat"] == "empty"]
# ax, fix = plt.subplot(1, 1, figsize=(4,4))
ax1 = sns.stripplot(data=no_control, x="Model", y="Refused Fraction", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
offset = lambda p: transforms.ScaledTranslation(p/72.,0, plt.gcf().dpi_scale_trans)
trans = plt.gca().transData
ax2 = sns.stripplot(data=only_control, x="Model", y="Refused Fraction", color="black", marker="X", transform=trans+offset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Refused Fraction", color="red", marker="*", size=10, transform=trans+offset(-15))
plt.xticks(rotation=45)
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
control_label = "control"
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
empty_label = "empty"
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
bbq_refusals_mean.corr(numeric_only=True, method="kendall")

In [None]:
bbq_refusals_mean["avg"] = bbq_refusals_mean.mean(1, numeric_only=True)

In [None]:
bbq_refusals_mean.sort_values("avg")

In [None]:
bbq_refusals_mean.idxmax()

In [None]:
bbq_refusals_mean.idxmin()

In [None]:
# Rank of every persona per model by refusal fraction (1 = most refusals).
# At this point `bbq_refusals_mean` holds the model columns plus `avg` only
# (`persona_cat` is added further down), so `columns[:-2]` silently dropped the
# last model. Iterate over the model names explicitly instead.
pos_df = pd.DataFrame(index=bbq_refusals_mean.index)
for col in rename_model.values():
    # Personas ordered from most to fewest refusals for this model. The name
    # avoids shadowing the builtin `sorted` for the rest of the kernel session.
    ranked_personas = bbq_refusals_mean.sort_values(col, ascending=False).index
    # Assigning an indexed Series aligns the ranks back to `pos_df`'s row order.
    pos_df[col] = pd.Series(range(1, len(ranked_personas) + 1), index=ranked_personas)

In [None]:
pos_df["avg"] = pos_df.mean(1)

In [None]:
pos_df.sort_values("avg")

#### Persona cat vs control

In [None]:
bbq_refusals_mean["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in bbq_refusals_mean.index]

In [None]:
std_df =  bbq_refusals_mean.groupby("persona_cat").std().iloc[:,:-1]
std_df_ratios = std_df.iloc[1:]/std_df.iloc[0]
std_df_ratios["avg"] = std_df_ratios.mean(1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(0)
std_df_ratios

In [None]:
std_df

In [None]:
df = std_df_ratios.iloc[:-1,:-1].rename(columns=rename_model).stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Std. Dev. Ratio'})
a = sns.stripplot(data=df, x="Model", y="Std. Dev. Ratio", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
a.axhline(1, linestyle='--')
a.set_yscale('symlog')
# ax.plot(theta, data.iloc[:,:-1].min(), color="green")
# ax.fill(theta, data.iloc[:,:-1].min(), alpha=0.25, label='_nolegend_', color="green")
# ax.plot(theta, data.iloc[:,:-1].max(), color="red")
# ax.fill(theta, data.iloc[:,:-1].max(), alpha=0.25, label='_nolegend_', color="red")
# add legend relative to top-left plot
plt.xticks(rotation=45)
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

In [None]:
refusal_std_dfs.append(std_df)

## Unknown rates

In [None]:
unknown = unknown.rename(columns=rename_model)

In [None]:
unknown = unknown[list(rename_model.values()) + ["persona_cat"]]

In [None]:
unknown_mean = unknown.iloc[:,:-1].map(np.mean)

In [None]:
unknown_mean.sort_values("GPT-3.5")

In [None]:
unknown_stacked = unknown_mean.stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Unknown Fraction'})

In [None]:
unknown_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in unknown_stacked["index"]]

In [None]:
# Strip plot of the per-persona "unknown" answer fraction for every model.
# Personas are coloured by category; the control and empty-prompt runs are
# drawn as separate markers, shifted sideways so they do not overlap.
no_control = unknown_stacked[~unknown_stacked["persona_cat"].isin(["control", "empty"])]
only_control = unknown_stacked[unknown_stacked["persona_cat"] == "control"]
only_empty = unknown_stacked[unknown_stacked["persona_cat"] == "empty"]
ax1 = sns.stripplot(data=no_control, x="Model", y="Unknown Fraction", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
# Horizontal shift of `p` points (1/72 inch each), independent of the data scale.
offset = lambda p: transforms.ScaledTranslation(p/72., 0, plt.gcf().dpi_scale_trans)
trans = plt.gca().transData
ax2 = sns.stripplot(data=only_control, x="Model", y="Unknown Fraction", color="black", marker="X", transform=trans+offset(15))
# Use this cell's own `offset` helper; the previous `racset` was a leftover
# name from an earlier cell and only resolved through stale kernel state.
ax3 = sns.stripplot(data=only_empty, x="Model", y="Unknown Fraction", color="red", marker="*", size=10, transform=trans+offset(-15))
plt.xticks(rotation=45)
# Proxy artists so the control / empty markers show up in the legend.
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
control_label = "control"
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
empty_label = "empty"
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
unknown_mean.corr(numeric_only=True, method="kendall")

In [None]:
(unknown_mean.max(numeric_only=True) - unknown_mean.min(numeric_only=True)).sort_values()

In [None]:
unknown_mean["avg"] = unknown_mean.mean(1, numeric_only=True)

In [None]:
unknown_mean.sort_values("avg")

In [None]:
unknown_mean.idxmax()

In [None]:
unknown_mean.idxmin()

In [None]:
# Rank of every persona per model by "unknown" answer fraction (1 = highest).
# At this point `unknown_mean` holds the model columns plus `avg` only
# (`persona_cat` is added further down), so `columns[:-2]` silently dropped the
# last model. Iterate over the model names explicitly instead.
pos_df = pd.DataFrame(index=unknown_mean.index)
for col in rename_model.values():
    # Personas ordered from highest to lowest unknown fraction for this model.
    # The name avoids shadowing the builtin `sorted` for the rest of the session.
    ranked_personas = unknown_mean.sort_values(col, ascending=False).index
    # Assigning an indexed Series aligns the ranks back to `pos_df`'s row order.
    pos_df[col] = pd.Series(range(1, len(ranked_personas) + 1), index=ranked_personas)

In [None]:
pos_df["avg"] = pos_df.mean(1)

In [None]:
pos_df.sort_values("avg")

#### Persona cat vs control

In [None]:
unknown_mean["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in unknown_mean.index]

In [None]:
# Per-category standard deviation of the *unknown* answer fractions. This
# section previously reused `hits_means` (accuracy), so the "unknown" std-ratio
# plot and the `unknown_std` table derived from it showed accuracy variability.
# `.iloc[:, :-1]` drops the trailing `avg` column.
std_df = unknown_mean.groupby("persona_cat").std().iloc[:, :-1]
# Ratio of every category's std to that of the first group.
# NOTE(review): assumes the first (alphabetically sorted) group is "control" — confirm.
std_df_ratios = std_df.iloc[1:] / std_df.iloc[0]
std_df_ratios["avg"] = std_df_ratios.mean(1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(0)
std_df_ratios

In [None]:
df = std_df_ratios.iloc[:-1,:-1].rename(columns=rename_model).stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Std. Dev. Ratio'})
a = sns.stripplot(data=df, x="Model", y="Std. Dev. Ratio", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
a.axhline(1, linestyle='--')
a.set_yscale('symlog')
# ax.plot(theta, data.iloc[:,:-1].min(), color="green")
# ax.fill(theta, data.iloc[:,:-1].min(), alpha=0.25, label='_nolegend_', color="green")
# ax.plot(theta, data.iloc[:,:-1].max(), color="red")
# ax.fill(theta, data.iloc[:,:-1].max(), alpha=0.25, label='_nolegend_', color="red")
# add legend relative to top-left plot
plt.xticks(rotation=45)
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

In [None]:
unknown_std = df

### Aggregate figures

#### Bias and unknown rates

In [None]:
dfs = [bias_means_stacked, unknown_stacked]

In [None]:
y_titles=[df.columns[2] for df in dfs]

In [None]:
x_titles = ["Bias", "Unknown frequency"]

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(8, 4), sharex=True)
plt.subplots_adjust(wspace=.15, hspace=.1)
for idx, df in enumerate(dfs):
    no_control = df[~df["persona_cat"].isin(["control", "empty"])]
    only_control =  df[df["persona_cat"] == "control"]
    only_empty = df[df["persona_cat"] == "empty"]
    ax = axes[idx]
    # ax, fix = plt.subplot(1, 1, figsize=(4,4))
    ax1 = sns.stripplot(data=no_control, x="Model", y=y_titles[idx], hue="persona_cat", marker="o", palette=sns.color_palette("Paired"), ax=ax)
    offset = lambda p: transforms.ScaledTranslation(p/72.,0, plt.gcf().dpi_scale_trans)
    trans = ax.transData
    ax2 = sns.stripplot(data=only_control, x="Model", y=y_titles[idx], color="black", marker="X", transform=trans+offset(10), ax=ax)
    ax3 = sns.stripplot(data=only_empty, x="Model", y=y_titles[idx], color="red", marker="*", size=10, transform=trans+offset(-10), ax=ax)
    control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
    control_label = "control"
    empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
    empty_label = "empty"
    if idx==1: ax.axhline(.5, linestyle='--')
    ax.get_legend().set_visible(False)
    ax.set_xlabel(x_titles[idx])
    ax.set_ylabel("")
    ax.tick_params(axis='x', labelrotation=45)
handles, labels = ax1.get_legend_handles_labels()
handles = handles + [control_handle, empty_handle]
labels = labels + [control_label, empty_label]
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, +1.15),
          fancybox=True, shadow=True, ncol=4)

In [None]:
fig.savefig("../persona-biases-paper/media/bbq.pdf",bbox_inches="tight")

#### Bias and unknown rates stds

In [None]:
dfs = [bias_std, unknown_std]

In [None]:
y_titles=[df.columns[2] for df in dfs]

In [None]:
x_titles = ["Bias", "Unknown frequency"]

In [None]:
# Side-by-side strip plots of the std-dev ratios (persona category vs. first
# group) for the bias score and the unknown-answer rate, on a shared log y-axis.
fig, axes = plt.subplots(1, 2, figsize=(8, 4), sharex=True, sharey=True)
plt.subplots_adjust(wspace=.15, hspace=.1)
for idx, df in enumerate(dfs):
    ax = axes[idx]
    ax1 = sns.stripplot(data=df, x="Model", y="Std. Dev. Ratio", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"), ax=ax, log_scale=True)
    # Reference line: ratio 1 means the same spread as the reference group.
    ax.axhline(1, linestyle='--')
    # The former `a.set_yscale('symlog')` here changed the scale of a stale
    # axes left over from an earlier cell, not of this figure; the y-scale of
    # these panels is already set through `log_scale=True`, so it is dropped.
    ax.set_xlabel(x_titles[idx])
    ax.get_legend().set_visible(False)
    ax.tick_params(axis='x', labelrotation=45)
# One shared legend above both panels, built from the last panel's handles.
handles, labels = ax1.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, +1.1),
          fancybox=True, shadow=True, ncol=4)

In [None]:
fig.savefig("../persona-biases-paper/media/bbq_stds.pdf",bbox_inches="tight")

# Aggregate analysis

In [None]:
all_hits = get_results_df("all_hits.csv")

In [None]:
all_scores = get_results_df("all_scores.csv")

In [None]:
all_scores_avg = all_scores.iloc[:,:-1].map(lambda x: float(x[3]))

In [None]:
all_scores_avg.max() - all_scores_avg.min()

In [None]:
all_scores_avg = all_scores_avg[[x for x in rename_model.values()]].copy()

In [None]:
all_scores_avg["persona_cat"] = all_scores["persona_cat"]

In [None]:
all_scores_avg_stacked = all_scores_avg[all_scores_avg.columns[:-1]].stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Avg. Accuracy'})

In [None]:
all_scores_avg_stacked["persona_cat"] = [persona_cat_dict[persona]  if persona in persona_cat_dict.keys() else "control" for persona in all_scores_avg_stacked["index"]]

In [None]:
all_scores_avg.sort_values("GPT-4")

In [None]:
all_scores_avg.describe()

In [None]:
no_control = all_scores_avg_stacked[~all_scores_avg_stacked["persona_cat"].isin(["control", "empty"])]
only_control =  all_scores_avg_stacked[all_scores_avg_stacked["persona_cat"] == "control"]
only_empty = all_scores_avg_stacked[all_scores_avg_stacked["persona_cat"] == "empty"]
# ax, fix = plt.subplot(1, 1, figsize=(4,4))
ax1 = sns.stripplot(data=no_control, x="Model", y="Avg. Accuracy", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
offset = lambda p: transforms.ScaledTranslation(p/72.,0, plt.gcf().dpi_scale_trans)
trans = plt.gca().transData
ax2 = sns.stripplot(data=only_control, x="Model", y="Avg. Accuracy", color="black", marker="X", transform=trans+offset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Avg. Accuracy", color="red", marker="*", size=10, transform=trans+offset(-15))
plt.xticks(rotation=45)
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
control_label = "control"
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
empty_label = "empty"
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
all_scores_avg.corr(numeric_only=True, method="kendall")

In [None]:
all_scores_avg["avg"] = all_scores_avg.mean(1, numeric_only=True)

In [None]:
all_scores_avg.sort_values("avg")

In [None]:
hits_means.idxmax()

In [None]:
hits_means.idxmin()

In [None]:
# Rank position (1 = largest value) of every row of hits_means, computed separately
# for each column except the last two.
pos_df = pd.DataFrame(index=hits_means.index)
for col in hits_means.columns[:-2]:
    # Named `ranked_index` rather than `sorted`: the old name shadowed the builtin
    # `sorted()` for every cell executed afterwards.
    ranked_index = hits_means.sort_values(col, ascending=False).index
    temp_df = pd.DataFrame({col: list(range(1, len(hits_means)+1))}, index=ranked_index)
    # Index-on-index merge re-aligns the positions with pos_df's row order.
    pos_df = pos_df.merge(temp_df, left_index=True, right_index=True)

In [None]:
pos_df.loc["a_person_of_atheism_belief"]

In [None]:
pos_df.loc["a_asexual_person"]

In [None]:
pos_df.loc["a_person_with_fascism_ideology"]

#### Persona cat vs control

In [None]:
# Per-category standard deviation of each column of hits_means (last column dropped by position).
std_df =  hits_means.groupby("persona_cat").std().iloc[:,:-1]
# Divide every remaining category row by the first row.
# NOTE(review): this presumes the first group is the "control" baseline — confirm no category sorts before it.
std_df_ratios = std_df.iloc[1:]/std_df.iloc[0]
# Append a per-row mean column, then a per-column mean row (the row also covers the new "avg" column).
std_df_ratios["avg"] = std_df_ratios.mean(1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(0)
std_df_ratios

In [None]:
# Long format of the ratio table, without its trailing "avg" row and "avg" column.
_ratios_wide = std_df_ratios.iloc[:-1,:-1].rename(columns=rename_model)
df = _ratios_wide.stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Std. Dev. Ratio'})
a = sns.stripplot(
    data=df,
    x="Model",
    y="Std. Dev. Ratio",
    hue="persona_cat",
    marker="o",
    palette=sns.color_palette("Paired"),
)
# Dashed reference line at a ratio of 1.
a.axhline(1, linestyle='--')
a.set_yscale('symlog')
plt.xticks(rotation=45)
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

### Correlation between tasks

In [None]:
# Every cell of all_scores (last column excluded) is indexed positionally:
# element 0 -> truthful_scores, 1 -> mmlu_scores, 2 -> bbq_scores.
_score_cells = all_scores.iloc[:,:-1]
truthful_scores, mmlu_scores, bbq_scores = (
    _score_cells.map(lambda x, pos=pos: float(x[pos])) for pos in range(3)
)

In [None]:
truthful_scores.corrwith(mmlu_scores).sort_values()

In [None]:
truthful_scores.corrwith(bbq_scores).sort_values()

In [None]:
mmlu_scores.corrwith(bbq_scores).sort_values()

In [None]:
scores_by_model = {model: pd.concat([truthful_scores[model], mmlu_scores[model], bbq_scores[model]], axis=1) for model in truthful_scores}

In [None]:
# Tag each per-model table with the persona category ("control" when the persona is not
# in the mapping) and give its four columns fixed names.
for model, _task_scores in scores_by_model.items():
    _task_scores["persona_cat"] = [persona_cat_dict.get(persona, "control") for persona in _task_scores.index]
    _task_scores.columns = ["truthfulqa", "mmlu", "bbq", "persona_cat"]

In [None]:
# For each model and persona category, print the category's rows when the numeric
# columns agree strongly (mean pairwise Kendall tau above 0.5).
for model, scores in scores_by_model.items():
    for group, df in scores.groupby("persona_cat"):
        corrs = df.corr(method="kendall", numeric_only=True)
        # Strict upper triangle = one entry per distinct column pair.
        mean_tau = corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()
        if mean_tau > .5:
            print(model)
            # Reuse the matrix computed above instead of recomputing it.
            print(group, corrs)
            # assign() builds a new frame, so nothing is written onto the groupby slice.
            df = df.assign(avg=df.mean(1, numeric_only=True))
            print(df.sort_values("avg"))
            print("=========================")

In [None]:
avg_persona_score = pd.concat([x.iloc[:,:-1] for x in scores_by_model.values()], axis=0).reset_index().groupby("index").mean()

In [None]:
avg_persona_score["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in avg_persona_score.index] 

In [None]:
# Category-level means of avg_persona_score, printed (sorted by row average) only when
# the mean pairwise Kendall tau between columns exceeds 0.5.
cat_means = avg_persona_score.groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
_pair_taus = corrs.to_numpy()[np.triu_indices(len(corrs), k=1)]
if _pair_taus.mean() > 0.5:
    cat_means = cat_means.assign(avg=cat_means.mean(axis=1, numeric_only=True))
    print(cat_means.sort_values(by="avg"))
    print("=========================")

In [None]:
# Per persona category: print the rows, and their ranks, when the numeric columns
# agree strongly (mean pairwise Kendall tau above 0.5).
for group, df in avg_persona_score.groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    mean_tau = corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()
    if mean_tau > .5:
        # Reuse the matrix computed above instead of recomputing it.
        print(group, corrs)
        # assign() builds a new frame, so nothing is written onto the groupby slice.
        df = df.assign(avg=df.mean(1, numeric_only=True))
        print(df.sort_values("avg"))
        print(df.sort_values("avg").rank())
        print("=========================")

In [None]:
# Per model: category-level means, printed (sorted by row average) only when the mean
# pairwise Kendall tau between columns exceeds 0.5.
for model, scores in scores_by_model.items():
    cat_means = scores.groupby("persona_cat").mean()
    corrs = cat_means.corr(method="kendall", numeric_only=True)
    _pair_taus = corrs.to_numpy()[np.triu_indices(len(corrs), k=1)]
    if _pair_taus.mean() > 0.5:
        cat_means = cat_means.assign(avg=cat_means.mean(axis=1, numeric_only=True))
        print(cat_means.sort_values(by="avg"))
        print("=========================")

In [None]:
truthful_scores.sort_values("GPT-4")

In [None]:
mmlu_scores.sort_values("GPT-4")

In [None]:
# Tag each task's score table with the category of its own personas ("control" when the
# persona is not in the mapping). Each table is labelled from its own index; previously
# the MMLU and BBQ tables were labelled from truthful_scores.index, which is only correct
# while all three indexes are identical.
for _scores in (truthful_scores, mmlu_scores, bbq_scores):
    _scores["persona_cat"] = [persona_cat_dict.get(persona, "control") for persona in _scores.index]

In [None]:
# Per persona category of truthful_scores: print the rows (sorted by row average) when
# the numeric columns agree strongly (mean pairwise Kendall tau above 0.5).
for group, df in truthful_scores.groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    mean_tau = corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()
    if mean_tau > .5:
        print(group)
        # assign() builds a new frame, so nothing is written onto the groupby slice.
        df = df.assign(avg=df.mean(1, numeric_only=True))
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
# Category-level means of truthful_scores, printed (sorted by row average) only when the
# mean pairwise Kendall tau between columns exceeds 0.5.
cat_means = truthful_scores.groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
if corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()>.5:
    # The former `print(group)` was removed: `group` is not defined in this cell and only
    # held whatever value an earlier cell's loop left behind.
    cat_means["avg"] = cat_means.mean(1, numeric_only=True)
    print(cat_means.sort_values("avg"))
    print("=========================")

In [None]:
# Per persona category of mmlu_scores: print the rows (sorted by row average) when the
# numeric columns agree strongly (mean pairwise Kendall tau above 0.5).
for group, df in mmlu_scores.groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    mean_tau = corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()
    if mean_tau > .5:
        print(group)
        # assign() builds a new frame, so nothing is written onto the groupby slice.
        df = df.assign(avg=df.mean(1, numeric_only=True))
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
# Category-level means of mmlu_scores, printed (sorted by row average) only when the
# mean pairwise Kendall tau between columns exceeds 0.5.
cat_means = mmlu_scores.groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
_pair_taus = corrs.to_numpy()[np.triu_indices(len(corrs), k=1)]
if _pair_taus.mean() > 0.5:
    cat_means = cat_means.assign(avg=cat_means.mean(axis=1, numeric_only=True))
    print(cat_means.sort_values(by="avg"))
    print("=========================")

In [None]:
# Per persona category of bbq_scores: print the correlation matrix and the rows (sorted by
# row average) when the numeric columns agree strongly (mean pairwise Kendall tau > 0.5).
for group, df in bbq_scores.groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    mean_tau = corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()
    if mean_tau > .5:
        # Reuse the matrix computed above instead of recomputing it.
        print(group, corrs)
        # assign() builds a new frame, so nothing is written onto the groupby slice.
        df = df.assign(avg=df.mean(1, numeric_only=True))
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
# Category-level means of bbq_scores, printed (sorted by row average) only when the
# mean pairwise Kendall tau between columns exceeds 0.5.
cat_means = bbq_scores.groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
_pair_taus = corrs.to_numpy()[np.triu_indices(len(corrs), k=1)]
if _pair_taus.mean() > 0.5:
    cat_means = cat_means.assign(avg=cat_means.mean(axis=1, numeric_only=True))
    print(cat_means.sort_values(by="avg"))
    print("=========================")

In [None]:
# Drop the last column of bias_means by position.
# NOTE(review): the name is rebound to the narrower frame, so every re-run of this cell
# removes one more column — execute it exactly once per kernel session.
bias_means = bias_means.iloc[:,:-1]

In [None]:
# Per persona category of bias_means: print the correlation matrix and the rows (sorted by
# row average) when the numeric columns agree strongly (mean pairwise Kendall tau > 0.5).
for group, df in bias_means.groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    mean_tau = corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()
    if mean_tau > .5:
        # Reuse the matrix computed above instead of recomputing it.
        print(group, corrs)
        # assign() builds a new frame, so nothing is written onto the groupby slice.
        df = df.assign(avg=df.mean(1, numeric_only=True))
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
# Category-level means of bias_means, printed (sorted by row average) only when the
# mean pairwise Kendall tau between columns exceeds 0.5.
cat_means = bias_means.groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
_pair_taus = corrs.to_numpy()[np.triu_indices(len(corrs), k=1)]
if _pair_taus.mean() > 0.5:
    cat_means = cat_means.assign(avg=cat_means.mean(axis=1, numeric_only=True))
    print(cat_means.sort_values(by="avg"))
    print("=========================")

In [None]:
# Remove the helper "avg" column in place. Guarded so that re-running the cell is a
# no-op instead of raising KeyError once the column is already gone.
if "avg" in unknown_mean.columns:
    del unknown_mean["avg"]

In [None]:
# Per persona category of unknown_mean: print the correlation matrix and the rows (sorted
# by row average) when the numeric columns agree strongly (mean pairwise Kendall tau > 0.5).
for group, df in unknown_mean.groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    mean_tau = corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()
    if mean_tau > .5:
        # Reuse the matrix computed above instead of recomputing it.
        print(group, corrs)
        # assign() builds a new frame, so nothing is written onto the groupby slice.
        df = df.assign(avg=df.mean(1, numeric_only=True))
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
# Category-level means of unknown_mean, printed (sorted by row average) only when the
# mean pairwise Kendall tau between columns exceeds 0.5.
cat_means = unknown_mean.groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
_pair_taus = corrs.to_numpy()[np.triu_indices(len(corrs), k=1)]
if _pair_taus.mean() > 0.5:
    cat_means = cat_means.assign(avg=cat_means.mean(axis=1, numeric_only=True))
    print(cat_means.sort_values(by="avg"))
    print("=========================")

In [None]:
# Per persona category of all_off_means: print the correlation matrix and the rows (sorted
# by row average) when the numeric columns agree strongly (mean pairwise Kendall tau > 0.5).
for group, df in all_off_means.groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    mean_tau = corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()
    if mean_tau > .5:
        # Reuse the matrix computed above instead of recomputing it.
        print(group, corrs)
        # assign() builds a new frame, so nothing is written onto the groupby slice.
        df = df.assign(avg=df.mean(1, numeric_only=True))
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
# Category-level means of all_off_means, printed (sorted by row average) only when the
# mean pairwise Kendall tau between columns exceeds 0.5.
cat_means = all_off_means.groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
_pair_taus = corrs.to_numpy()[np.triu_indices(len(corrs), k=1)]
if _pair_taus.mean() > 0.5:
    cat_means = cat_means.assign(avg=cat_means.mean(axis=1, numeric_only=True))
    print(cat_means.sort_values(by="avg"))
    print("=========================")

In [None]:
# Per persona category of all_rac_means: print the correlation matrix and the rows (sorted
# by row average) when the numeric columns agree strongly (mean pairwise Kendall tau > 0.5).
for group, df in all_rac_means.groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    mean_tau = corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()
    if mean_tau > .5:
        # Reuse the matrix computed above instead of recomputing it.
        print(group, corrs)
        # assign() builds a new frame, so nothing is written onto the groupby slice.
        df = df.assign(avg=df.mean(1, numeric_only=True))
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
# Category-level means of all_rac_means, printed (sorted by row average) only when the
# mean pairwise Kendall tau between columns exceeds 0.5.
cat_means = all_rac_means.groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
_pair_taus = corrs.to_numpy()[np.triu_indices(len(corrs), k=1)]
if _pair_taus.mean() > 0.5:
    cat_means = cat_means.assign(avg=cat_means.mean(axis=1, numeric_only=True))
    print(cat_means.sort_values(by="avg"))
    print("=========================")

In [None]:
# Per persona category of off_aggs: print the correlation matrix and the rows (sorted by
# row average) when the numeric columns agree strongly (mean pairwise Kendall tau > 0.5).
for group, df in off_aggs.groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    mean_tau = corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()
    if mean_tau > .5:
        # Reuse the matrix computed above instead of recomputing it.
        print(group, corrs)
        # assign() builds a new frame, so nothing is written onto the groupby slice.
        df = df.assign(avg=df.mean(1, numeric_only=True))
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
# Category-level means of off_aggs, printed (sorted by row average) only when the
# mean pairwise Kendall tau between columns exceeds 0.5.
cat_means = off_aggs.groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
_pair_taus = corrs.to_numpy()[np.triu_indices(len(corrs), k=1)]
if _pair_taus.mean() > 0.5:
    cat_means = cat_means.assign(avg=cat_means.mean(axis=1, numeric_only=True))
    print(cat_means.sort_values(by="avg"))
    print("=========================")

In [None]:
# Per persona category of rac_aggs: print the correlation matrix and the rows (sorted by
# row average) when the numeric columns agree strongly (mean pairwise Kendall tau > 0.5).
for group, df in rac_aggs.groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    mean_tau = corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()
    if mean_tau > .5:
        # Reuse the matrix computed above instead of recomputing it.
        print(group, corrs)
        # assign() builds a new frame, so nothing is written onto the groupby slice.
        df = df.assign(avg=df.mean(1, numeric_only=True))
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
# Category-level means of rac_aggs, printed (sorted by row average) only when the
# mean pairwise Kendall tau between columns exceeds 0.5.
cat_means = rac_aggs.groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
_pair_taus = corrs.to_numpy()[np.triu_indices(len(corrs), k=1)]
if _pair_taus.mean() > 0.5:
    cat_means = cat_means.assign(avg=cat_means.mean(axis=1, numeric_only=True))
    print(cat_means.sort_values(by="avg"))
    print("=========================")

### Refusals

In [None]:
all_refusals_stacked[0]

In [None]:
# Mean of the numeric columns per (index, Model) pair across all stacked refusal tables.
avg_refusals = (
    pd.concat(all_refusals_stacked)
    .groupby(["index", "Model"])
    .mean(numeric_only=True)
    .reset_index()
)
# Re-attach the persona category; personas missing from the mapping are labelled "control".
avg_refusals["persona_cat"] = [persona_cat_dict.get(persona, "control") for persona in avg_refusals["index"]]

In [None]:
# Store the averaged table as the last element of the list.
# NOTE(review): not idempotent — every re-run appends another copy, while later cells
# address this list by position (e.g. `[:6]`, `[-1]`, `[:-1]`). Run once per session.
all_refusals_stacked.append(avg_refusals)

In [None]:
refusals_avg = avg_refusals.pivot(index="index", columns=["Model"], values="Refused Fraction")

In [None]:
refusals_avg.corr()

In [None]:
# Row-wise mean over the numeric columns. numeric_only=True matches the other "avg" cells
# in this notebook and keeps the cell re-runnable after the string "persona_cat" column
# has been added to refusals_avg.
refusals_avg["avg"] = refusals_avg.mean(1, numeric_only=True)

In [None]:
refusals_avg.sort_values("avg")

In [None]:
refusals_avg.idxmax()

In [None]:
refusals_avg.idxmin()

In [None]:
# Rank position (1 = largest value) of every row of refusals_avg, computed separately
# for each column except the last one, plus the mean position per row.
pos_df = pd.DataFrame(index=refusals_avg.index)
for col in refusals_avg.columns[:-1]:
    # Named `ranked_index` rather than `sorted`: the old name shadowed the builtin
    # `sorted()` for every cell executed afterwards.
    ranked_index = refusals_avg.sort_values(col, ascending=False).index
    temp_df = pd.DataFrame({col: list(range(1, len(refusals_avg)+1))}, index=ranked_index)
    # Index-on-index merge re-aligns the positions with pos_df's row order.
    pos_df = pos_df.merge(temp_df, left_index=True, right_index=True)
pos_df["avg"] = pos_df.mean(1)

In [None]:
pos_df.sort_values("avg")

#### Persona cat vs control

In [None]:
refusals_avg["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in refusals_avg.index]

In [None]:
refusals_avg = refusals_avg[[x for x in rename_model.values()] + ["avg", "persona_cat"]]

In [None]:
# Per-category standard deviation of each column of refusals_avg (last column dropped by position).
std_df =  refusals_avg.groupby("persona_cat").std().iloc[:,:-1]
# Divide every remaining category row by the first row.
# NOTE(review): this presumes the first group is the "control" baseline — confirm no category sorts before it.
std_df_ratios = std_df.iloc[1:]/std_df.iloc[0]
# Append a per-row mean column, then a per-column mean row (the row also covers the new "avg" column).
std_df_ratios["avg"] = std_df_ratios.mean(1)
std_df_ratios.loc["avg"] = std_df_ratios.mean(0)
std_df_ratios

In [None]:
std_df

In [None]:
# Long format of the ratio table, without its trailing "avg" row and "avg" column.
df = std_df_ratios.iloc[:-1,:-1].rename(columns=rename_model).stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Std. Dev. Ratio'})
# `log_scale=True` was dropped: the y-scale is set to 'symlog' right below, the sibling
# std-ratio plots in this notebook do not pass it, and a pure log transform cannot
# represent ratios equal to 0.
a = sns.stripplot(data=df, x="Model", y="Std. Dev. Ratio", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
# Dashed reference line at a ratio of 1.
a.axhline(1, linestyle='--')
a.set_yscale('symlog')
plt.xticks(rotation=45)
sns.move_legend(a, "upper left", bbox_to_anchor=(1, 1))

### Correlations between tasks

In [None]:
truthfulqa_refusals_mean.corrwith(mmlu_refusals_mean, numeric_only=True).sort_values()

In [None]:
truthfulqa_refusals_mean.corrwith(bbq_refusals_mean, numeric_only=True).sort_values()

In [None]:
mmlu_refusals_mean.corrwith(bbq_refusals_mean, numeric_only=True).sort_values()

In [None]:
# Per persona category of refusals_avg: print the correlation matrix and the rows (sorted
# by row average) when the numeric columns agree strongly (mean pairwise Kendall tau > 0.5).
for group, df in refusals_avg.groupby("persona_cat"):
    corrs = df.corr(method="kendall", numeric_only=True)
    mean_tau = corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()
    if mean_tau > .5:
        # Reuse the matrix computed above instead of recomputing it.
        print(group, corrs)
        # assign() builds a new frame, so nothing is written onto the groupby slice.
        df = df.assign(avg=df.mean(1, numeric_only=True))
        print(df.sort_values("avg"))
        print("=========================")

In [None]:
# Category-level means of refusals_avg, printed (sorted by row average) only when the
# mean pairwise Kendall tau between columns exceeds 0.5.
cat_means = refusals_avg.groupby("persona_cat").mean()
corrs = cat_means.corr(method="kendall", numeric_only=True)
_pair_taus = corrs.to_numpy()[np.triu_indices(len(corrs), k=1)]
if _pair_taus.mean() > 0.5:
    cat_means = cat_means.assign(avg=cat_means.mean(axis=1, numeric_only=True))
    print(cat_means.sort_values(by="avg"))
    print("=========================")

In [None]:
refusals_by_model = {model: pd.concat([x[x.Model == model].set_index("index")["Refused Fraction"] for x in all_refusals_stacked[:6]], axis=1) for model in truthfulqa_refusals_mean.columns[:-2]}

In [None]:
# Give each per-model table its six fixed column names, then tag the rows with the
# persona category ("control" when the persona is not in the mapping).
for model, _task_refusals in refusals_by_model.items():
    _task_refusals.columns = ["attitudes", "toxicity", "racism", "truthfulqa", "mmlu", "bbq"]
    _task_refusals["persona_cat"] = [persona_cat_dict.get(persona, "control") for persona in _task_refusals.index]

In [None]:
# For each model and persona category, print the category's rows when the numeric
# columns agree strongly (mean pairwise Kendall tau above 0.5).
for model, refusals in refusals_by_model.items():
    for group, df in refusals.groupby("persona_cat"):
        corrs = df.corr(method="kendall", numeric_only=True)
        # Strict upper triangle = one entry per distinct column pair.
        mean_tau = corrs.to_numpy()[np.triu_indices(len(corrs), k = 1)].mean()
        if mean_tau > .5:
            print(model)
            # Reuse the matrix computed above instead of recomputing it.
            print(group, corrs)
            # assign() builds a new frame, so nothing is written onto the groupby slice.
            df = df.assign(avg=df.mean(1, numeric_only=True))
            print(df.sort_values("avg"))
            print("=========================")

In [None]:
# Per model: category-level refusal means, printed (sorted by row average) only when the
# mean pairwise Kendall tau between columns exceeds 0.5.
for model, refusals in refusals_by_model.items():
    cat_means = refusals.groupby("persona_cat").mean()
    corrs = cat_means.corr(method="kendall", numeric_only=True)
    _pair_taus = corrs.to_numpy()[np.triu_indices(len(corrs), k=1)]
    if _pair_taus.mean() > 0.5:
        print(model)
        cat_means = cat_means.assign(avg=cat_means.mean(axis=1, numeric_only=True))
        print(cat_means.sort_values(by="avg"))
        print("=========================")

### Aggregate figures

#### Performance

In [None]:
dfs = all_hits_stacked + [all_scores_avg_stacked]

In [None]:
y_titles=[df.columns[2] for df in dfs]

In [None]:
x_titles = ["TruthfulQA", "MMLU", "BBQ", "Average"]

In [None]:
# 2x2 grid: one strip plot per entry of `dfs`, titled from `x_titles`.
fig, axes = plt.subplots(2, 2, figsize=(8, 8), sharex=True, sharey=True)
plt.subplots_adjust(wspace=.05, hspace=.11)


def offset(p):
    """Return a horizontal translation of ``p`` points (p/72 in ``fig``'s dpi-scaled space)."""
    return transforms.ScaledTranslation(p/72.,0, fig.dpi_scale_trans)


# Loop-invariant legend proxies, built once up front. They used to be re-created on
# every iteration and were undefined after the loop when `dfs` was empty.
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
control_label = "control"
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
empty_label = "empty"

for idx, df in enumerate(dfs):
    no_control = df[~df["persona_cat"].isin(["control", "empty"])]
    only_control =  df[df["persona_cat"] == "control"]
    only_empty = df[df["persona_cat"] == "empty"]
    ax = axes[idx//2,idx%2]
    ax1 = sns.stripplot(data=no_control, x="Model", y=y_titles[idx], hue="persona_cat", marker="o", palette=sns.color_palette("Paired"), ax=ax)
    trans = ax.transData
    # Control / empty markers are shifted sideways so they sit next to the category points.
    ax2 = sns.stripplot(data=only_control, x="Model", y=y_titles[idx], color="black", marker="X", transform=trans+offset(10), ax=ax)
    ax3 = sns.stripplot(data=only_empty, x="Model", y=y_titles[idx], color="red", marker="*", size=10, transform=trans+offset(-10), ax=ax)
    # Per-axes legends are hidden; one shared legend is attached to the figure below.
    ax.get_legend().set_visible(False)
    ax.set_title(x_titles[idx])
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.tick_params(axis='x', labelrotation=45)
handles, labels = ax1.get_legend_handles_labels()
handles = handles + [control_handle, empty_handle]
labels = labels + [control_label, empty_label]
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, +1.05),
          fancybox=True, shadow=True, ncol=4)

In [None]:
# fig, axes = plt.subplots(1, 4, figsize=(16, 4), sharex=True, sharey=True)
# plt.subplots_adjust(wspace=.15, hspace=.1)
# for idx, df in enumerate(dfs):
#     no_control = df[~df["persona_cat"].isin(["control", "empty"])]
#     only_control =  df[df["persona_cat"] == "control"]
#     only_empty = df[df["persona_cat"] == "empty"]
#     ax = axes[idx]
#     # ax, fix = plt.subplot(1, 1, figsize=(4,4))
#     ax1 = sns.stripplot(data=no_control, x="Model", y=y_titles[idx], hue="persona_cat", marker="o", palette=sns.color_palette("Paired"), ax=ax)
#     offset = lambda p: transforms.ScaledTranslation(p/72.,0, plt.gcf().dpi_scale_trans)
#     trans = ax.transData
#     ax2 = sns.stripplot(data=only_control, x="Model", y=y_titles[idx], color="black", marker="X", transform=trans+offset(10), ax=ax)
#     ax3 = sns.stripplot(data=only_empty, x="Model", y=y_titles[idx], color="red", marker="*", size=10, transform=trans+offset(-10), ax=ax)
#     control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
#     control_label = "control"
#     empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
#     empty_label = "empty"
#     ax.get_legend().set_visible(False)
#     ax.set_xlabel(x_titles[idx])
#     ax.set_ylabel("")
#     ax.tick_params(axis='x', labelrotation=45)
# handles, labels = ax1.get_legend_handles_labels()
# handles = handles + [control_handle, empty_handle]
# labels = labels + [control_label, empty_label]
# fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, +1.05),
#           fancybox=True, shadow=True, ncol=7)

In [None]:
fig.savefig("../persona-biases-paper/media/performance.pdf",bbox_inches="tight")

##### Persona cat vs control

In [None]:
avg_scores_cat = all_scores_avg_stacked.groupby(["persona_cat", "Model"]).mean(numeric_only=True).reset_index().pivot(columns="Model", values="Avg. Accuracy", index="persona_cat")
avg_scores_cat = avg_scores_cat[rename_model.values()]

In [None]:
avg_scores_cat["Avg."] = avg_scores_cat.mean(1)

In [None]:
# One bar chart per column of avg_scores_cat, with dashed reference lines for the
# "empty" (red) and "control" (black) rows.
fig, axes = plt.subplots(2, 4, figsize=(16, 8), sharex=True)
plt.subplots_adjust(wspace=.2, hspace=.15)
# Loop-invariant bar order: every category except the two baselines, sorted by "Avg.".
order = avg_scores_cat.loc[[x for x in avg_scores_cat.index if x not in ["empty", "control"]]].sort_values("Avg.").index
for idx, col in enumerate(avg_scores_cat.columns):
    df = avg_scores_cat.loc[order][col]
    ax = axes[idx//4,idx%4]
    sns.barplot(data=df, ax=ax)
    ax.axhline(avg_scores_cat.loc["empty"][col], linestyle='--', color="red")
    ax.axhline(avg_scores_cat.loc["control"][col], linestyle='--', color="black")
    ax.set_title(col)
    # Only the left-most panel of each row carries the y label.
    ax.set_ylabel("Avg. Accuracy" if idx%4 == 0 else "")
    ax.set_xlabel("")
    # Zoom the y range to the column's min/max with a 0.5% margin on each side.
    ax.set_ylim((avg_scores_cat[col].min()-0.005*avg_scores_cat[col].min(), avg_scores_cat[col].max()+.005*avg_scores_cat[col].max()))
    ax.tick_params(axis='x', labelrotation=90)
# Proxy lines for the two dashed baselines.
control_handle = Line2D([], [], color="black", marker="", label="control", linestyle="--")
control_label = "control"
empty_handle = Line2D([], [], color="red", marker="", label="empty", linestyle="--")
empty_label = "empty"
handles = [control_handle, empty_handle]
labels = [control_label, empty_label]
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, +.95),
          fancybox=True, shadow=True, ncol=2)

In [None]:
fig.savefig("../persona-biases-paper/media/performance_by_cat.pdf",bbox_inches="tight")

In [None]:
len(performance_std_dfs)

In [None]:
# Append the per-category mean over all tables in the list, rows ordered like the first table.
# GroupBy.mean() takes no axis argument; the former positional `0` was bound to
# `numeric_only` (falsy, i.e. the default), so calling mean() gives the same result.
# NOTE(review): appending is not idempotent — run this cell once per kernel session.
performance_std_dfs.append(pd.concat(performance_std_dfs).groupby("persona_cat").mean().loc[list(performance_std_dfs[0].index)])

In [None]:
performance_std_dfs[-1]

In [None]:
x_titles = ["TruthfulQA", "MMLU", "BBQ", "Average"]

In [None]:
# One panel per table in performance_std_dfs: each row divided by the table's first row.
fig, axes = plt.subplots(1, 4, figsize=(16, 4), sharex=True, sharey=True)
plt.subplots_adjust(wspace=.2, hspace=.1)
for idx, df in enumerate(performance_std_dfs):
    ax = axes[idx]
    _ratios = (df.iloc[1:]/df.iloc[0]).rename(columns=rename_model)
    df = _ratios.stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Std. Dev. Ratio'})
    ax1 = sns.stripplot(
        data=df,
        x="Model",
        y="Std. Dev. Ratio",
        hue="persona_cat",
        marker="o",
        palette=sns.color_palette("Paired"),
        ax=ax,
    )
    # Dashed reference line at a ratio of 1.
    ax.axhline(1, linestyle='--')
    ax.set_yscale('symlog')
    # Per-axes legends are hidden; a single figure-level legend is added after the loop.
    ax.get_legend().set_visible(False)
    ax.set_xlabel(x_titles[idx])
    ax.set_ylabel("")
    ax.tick_params(axis='x', labelrotation=45)
handles, labels = ax1.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, +1.05),
          fancybox=True, shadow=True, ncol=7)

In [None]:
fig.savefig("../persona-biases-paper/media/performance_std.pdf",bbox_inches="tight")

#### Refusals

In [None]:
all_refusals_stacked[-1] = all_refusals_stacked[-1].set_index(["index", "Model"]).loc[all_refusals_stacked[0].set_index(["index", "Model"]).index].reset_index()

In [None]:
# Split the last table of all_refusals_stacked into regular, control and "empty" rows.
_last_refusals = all_refusals_stacked[-1]
_cat = _last_refusals["persona_cat"]
no_control = _last_refusals[~_cat.isin(["control", "empty"])]
only_control = _last_refusals[_cat == "control"]
only_empty = _last_refusals[_cat == "empty"]


def offset(p):
    """Return a horizontal translation of ``p`` points (p/72 in the current figure's dpi-scaled space)."""
    return transforms.ScaledTranslation(p/72., 0, plt.gcf().dpi_scale_trans)


# Regular personas are coloured by category; control/empty markers are shifted sideways
# so they sit next to, not on top of, the category points.
ax1 = sns.stripplot(data=no_control, x="Model", y="Refused Fraction", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"))
trans = plt.gca().transData
ax2 = sns.stripplot(data=only_control, x="Model", y="Refused Fraction", color="black", marker="X", transform=trans+offset(15))
ax3 = sns.stripplot(data=only_empty, x="Model", y="Refused Fraction", color="red", marker="*", size=10, transform=trans+offset(-15))
plt.xticks(rotation=45)

# Proxy artists: the control/empty layers were drawn without `hue`, so they need manual legend entries.
control_label = "control"
empty_label = "empty"
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles + [control_handle, empty_handle], labels + [control_label, empty_label], title="Persona category")
ax1.set_xlabel("")
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

In [None]:
ax1.get_figure().savefig("../persona-biases-paper/media/refusals.pdf",bbox_inches="tight")

In [None]:
dfs = all_refusals_stacked[:-1]

In [None]:
# dfs[-1] = dfs[-1].set_index(["index", "Model"]).loc[dfs[0].set_index(["index", "Model"]).index].reset_index()

In [None]:
x_titles = ["Attitudes", "Offensiveness", "Racism", "TruthfulQA", "MMLU", "BBQ"]

In [None]:
# 1x6 row: one refusal strip plot per entry of `dfs`, titled from `x_titles`.
fig, axes = plt.subplots(1, 6, figsize=(24, 4), sharex=True)
plt.subplots_adjust(wspace=.2, hspace=.11)


def offset(p):
    """Return a horizontal translation of ``p`` points (p/72 in ``fig``'s dpi-scaled space)."""
    return transforms.ScaledTranslation(p/72.,0, fig.dpi_scale_trans)


# Loop-invariant legend proxies, built once up front. They used to be re-created on
# every iteration and were undefined after the loop when `dfs` was empty.
control_handle = Line2D([], [], color="black", marker="X", label="control", linestyle="None")
control_label = "control"
empty_handle = Line2D([], [], color="red", marker="*", label="empty", linestyle="None")
empty_label = "empty"

for idx, df in enumerate(dfs):
    no_control = df[~df["persona_cat"].isin(["control", "empty"])]
    only_control =  df[df["persona_cat"] == "control"]
    only_empty = df[df["persona_cat"] == "empty"]
    ax = axes[idx]
    ax1 = sns.stripplot(data=no_control, x="Model", y="Refused Fraction", hue="persona_cat", marker="o", palette=sns.color_palette("Paired"), ax=ax)
    trans = ax.transData
    # Control / empty markers are shifted sideways so they sit next to the category points.
    ax2 = sns.stripplot(data=only_control, x="Model", y="Refused Fraction", color="black", marker="X", transform=trans+offset(10), ax=ax)
    ax3 = sns.stripplot(data=only_empty, x="Model", y="Refused Fraction", color="red", marker="*", size=10, transform=trans+offset(-10), ax=ax)
    # Per-axes legends are hidden; one shared legend is attached to the figure below.
    ax.get_legend().set_visible(False)
    ax.set_title(x_titles[idx])
    # Only the first panel keeps its y label.
    if idx != 0: ax.set_ylabel("")
    ax.set_xlabel("")
    ax.tick_params(axis='x', labelrotation=45)
handles, labels = ax1.get_legend_handles_labels()
handles = handles + [control_handle, empty_handle]
labels = labels + [control_label, empty_label]
fig.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, +1.03),
          fancybox=True, shadow=True, ncol=14)

In [None]:
fig.savefig("../persona-biases-paper/media/all_refusals.pdf",bbox_inches="tight")

In [None]:
all_refusals = all_refusals_stacked[-1].pivot(index="index", columns=["Model"], values="Refused Fraction")

In [None]:
all_refusals["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in all_refusals.index]

In [None]:
all_refusals["avg"] = all_refusals.mean(1, numeric_only=True)

In [None]:
corr = all_refusals[all_refusals.persona_cat == "profession_specific"].iloc[:,:-1].corr(numeric_only=True, method="kendall"); corr

In [None]:
corr.to_numpy()[np.triu_indices(len(corr), k = 1)].mean()

In [None]:
all_refusals[all_refusals.persona_cat == "profession_specific"].idxmax()

In [None]:
all_refusals[all_refusals.persona_cat == "profession_specific"].idxmin()

In [None]:
all_refusals[all_refusals.persona_cat == "profession_specific"].sort_values("avg").rank()

In [None]:
all_refusals[all_refusals.persona_cat == "profession_specific"].rank().iloc[:,:-2].mean(1).sort_values()

In [None]:
all_refusals[all_refusals.persona_cat == "profession_specific"].sort_values("avg")["avg"].plot(kind="bar")

In [None]:
all_refusals[all_refusals.persona_cat == "profession_specific"].sort_values("GPT-3.5").tail(5)

In [None]:
all_refusals[all_refusals.persona_cat == "profession_specific"].sort_values("GPT-4").tail(5)

In [None]:
all_refusals[all_refusals.persona_cat == "profession_specific"].sort_values("Gemma-2b-inst").head(1)

In [None]:
all_refusals[all_refusals.persona_cat == "profession_specific"].sort_values("Gemma-7b-inst").tail(1)

In [None]:
all_refusals[all_refusals.persona_cat == "profession_specific"].sort_values("Mistral-inst").tail(3)

In [None]:
all_refusals[all_refusals.persona_cat == "profession_specific"].sort_values("Mixtral").tail(2)

In [None]:
all_refusals[all_refusals.persona_cat == "profession_specific"].boxplot()

In [None]:
all_refusals[all_refusals.persona_cat == "profession_specific"].sort_values("avg")

In [None]:
all_refusals[all_refusals.persona_cat == "profession_specific"].sort_values("avg")["avg"].plot(kind="bar")

In [None]:
corr = all_refusals[all_refusals.persona_cat == "ideology"].iloc[:,:-1].corr(numeric_only=True, method="kendall"); corr

In [None]:
corr.to_numpy()[np.triu_indices(len(corr), k = 1)].mean()

In [None]:
all_refusals[all_refusals.persona_cat == "ideology"].idxmax()

In [None]:
all_refusals[all_refusals.persona_cat == "ideology"].idxmin()

In [None]:
all_refusals[all_refusals.persona_cat == "ideology"].sort_values("avg").rank()

In [None]:
all_refusals[all_refusals.persona_cat == "ideology"].sort_values("avg").rank().iloc[:,:-2].mean(1)

In [None]:
all_refusals[all_refusals.persona_cat == "ideology"].sort_values("avg")

In [None]:
corr = all_refusals[all_refusals.persona_cat == "political_figure"].iloc[:,:-1].corr(numeric_only=True, method="kendall"); corr

In [None]:
corr.to_numpy()[np.triu_indices(len(corr), k = 1)].mean()

In [None]:
all_refusals[all_refusals.persona_cat == "political_figure"].sort_values("avg")

In [None]:
all_refusals[all_refusals.persona_cat == "race"].iloc[:,:-1].corr(numeric_only=True, method="kendall")

In [None]:
all_refusals[all_refusals.persona_cat == "race"].idxmax()

In [None]:
all_refusals[all_refusals.persona_cat == "race"].idxmin()

In [None]:
all_refusals[all_refusals.persona_cat == "race"].sort_values("avg").rank()

In [None]:
all_refusals[all_refusals.persona_cat == "race"].sort_values("avg").rank().iloc[:,:-2].mean(1)

In [None]:
all_refusals[all_refusals.persona_cat == "race"].sort_values("avg")

In [None]:
all_refusals[all_refusals.persona_cat == "sexuality"].iloc[:,:-1].corr(numeric_only=True, method="kendall")

In [None]:
all_refusals[all_refusals.persona_cat == "sexuality"].idxmax()

In [None]:
all_refusals[all_refusals.persona_cat == "sexuality"].idxmin()

In [None]:
all_refusals[all_refusals.persona_cat == "sexuality"].sort_values("avg").rank()

In [None]:
all_refusals[all_refusals.persona_cat == "sexuality"].sort_values("avg").rank().iloc[:,:-2].mean(1)

In [None]:
all_refusals[all_refusals.persona_cat == "sexuality"].sort_values("avg")

In [None]:
(all_refusals.loc["a_gay_person"].iloc[:-2] - all_refusals.loc["a_homosexual_person"].iloc[:-2] )

In [None]:
(all_refusals.loc["a_black_person"].iloc[:-2] - all_refusals.loc["an_african_american_person"].iloc[:-2] ) 

In [None]:
all_refusals.loc["a_black_person"], all_refusals.loc["an_african_american_person"].iloc[:-2] 

##### Persona cat x Control personas

In [None]:
len(refusal_std_dfs)

In [None]:
refusal_std_dfs[0]

In [None]:
# Append the per-category mean over all tables in the list, rows ordered like the first table.
# GroupBy.mean() takes no axis argument; the former positional `0` was bound to
# `numeric_only` (falsy, i.e. the default), so calling mean() gives the same result.
# NOTE(review): appending is not idempotent — run this cell once per kernel session.
refusal_std_dfs.append(pd.concat(refusal_std_dfs).groupby("persona_cat").mean().loc[list(refusal_std_dfs[0].index)])

In [None]:
refusal_std_dfs[-1]

In [None]:
x_titles = ["Attitudes", "Offensiveness", "Racism", "TruthfulQA", "MMLU", "BBQ", "Average"]

In [None]:
# One panel per table in refusal_std_dfs: each row divided by the table's first row.
fig, axes = plt.subplots(2, 4, figsize=(16, 8), sharex=True, sharey=True)
plt.subplots_adjust(wspace=.2, hspace=.1)
for idx, df in enumerate(refusal_std_dfs):
    ax = axes[idx//4,idx%4]
    _ratios = (df.iloc[1:]/df.iloc[0]).rename(columns=rename_model)
    df = _ratios.stack().reset_index().rename(columns={'level_0':'index','level_1':'Model',0:'Std. Dev. Ratio'})
    ax1 = sns.stripplot(
        data=df,
        x="Model",
        y="Std. Dev. Ratio",
        hue="persona_cat",
        marker="o",
        palette=sns.color_palette("Paired"),
        ax=ax,
    )
    # Dashed reference line at a ratio of 1.
    ax.axhline(1, linestyle='--')
    ax.set_yscale('symlog')
    # Per-axes legends are hidden; a single figure-level legend is added after the loop.
    ax.get_legend().set_visible(False)
    ax.set_title(x_titles[idx])
    ax.set_xlabel("")
    ax.tick_params(axis='x', labelrotation=45)
# The bottom-right axes is removed; the shared legend is anchored in that area.
fig.delaxes(axes[1][3])
handles, labels = ax1.get_legend_handles_labels()
fig.legend(handles, labels, loc='upper left', bbox_to_anchor=(0.72, 0.49),
          fancybox=True, shadow=True, ncol=1)

In [None]:
fig.savefig("../persona-biases-paper/media/refusals_std.pdf",bbox_inches="tight")

In [None]:
(refusal_std_dfs[-1].iloc[1:] > refusal_std_dfs[-1].iloc[0])

In [None]:
(refusal_std_dfs[-1].iloc[1:] > refusal_std_dfs[-1].iloc[0]).sum()

In [None]:
(refusal_std_dfs[-1].iloc[1:] > refusal_std_dfs[-1].iloc[0]).sum(1)

### Aggregated persona cat vs control

In [None]:
all_stds = [att_stds, off_aggs_stds, rac_aggs_stds] + performance_std_dfs[:-1]

In [None]:
all_stds.append(pd.concat(all_stds).groupby("persona_cat").mean())

In [None]:
avg_std_ratio = all_stds[-1]

In [None]:
avg_std_ratio["Avg."] = avg_std_ratio.mean(1)

In [None]:
avg_std_ratio.loc["Avg."] =  avg_std_ratio.mean(0)

In [None]:
avg_std_ratio.sort_values("Avg.")

In [None]:
# One bar chart per column of avg_std_ratio, with the "control" row as a dashed red line.
fig, axes = plt.subplots(2, 4, figsize=(16, 8), sharex=True, sharey=True)
plt.subplots_adjust(wspace=.2, hspace=.1)
# Loop-invariant bar order: every mapped category except "empty", sorted by "Avg.".
order = avg_std_ratio.loc[[persona for persona in set(persona_cat_dict.values()) if persona != "empty"]].sort_values("Avg.").index
for idx, col in enumerate(avg_std_ratio.columns):
    df = avg_std_ratio.loc[order][col]
    ax = axes[idx//4,idx%4]
    sns.barplot(data=df, ax=ax)
    ax.axhline(avg_std_ratio.loc["control"][col], linestyle='--', color="red")
    ax.set_title(col)
    # The plotted values are averaged from the `*_stds` tables, not refusal fractions;
    # the previous "Refused Fraction" label was a copy-paste leftover.
    ax.set_ylabel("Std. Dev.")
    ax.set_xlabel("")
    ax.tick_params(axis='x', labelrotation=90)

In [None]:
fig.savefig("../persona-biases-paper/media/persona_cat_stds.pdf", bbox_inches="tight")

# Personas closest to empty

## Attitudes

In [None]:
att_scores = get_results_df("attitude_scores.csv")

In [None]:
att_scores  = att_scores[[x for x in rename_model.values()]]

In [None]:
att_distance=pd.DataFrame(index=att_scores.index)

In [None]:
# Per model: Euclidean distance between the L2-normalised "empty" vector and every
# persona's L2-normalised vector.
for model in att_scores:
    _model_column = att_scores[model]
    _empty_vector = normalize(_model_column.loc["empty"].reshape((1,-1)))
    _all_vectors = normalize(np.vstack(_model_column.values))
    att_distance[model] = euclidean_distances(X=_empty_vector, Y=_all_vectors)[0]

In [None]:
att_distance["avg"] = att_distance.mean(1)

In [None]:
att_distance = att_distance.round(4)

In [None]:
att_distance.sort_values("avg")

In [None]:
att_distance.corr(method="kendall")

In [None]:
# For every column but the last, list the nearest and farthest rows by distance.
for column in att_distance.columns[:-1]:
    print(column)
    # Named `by_distance` rather than `sorted`: the old name shadowed the builtin
    # `sorted()` for every cell executed afterwards.
    by_distance = att_distance.sort_values(column)
    print("Top 5 similar:")
    # Row 0 is skipped — presumably the "empty" persona itself at distance 0; TODO confirm.
    print(by_distance[1:6][column])
    print("Bottom 5 similar:")
    print(by_distance[-5:][column])
    print("=====================")

## Toxicity

In [None]:
off_scores = get_results_df("off_scores.csv").iloc[:,:-1]

In [None]:
rac_scores = get_results_df("rac_scores.csv").iloc[:,:-1]

In [None]:
np.vstack(off_scores["Mixtral"].values).shape

In [None]:
def concat(x: pd.Series, y: pd.Series) -> pd.Series:
    """Join two series of per-row vectors into one series of longer vectors.

    Each element of ``x`` and ``y`` is expected to be a 1-D array. The
    elements are stacked into matrices with one row per element, the two
    matrices are placed side by side, and every resulting row is returned
    as one element of the output series.

    Parameters
    ----------
    x, y : pd.Series
        Series of equal length whose elements are array-like vectors.

    Returns
    -------
    pd.Series
        Series indexed like ``x`` whose i-th element is the i-th vector of
        ``x`` followed by the i-th vector of ``y``.
    """
    data = np.hstack((np.vstack(x.to_numpy()), np.vstack(y.to_numpy())))
    # Label the rows with the index of the series actually received instead of
    # the global ``off_scores``: DataFrame.combine passes already-aligned
    # columns, so ``x.index`` is always the index matching ``data``'s row order.
    return pd.Series(data=list(data), index=x.index)

In [None]:
tox_scores = off_scores.combine(rac_scores,concat)

In [None]:
tox_scores  = tox_scores[[x for x in rename_model.values()]]

In [None]:
tox_distance=pd.DataFrame(index=tox_scores.index)

In [None]:
for model in tox_scores:
    tox_distance[model] = euclidean_distances(X=normalize(tox_scores[model].loc["empty"].reshape((1,-1))), Y=normalize(np.vstack(tox_scores[model].values)))[0]

In [None]:
tox_distance["avg"] = tox_distance.mean(1)

In [None]:
tox_distance = tox_distance.round(4)

In [None]:
tox_distance.sort_values("avg")

In [None]:
tox_distance.corr(method="kendall")

In [None]:
# Print, for every model column, the personas ranked closest to and farthest
# from the "empty" persona. The trailing "avg" column is skipped.
for column in tox_distance.columns[:-1]:
    print(column)
    # Named `ranked` rather than `sorted` so the builtin sorted() is not
    # shadowed for every later cell in the kernel.
    ranked = tox_distance.sort_values(column)[column]
    print("Top 5 similar:")
    # Position 0 is skipped: presumably the "empty" row itself, whose distance
    # to itself is 0.
    print(ranked.iloc[1:6])
    print("Bottom 5 similar:")
    print(ranked.iloc[-5:])
    print("=====================")

## TruthfulQA

In [None]:
truthfulqa_hits = get_results_df("truthfulqa_hits.csv").iloc[:,:-1]

In [None]:
truthfulqa_hits  = truthfulqa_hits[[x for x in rename_model.values()]]

In [None]:
truthfulqa_distance=pd.DataFrame(index=truthfulqa_hits.index)

In [None]:
for model in truthfulqa_hits:
    truthfulqa_distance[model] = euclidean_distances(X=normalize(truthfulqa_hits[model].loc["empty"].reshape((1,-1))), Y=normalize(np.vstack(truthfulqa_hits[model].values)))[0]

In [None]:
truthfulqa_distance["avg"] = truthfulqa_distance.mean(1)

In [None]:
truthfulqa_distance = truthfulqa_distance.round(4)

In [None]:
truthfulqa_distance.sort_values("avg")

In [None]:
truthfulqa_distance.corr(method="kendall")

In [None]:
# Print, for every model column, the personas ranked closest to and farthest
# from the "empty" persona. The trailing "avg" column is skipped.
for column in truthfulqa_distance.columns[:-1]:
    print(column)
    # Named `ranked` rather than `sorted` so the builtin sorted() is not
    # shadowed for every later cell in the kernel.
    ranked = truthfulqa_distance.sort_values(column)[column]
    print("Top 5 similar:")
    # Position 0 is skipped: presumably the "empty" row itself, whose distance
    # to itself is 0.
    print(ranked.iloc[1:6])
    print("Bottom 5 similar:")
    print(ranked.iloc[-5:])
    print("=====================")

## MMLU

In [None]:
mmlu_hits = get_results_df("mmlu_hits.csv").iloc[:,:-1]

In [None]:
mmlu_hits  = mmlu_hits[[x for x in rename_model.values()]]

In [None]:
mmlu_distance=pd.DataFrame(index=mmlu_hits.index)

In [None]:
for model in mmlu_hits:
    mmlu_distance[model] = euclidean_distances(X=normalize(mmlu_hits[model].loc["empty"].reshape((1,-1))), Y=normalize(np.vstack(mmlu_hits[model].values)))[0]

In [None]:
mmlu_distance["avg"] = mmlu_distance.mean(1)

In [None]:
mmlu_distance = mmlu_distance.round(4)

In [None]:
mmlu_distance.sort_values("avg")

In [None]:
mmlu_distance.corr(method="kendall")

In [None]:
# Print, for every model column, the personas ranked closest to and farthest
# from the "empty" persona. The trailing "avg" column is skipped.
for column in mmlu_distance.columns[:-1]:
    print(column)
    # Named `ranked` rather than `sorted` so the builtin sorted() is not
    # shadowed for every later cell in the kernel.
    ranked = mmlu_distance.sort_values(column)[column]
    print("Top 5 similar:")
    # Position 0 is skipped: presumably the "empty" row itself, whose distance
    # to itself is 0.
    print(ranked.iloc[1:6])
    print("Bottom 5 similar:")
    print(ranked.iloc[-5:])
    print("=====================")

## BBQ

In [None]:
bbq_hits = get_results_df("bbq_hits.csv").iloc[:,:-1]

In [None]:
bbq_hits  = bbq_hits[[x for x in rename_model.values()]]

In [None]:
bbq_distance=pd.DataFrame(index=bbq_hits.index)

In [None]:
for model in bbq_hits:
    bbq_distance[model] = euclidean_distances(X=normalize(bbq_hits[model].loc["empty"].reshape((1,-1))), Y=normalize(np.vstack(bbq_hits[model].values)))[0]

In [None]:
bbq_distance["avg"] = bbq_distance.mean(1)

In [None]:
bbq_distance = bbq_distance.round(4)

In [None]:
bbq_distance.sort_values("avg")

In [None]:
bbq_distance.corr(method="kendall")

In [None]:
# Print, for every model column, the personas ranked closest to and farthest
# from the "empty" persona. The trailing "avg" column is skipped.
for column in bbq_distance.columns[:-1]:
    print(column)
    # Named `ranked` rather than `sorted` so the builtin sorted() is not
    # shadowed for every later cell in the kernel.
    ranked = bbq_distance.sort_values(column)[column]
    print("Top 5 similar:")
    # Position 0 is skipped: presumably the "empty" row itself, whose distance
    # to itself is 0.
    print(ranked.iloc[1:6])
    print("Bottom 5 similar:")
    print(ranked.iloc[-5:])
    print("=====================")

## Aggregate

In [None]:
all_df = pd.concat([att_distance, tox_distance, truthfulqa_distance, mmlu_distance, bbq_distance])

In [None]:
all_df = all_df.reset_index().groupby("index").mean()

In [None]:
all_df.corr(method="kendall")

In [None]:
# Print, for every column except the last one of all_df, the personas ranked
# closest to and farthest from the "empty" persona on the averaged distances.
for column in all_df.columns[:-1]:
    print(column)
    # Named `ranked` rather than `sorted` so the builtin sorted() is not
    # shadowed for every later cell in the kernel.
    ranked = all_df.sort_values(column)[column]
    print("Top 5 similar:")
    # Position 0 is skipped: presumably the "empty" row itself, whose distance
    # to itself is 0.
    print(ranked.iloc[1:6])
    print("Bottom 5 similar:")
    print(ranked.iloc[-5:])
    print("=====================")

In [None]:
# Labels at sorted positions 1-5 (position 0 skipped) of all_df ordered by one model's column.
# NOTE(review): `model` is not assigned in this cell; it holds whatever value
# the most recently executed `for model in ...` loop left behind. Confirm which
# model is intended, or set it explicitly.
all_df.sort_values(model)[1:6][model].index.tolist()

In [None]:
table = pd.DataFrame(index=all_df.columns[:-1])
for model in all_df.columns[:-1]:
    table.loc[model, "Most similar"] = ", ".join(all_df.sort_values(model)[1:6][model].index.tolist())
    table.loc[model, "Least similar"] = ", ".join(all_df.sort_values(model)[-5:][model].index.tolist())

In [None]:
table = table.map(lambda x: x.replace("_", " "))

In [None]:
print(table.to_latex())

In [None]:
all_df["persona_cat"] = [persona_cat_dict[persona] if persona in persona_cat_dict else "control" for persona in all_df.index ]

In [None]:
del all_df["avg"]

In [None]:
all_df

In [None]:
all_df.boxplot(by="persona_cat", rot=90, figsize=(15,15))