In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import seaborn as sns
import numpy as np
from data.personas import *
from data.constants import DATASETS, MODEL_ORDER, CATEGORY_ORDER
from utils.significance_testing import *
from utils.metrics import *
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import pickle

In [None]:
prefix = "./results"

In [None]:
# Load the precomputed result artifacts stored under `prefix`.
# Each file is opened in a `with` block so its handle is closed right after loading.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
with open(f"{prefix}/all_pvalues.pkl", "rb") as fh:
    all_pvalues = pickle.load(fh)
with open(f"{prefix}/all_metrics.pkl", "rb") as fh:
    all_metrics = pickle.load(fh)
with open(f"{prefix}/all_results.pkl", "rb") as fh:
    all_results = pickle.load(fh)
with open(f"{prefix}/fidelity_significances.pkl", "rb") as fh:
    fidelity_significance = pickle.load(fh)
with open(f"{prefix}/fidelity_intervals.pkl", "rb") as fh:
    fidelity_intervals = pickle.load(fh)

In [None]:
import matplotlib.font_manager as fm
# Register the regular and bold CMU font files with matplotlib, keep a
# FontProperties handle for bold text, and read back the family name of the
# regular face for use in rcParams.
_cmu_regular_path = "/usr/share/fonts/truetype/cmu/cmunrm.ttf"
_cmu_bold_path = "/usr/share/fonts/truetype/cmu/cmunbx.ttf"
for _font_path in (_cmu_regular_path, _cmu_bold_path):
    fm.fontManager.addfont(_font_path)
bold_font = fm.FontProperties(fname=_cmu_bold_path)
cmu_serif = fm.FontProperties(fname=_cmu_regular_path).get_name()
print("Font name:", cmu_serif)  # should be "CMU Serif"

In [None]:
# Global matplotlib styling shared by every figure in this notebook.
plt.rcParams.update({
    "text.usetex": False,  # External LaTeX rendering stays off; mathtext typesets math
    "mathtext.fontset": "cm",  # Use Computer Modern (LaTeX default)
    "font.family": cmu_serif,  # Family name read from the registered CMU regular font
    "font.size": 14,         # Base font size
    "axes.titlesize": 16,    # Title font size
    "axes.labelsize": 14,    # Axis label font size
    "xtick.labelsize": 12,   # X-axis tick font size
    "ytick.labelsize": 12,   # Y-axis tick font size
    "legend.fontsize": 14    # Legend font size
})

In [None]:
# Expertise heatmaps (base prompt): one panel per expert-persona variant.
# Every cell is annotated with the score difference (x100). Cells whose p-value is
# above 0.05 are left uncoloured and covered with a faint gray overlay.
fig, axes = plt.subplots(1, 4, sharex=True, sharey=True, figsize=(20,15), gridspec_kw={})
name_mapping = {"static": "Static\n(e.g., expert in fact-checking)", "level1": "Broad\n(e.g., expert in math)", "level2": "Focused\n(e.g., expert in abstract algebra)", "level3": "Niche\n(e.g., expert in group theory)"}
for idx, expert in enumerate(["static", "level1", "level2", "level3"]):
    ax = axes[idx]
    op_df = pd.DataFrame(columns=MODEL_ORDER)
    pvalues =  pd.DataFrame(columns=MODEL_ORDER)
    if expert=="static":
        # Static panel: precomputed "OP" metric paired with the "in-expert" p-values.
        for dataset, metrics in all_results.items():
            op_df.loc[dataset] = all_metrics[dataset].loc["OP"]
            df = all_pvalues[dataset]
            pvalues.loc[dataset] = df.loc["in-expert"]
    else:
        # Level panels: persona row minus the "empty" row of the raw results.
        for dataset, metrics in all_results.items():
            op_df.loc[dataset] = metrics.loc[expert] - metrics.loc["empty"]
            df = all_pvalues[dataset]
            pvalues.loc[dataset] = df.loc[expert]
    # Blank out (NaN) the colour of every cell with p > 0.05; annotations still use op_df.
    op_masked = op_df.mask(pvalues > .05, np.nan)
    # Shorten labels: drop the last "-" suffix of model names and the dataset prefix.
    op_masked = op_masked.rename(columns=lambda x: x.rsplit("-", maxsplit=1)[0])
    op_masked = op_masked.rename(index=lambda x: x.replace("contextual_parametric_", ""))
    sns.heatmap(op_masked, cmap=sns.diverging_palette(260, 30, s=100, center='light', as_cmap=True), vmin=-.1, vmax=.1,center=0, cbar=False,annot=op_df*100, fmt=".1f", square=True, linewidth=1, linecolor="gray", ax=ax)
    x= np.arange(op_masked.shape[1]+1)
    y= np.arange(op_masked.shape[0]+1)
    # Gray overlay: every p-value that is not < 0.05 becomes 1, then values < 0.05
    # are masked so only the remaining cells are painted.
    pvalues = pvalues.map(lambda x: x if x <0.05 else 1)
    zm = np.ma.masked_less(pvalues.loc[op_df.index].values, 0.05)
    ax.pcolor(x, y, zm , cmap="Grays_r", alpha=.02)
    # Black separator lines at hard-coded positions.
    # NOTE(review): presumably model-family / dataset-group boundaries -- confirm
    # against MODEL_ORDER and the dataset order.
    ax.vlines([0, 3, 6, 9], *ax.get_ylim(), lw=1, color="black")
    # hlines takes (y, xmin, xmax), so the horizontal extent comes from the x-limits.
    ax.hlines([0, 1, 2, 16,20,27], *ax.get_xlim(), lw=1, color="black")
    ax.set_title(name_mapping[expert],fontproperties=bold_font,fontsize=18)

    # NOTE(review): the x axis is shared (sharex=True), so clearing ticks here may
    # affect every panel -- confirm this is intended for a single-row layout.
    if idx//2 == 0: ax.get_xaxis().set_ticks([])
fig.suptitle("Base prompt", fontproperties=bold_font, y=.93,fontsize=20)
plt.subplots_adjust(wspace=.05, hspace=0)
#fig.tight_layout()

In [None]:
#ax.get_figure().savefig("../persona_performance_paper/media/expertise_tasks_heatmap.pdf", bbox_inches="tight")

In [None]:
# Robustness heatmaps (base prompt): one panel per persona family ("color", "name").
# Cells show the "WU_<family>" metric (x100). The p-value of a cell is the one of the
# persona returned by worst_case_utility(..., return_persona=True) for that model.
fig, axes = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(10.5,15), gridspec_kw={})
category_mapping = {"color": COLOR_PERSONAS, "name": NAMES}
robs_df = {}  # metric table per family, reused by the scale-effect regressions
for idx, rob in enumerate(["color", "name"]):
    ax = axes[idx]
    rob_df = pd.DataFrame(columns=MODEL_ORDER)
    pvalues =  pd.DataFrame(columns=MODEL_ORDER)
    for dataset, metrics in all_results.items():
        rob_df.loc[dataset] = all_metrics[dataset].loc[f"WU_{rob}"]
        df = all_pvalues[dataset]
        worst = worst_case_utility(all_results[dataset], category_mapping[rob], return_persona=True)[1]
        for model in MODEL_ORDER:
            pvalues.loc[dataset, model] = df.loc[worst[model], model]
    robs_df[rob] = rob_df
    # Blank out (NaN) the colour of every cell with p > 0.05; annotations still use rob_df.
    rob_masked = rob_df.mask(pvalues > .05, np.nan)
    # Shorten labels: drop the last "-" suffix of model names and the dataset prefix.
    rob_masked = rob_masked.rename(columns=lambda x: x.rsplit("-", maxsplit=1)[0])
    rob_masked = rob_masked.rename(index=lambda x: x.replace("contextual_parametric_", ""))
    sns.heatmap(rob_masked,cmap=sns.diverging_palette(260, 30, s=100, center='light', as_cmap=True), vmin=-.1, vmax=.1,center=0, cbar=False,annot=rob_df*100, fmt=".1f", square=True, linewidth=1, linecolor="gray", ax=ax)
    x= np.arange(rob_masked.shape[1]+1)
    y= np.arange(rob_masked.shape[0]+1)
    # Gray overlay: every p-value that is not < 0.05 becomes 1, then values < 0.05
    # are masked so only the remaining cells are painted.
    pvalues = pvalues.map(lambda x: x if x <0.05 else 1)
    zm = np.ma.masked_less(pvalues.loc[rob_df.index].values, 0.05)
    ax.pcolor(x, y, zm , cmap="Grays_r", alpha=.02)
    # Black separator lines at hard-coded positions.
    # NOTE(review): presumably model-family / dataset-group boundaries -- confirm.
    ax.vlines([0, 3, 6, 9], *ax.get_ylim(), lw=1, color="black")
    # hlines takes (y, xmin, xmax), so both calls take their extent from the x-limits.
    ax.hlines([0, 5], *ax.get_xlim(), lw=1, color="black")
    ax.hlines([0, 1, 2, 16,20,27], *ax.get_xlim(), lw=1, color="black")
    ax.set_title(rob.capitalize(),fontproperties=bold_font,fontsize=18)
fig.suptitle("Base prompt", fontproperties=bold_font, y=.93,fontsize=20)
plt.subplots_adjust(wspace=.05, hspace=0)
#fig.tight_layout()

In [None]:
#ax.get_figure().savefig("../persona_performance_paper/media/robustness_tasks_heatmap.pdf", bbox_inches="tight")

In [None]:
# Fidelity heatmaps (base prompt): one panel per fidelity metric.
# Cells show the "Fid_<key>" metric (x100); cells whose entry in
# fidelity_significance equals False are left uncoloured.
fig, axes = plt.subplots(1, 3, sharey=True, figsize=(15,15))
category_mapping = {"Exp. Domain": "Exp", "Exp. Specialization": "ExpLevel", "Education": "Ed"}
fid_dfs = {}  # metric table per panel, reused by the scale-effect regressions
for idx, fid in enumerate(["Exp. Domain", "Exp. Specialization", "Education"]):
    ax = axes[idx]
    fid_df = pd.DataFrame(columns=MODEL_ORDER)
    pvalues =  pd.DataFrame(columns=MODEL_ORDER)
    for dataset, metrics in all_results.items():
        fid_df.loc[dataset] = all_metrics[dataset].loc[f"Fid_{category_mapping[fid]}"]
        pvalues.loc[dataset] = fidelity_significance[dataset].loc[f"Fid_{category_mapping[fid]}"]
    fid_dfs[fid]=fid_df
    fid_masked = fid_df.mask(pvalues ==False, np.nan)
    # Shorten labels: drop the last "-" suffix of model names and the dataset prefix.
    fid_masked = fid_masked.rename(columns=lambda x: x.rsplit("-", maxsplit=1)[0])
    fid_masked = fid_masked.rename(index=lambda x: x.replace("contextual_parametric_", ""))
    sns.heatmap(fid_masked,cmap=sns.diverging_palette(260, 30, s=100, center='light', as_cmap=True),center=0, cbar=False,annot=fid_df*100, fmt=".0f", square=True, linewidth=1, linecolor="gray", ax=ax)
    # No gray overlay is drawn in this figure, so no overlay grid or mask is built
    # here; the cell therefore does not depend on variables from other cells.
    # Black separator lines at hard-coded positions.
    # NOTE(review): presumably model-family / dataset-group boundaries -- confirm.
    ax.vlines([0, 3, 6, 9], *ax.get_ylim(), lw=1, color="black")
    # hlines takes (y, xmin, xmax), so the horizontal extent comes from the x-limits.
    ax.hlines([0, 1, 2, 16,20,27], *ax.get_xlim(), lw=1, color="black")
    ax.set_title(fid.capitalize(), fontproperties=bold_font, fontsize=18)
fig.suptitle("Base prompt", fontproperties=bold_font, fontsize=20,y=.93)
plt.subplots_adjust(wspace=.05, hspace=0)
#fig.tight_layout()

In [None]:
#ax.get_figure().savefig("../persona_performance_paper/media/fidelity_tasks_heatmap.pdf", bbox_inches="tight")

In [None]:
def compute_r2(model, df):
    """Return the marginal and conditional R^2 of a fitted mixed model.

    The three variance components are printed as they are extracted:
    the variance of ``model.predict(df)`` (fixed effects), the first entry of
    ``model.cov_re`` (random effects) and ``model.scale`` (residual).

    Returns:
        (R2_m, R2_c): fixed-effects variance over the total variance, and
        fixed-plus-random-effects variance over the total variance.
    """
    var_fixed = np.var(model.predict(df))
    print(f"fixed effects variance: {var_fixed}")
    var_random = model.cov_re.iloc[0, 0]
    print(f"random effects variance: {var_random}")
    var_residual = model.scale
    print(f"Residual variance: {var_residual}")

    # Both ratios share the same denominator: the sum of the three components.
    var_total = var_fixed + var_random + var_residual
    marginal = var_fixed / var_total
    conditional = (var_fixed + var_random) / var_total
    return marginal, conditional

In [None]:
# Long-format score table: one row per (persona, model) entry of every dataset,
# restricted to the personas analysed below.
_kept_personas = EDUCATION_PERSONAS + COLOR_PERSONAS + NAMES + ["empty", "in-expert", "experts", "out-expert", "level1", "level2", "level3" ]
_frames = []
for k, v in all_results.items():
    df = v.stack().reset_index()
    df.columns = ["persona", "model", "score"]
    # assign() returns a new frame, so no column is set on a filtered slice.
    df = df[df.persona.isin(_kept_personas)].assign(dataset=k)
    _frames.append(df)
# Concatenate once instead of growing the frame inside the loop; an empty
# all_results still yields an empty DataFrame.
all_df = pd.concat(_frames, axis=0) if _frames else pd.DataFrame()

In [None]:
# Grouping key for the mixed models: model name concatenated with the dataset name.
all_df["modelDataset"] = all_df["model"] + all_df["dataset"].astype(str)

In [None]:
# Display labels (math-text strings) substituted for persona keys by plot_errorbars.
rename_rules = {
    "in-expert": r"$\text{exp}$",
    "out-expert": r"${\neg}\text{exp}$",
    "experts": r"${\sim}\text{exp}$",
    "level1": r"$\text{exp}_\text{Broad}$",
    "level2": r"$\text{exp}_\text{Focused}$",
    "level3": r"$\text{exp}_\text{Niche}$",
}


In [None]:
def plot_errorbars(table, figsize=(8,8), x_label="Estimated persona effect"):
    """Plot regression coefficients with their interval bars, sorted by size.

    Args:
        table: coefficient table with numeric "Coef.", "P>|z|", "[0.025" and
            "0.975]" columns, indexed by term name.
        figsize: figure size passed to ``plt.subplots``.
        x_label: label of the x axis.

    Returns:
        The matplotlib Axes holding the plot.
    """
    # Reduce term names to the bare level: keep what follows the last "T." and drop "]".
    table = table.rename(index=lambda x: x.split("T.")[-1].rstrip("]"))
    table = table.rename(index=lambda x: rename_rules[x] if x in rename_rules else x)
    # Strip articles and boilerplate words from persona descriptions.
    table = table.rename(index=lambda x: x.replace("an ", "").replace("a ", "").replace("person with ", "").replace(" person", "").replace(" level education", "").replace("-level education", ""))
    # Sort once and reuse the result everywhere, so labels, points and interval
    # bars share a single row order even when coefficients tie.
    ordered = table.sort_values(["Coef.", "P>|z|"])
    f, axs = plt.subplots(1, figsize=figsize,layout="tight")
    sns.stripplot(ordered, y=ordered.index, x=ordered["Coef."], ax=axs, s=5)
    for idx, (_, row) in enumerate(ordered.iterrows()):
        # Horizontal bar from the lower to the upper interval bound of this row.
        axs.plot([row["[0.025"], row["0.975]"]],[idx, idx], 'b-|', markersize=5)
    axs.axvline(x=0.00,color='black',linewidth=1,linestyle='--')
    axs.set_ylabel("")
    axs.set_xlabel(x_label)
    return axs

In [None]:
# Collapse every color persona into "color" and every name persona into "name";
# all other personas keep their own label.
cats = [
    "color" if persona in COLOR_PERSONAS else ("name" if persona in NAMES else persona)
    for persona in all_df.persona.tolist()
]

In [None]:
all_df["category"] = cats

In [None]:
md = smf.mixedlm("score ~ C(category, Treatment(reference='empty'))", all_df, groups=all_df["modelDataset"])

In [None]:
# Fit the model, show its header table, then the coefficient table sorted by estimate.
mdf = md.fit()
fit_summary = mdf.summary()
display(fit_summary.tables[0])
# The summary stores coefficients as text; cast "Coef." so sorting is numeric.
coefs = fit_summary.tables[1].astype({"Coef.": "float"})
display(coefs.sort_values("Coef."))

In [None]:
table = mdf.summary().tables[1]

In [None]:
table = table.loc[[x for x in table.index if "reference" in x]].astype("float")

In [None]:
ax = plot_errorbars(table, (4,3))

In [None]:
#ax.get_figure().savefig("../persona_performance_paper/media/regression_coefs.pdf", bbox_inches="tight")

In [None]:
marg, cond = compute_r2(mdf, all_df)

In [None]:
marg, cond, marg/(1- cond + marg)

In [None]:
# Keep only the rows of the models at positions 5 and -1 of MODEL_ORDER.
# NOTE(review): presumably the largest models of two families -- confirm against MODEL_ORDER.
large_only = all_df[all_df.model.isin([MODEL_ORDER[5], MODEL_ORDER[-1]])]

In [None]:
md = smf.mixedlm("score ~ C(category, Treatment(reference='empty'))", large_only, groups=large_only["modelDataset"])

In [None]:
# Fit the model, show its header table, then the coefficient table sorted by estimate.
mdf = md.fit()
fit_summary = mdf.summary()
display(fit_summary.tables[0])
# The summary stores coefficients as text; cast "Coef." so sorting is numeric.
coefs = fit_summary.tables[1].astype({"Coef.": "float"})
display(coefs.sort_values("Coef."))

In [None]:
table = mdf.summary().tables[1]
table = table.loc[[x for x in table.index if "reference" in x]].astype("float")

In [None]:
ax = plot_errorbars(table, (6,4))

#### Scale effect on OP

In [None]:
# Ordinal size assigned to each model, listed in MODEL_ORDER order.
sizes = [1, 2, 3, 1, 2, 4, 1, 2, 4]
size_map = dict(zip(MODEL_ORDER, sizes))

In [None]:
all_df["size"] = all_df.model.apply(lambda x: size_map[x]).astype("int")

In [None]:
families = ["gemma", "llama", "qwen"]

In [None]:
all_df["family"] = all_df.model.apply(lambda x: families[MODEL_ORDER.index(x) // 3])

In [None]:
all_df["familyDataset"] = all_df["family"] + all_df["dataset"].astype(str)

In [None]:
# Expert advantage: "in-expert" score minus the "empty" score, regressed on model
# size with one random intercept per family x dataset group.
_is_target = all_df["persona"] == "in-expert"
_is_empty = all_df["persona"] == "empty"
expert_df = all_df[_is_target | _is_empty]

over_df = all_df[_is_target].copy()

# The two selections are paired positionally (both keep all_df's row order).
over_df["score"] = over_df["score"] - all_df[_is_empty]["score"].values

md = smf.mixedlm("score ~ size", over_df, groups=over_df["familyDataset"])

mdf = md.fit()
fit_summary = mdf.summary()
display(fit_summary.tables[0])
coefs = fit_summary.tables[1].astype({"Coef.": "float"})
display(coefs.sort_values("Coef."))

#### Scale effect on OP (broad)

In [None]:
# Expert advantage: "level1" score minus the "empty" score, regressed on model
# size with one random intercept per family x dataset group.
_is_target = all_df["persona"] == "level1"
_is_empty = all_df["persona"] == "empty"
expert_df = all_df[_is_target | _is_empty]

broad_df = all_df[_is_target].copy()

# The two selections are paired positionally (both keep all_df's row order).
broad_df["score"] = broad_df["score"] - all_df[_is_empty]["score"].values

md = smf.mixedlm("score ~ size", broad_df, groups=broad_df["familyDataset"])

mdf = md.fit()
fit_summary = mdf.summary()
display(fit_summary.tables[0])
coefs = fit_summary.tables[1].astype({"Coef.": "float"})
display(coefs.sort_values("Coef."))

#### Scale effect on OP (focused)

In [None]:
# Expert advantage: "level2" score minus the "empty" score, regressed on model
# size with one random intercept per family x dataset group.
_is_target = all_df["persona"] == "level2"
_is_empty = all_df["persona"] == "empty"
expert_df = all_df[_is_target | _is_empty]

focused_df = all_df[_is_target].copy()

# The two selections are paired positionally (both keep all_df's row order).
focused_df["score"] = focused_df["score"] - all_df[_is_empty]["score"].values

md = smf.mixedlm("score ~ size", focused_df, groups=focused_df["familyDataset"])

mdf = md.fit()
fit_summary = mdf.summary()
display(fit_summary.tables[0])
coefs = fit_summary.tables[1].astype({"Coef.": "float"})
display(coefs.sort_values("Coef."))

#### Scale effect on OP (niche)

In [None]:
# Expert advantage: "level3" score minus the "empty" score, regressed on model
# size with one random intercept per family x dataset group.
_is_target = all_df["persona"] == "level3"
_is_empty = all_df["persona"] == "empty"
expert_df = all_df[_is_target | _is_empty]

niche_df = all_df[_is_target].copy()

# The two selections are paired positionally (both keep all_df's row order).
niche_df["score"] = niche_df["score"] - all_df[_is_empty]["score"].values

md = smf.mixedlm("score ~ size", niche_df, groups=niche_df["familyDataset"])

mdf = md.fit()
fit_summary = mdf.summary()
display(fit_summary.tables[0])
coefs = fit_summary.tables[1].astype({"Coef.": "float"})
display(coefs.sort_values("Coef."))

#### Scale effect on Robustness (color)

In [None]:
# One row per (model, dataset); the score column is replaced by the "color"
# robustness metric collected in robs_df. The explicit copy keeps the assignment
# from touching (or warning about) a slice of all_df.
color_rob = all_df.drop_duplicates("modelDataset").copy()

color_rob.loc[:,"score"] = [robs_df["color"].loc[row["dataset"], row["model"]] for _, row  in color_rob.iterrows()]

md = smf.mixedlm("score ~ size", color_rob, groups=color_rob["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Evaluate the fixed-effects variance on the frame the model was fitted on.
marg, cond = compute_r2(mdf, color_rob)

marg, cond, marg/(1- cond + marg)

#### Scale effect on Robustness (name)

In [None]:
# One row per (model, dataset); the score column is replaced by the "name"
# robustness metric collected in robs_df. The explicit copy keeps the assignment
# from touching (or warning about) a slice of all_df.
name_rob = all_df.drop_duplicates("modelDataset").copy()

name_rob.loc[:,"score"] = [robs_df["name"].loc[row["dataset"], row["model"]] for _, row  in name_rob.iterrows()]

md = smf.mixedlm("score ~ size", name_rob, groups=name_rob["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Evaluate the fixed-effects variance on the frame the model was fitted on.
marg, cond = compute_r2(mdf, name_rob)

marg, cond, marg/(1- cond + marg)

#### Scale effect on Fidelity (education)

In [None]:
# One row per (model, dataset); the score column is replaced by the "Education"
# fidelity metric collected in fid_dfs. The explicit copy keeps the assignment
# from touching (or warning about) a slice of all_df.
education_fid = all_df.drop_duplicates("modelDataset").copy()

education_fid.loc[:,"score"] = [fid_dfs["Education"].loc[row["dataset"], row["model"]] for _, row  in education_fid.iterrows()]

md = smf.mixedlm("score ~ size", education_fid, groups=education_fid["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Evaluate the fixed-effects variance on the frame the model was fitted on.
marg, cond = compute_r2(mdf, education_fid)

marg, cond, marg/(1- cond + marg)

#### Scale effect on Fidelity (expertise)

In [None]:
# One row per (model, dataset); the score column is replaced by the "Exp. Domain"
# fidelity metric collected in fid_dfs. The explicit copy keeps the assignment
# from touching (or warning about) a slice of all_df.
expert_fid = all_df.drop_duplicates("modelDataset").copy()

expert_fid.loc[:,"score"] = [fid_dfs["Exp. Domain"].loc[row["dataset"], row["model"]] for _, row  in expert_fid.iterrows()]

md = smf.mixedlm("score ~ size", expert_fid, groups=expert_fid["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Evaluate the fixed-effects variance on the frame the model was fitted on.
marg, cond = compute_r2(mdf, expert_fid)

marg, cond, marg/(1- cond + marg)

#### Scale effect on Fidelity (expertise level)

In [None]:
# One row per (model, dataset); the score column is replaced by the
# "Exp. Specialization" fidelity metric collected in fid_dfs. The explicit copy
# keeps the assignment from touching (or warning about) a slice of all_df.
expert_level_fid = all_df.drop_duplicates("modelDataset").copy()

expert_level_fid.loc[:,"score"] = [fid_dfs["Exp. Specialization"].loc[row["dataset"], row["model"]] for _, row  in expert_level_fid.iterrows()]

md = smf.mixedlm("score ~ size", expert_level_fid, groups=expert_level_fid["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Evaluate the fixed-effects variance on the frame the model was fitted on.
marg, cond = compute_r2(mdf, expert_level_fid)

marg, cond, marg/(1- cond + marg)

#### Combined scale effect

In [None]:
scale_dfs = [over_df, broad_df, focused_df, niche_df, color_rob, name_rob, expert_fid, education_fid, expert_level_fid]

In [None]:
metrics = ["Exp. Advant.\n(static)", "Exp. Advant.\n(broad)", "Exp. Advant.\n(focused)", "Exp. Advant.\n(niche)", "Robustness\n(color)", "Robustness\n(name)", "Fidelity\n(domain match)", "Fidelity\n(education)", "Fidelity\n(expertise level)"]

In [None]:
# Refit "score ~ size" on every frame in scale_dfs and keep only the "size" row of
# each coefficient table, relabelled with the matching entry of `metrics`.
_size_rows = []
for idx, df in enumerate(scale_dfs):
    md = smf.mixedlm("score ~ size", df, groups=df["familyDataset"])

    mdf = md.fit()
    coefs = mdf.summary().tables[1]
    coefs["Coef."] = coefs["Coef."].astype("float")
    coefs = coefs.rename(index=lambda x: metrics[idx] if x == "size" else x)
    _size_rows.append(coefs.loc[[metrics[idx]]])
# Concatenate once instead of growing the frame inside the loop; an empty
# scale_dfs still yields an empty DataFrame.
all_coefs = pd.concat(_size_rows, axis=0) if _size_rows else pd.DataFrame()

In [None]:
all_coefs = all_coefs.sort_values("Coef.")

In [None]:
table= all_coefs.astype("float")

In [None]:
metric_categories = ["Exp. Advant.", "Robustness", "Fidelity"]

In [None]:
table["metric"] = table.index.map(lambda x: x.split("\n")[0]).astype(pd.CategoricalDtype(categories=metric_categories, ordered=True))

In [None]:
table = table.sort_values("metric")

In [None]:
table

In [None]:
table.metric=table.metric.apply(lambda x: "Robustness or Fidelity" if x in ["Robustness", "Fidelity"] else x)

In [None]:
table = table.rename(index=lambda x: x.replace("expertise", "exp.").replace(" match", ""))

In [None]:
# Two stacked panels of size coefficients with their interval bars: expert-advantage
# metrics on top, robustness/fidelity metrics below.
fig, axes = plt.subplots(2, 1, figsize=(7, 5))
for idx, metric in enumerate(["Exp. Advant.", "Robustness or Fidelity"]):
    data = table[table.metric == metric]
    ax = axes[idx]
    # Sort once and reuse the result, so tick labels, points and interval bars
    # share a single row order even when coefficients tie.
    ordered = data.sort_values(["Coef.", "P>|z|"])
    ax1 = sns.stripplot(ordered, x=ordered.index, y=ordered["Coef."], ax=ax, s=5)
    # `pos` (not `idx`) so the panel index of the outer loop is not overwritten.
    for pos, (_, row) in enumerate(ordered.iterrows()):
        ax.plot([pos, pos], [row["[0.025"], row["0.975]"]], 'b-_', markersize=7)
    ax1.set_ylabel("")
    ax1.set_xlabel("")
    ax1.axhline(y=0.00,color='black',linewidth=1,linestyle='--')
    ax.set_xticklabels(ax.get_xticklabels(), fontproperties=bold_font, fontsize=14)
fig.supylabel("Model scale effect", fontproperties=bold_font, fontsize=16)
plt.subplots_adjust(wspace=0., hspace=.4, left=.12)

In [None]:
#fig.savefig("../persona_performance_paper/media/scale_coefs.pdf", bbox_inches="tight")

#### Effect of domain match

In [None]:
# Encode the three domain personas as an ordinal score (0, 1, 2) and regress the
# score on it with one random intercept per model x dataset group.
levels = {"out-expert": 0, "experts": 1, "in-expert": 2}

domain_df = all_df[all_df.persona.isin(levels.keys())].copy()
domain_df["persona"] = domain_df["persona"].map(levels)

md = smf.mixedlm("score ~ persona", domain_df, groups=domain_df["modelDataset"])

mdf = md.fit()
fit_summary = mdf.summary()
display(fit_summary.tables[0])
coefs = fit_summary.tables[1].astype({"Coef.": "float"})
display(coefs.sort_values("Coef."))

marg, cond = compute_r2(mdf, domain_df)

marg, cond, marg/(1- cond + marg)

#### Effect of expertise-level

In [None]:
# Encode "level1".."level3" as 0..2 (trailing digit minus one) and regress the
# score on it with one random intercept per model x dataset group.
levels = ["level1", "level2", "level3"]

level_df = all_df[all_df.persona.isin(levels)].copy()
level_df["persona"] = level_df["persona"].str[-1].astype(int) - 1

md = smf.mixedlm("score ~ persona", level_df, groups=level_df["modelDataset"])

mdf = md.fit()
fit_summary = mdf.summary()
display(fit_summary.tables[0])
coefs = fit_summary.tables[1].astype({"Coef.": "float"})
display(coefs.sort_values("Coef."))

marg, cond = compute_r2(mdf, level_df)

marg, cond, marg/(1- cond + marg)

#### Effect of education

In [None]:
# Encode each education persona by its position in EDUCATION_PERSONAS and regress
# the score on it with one random intercept per model x dataset group.
_mentions_education = all_df.persona.str.contains("educ")
edu_df = all_df[_mentions_education].copy()
edu_df["persona"] = edu_df["persona"].apply(lambda name: EDUCATION_PERSONAS.index(name)).astype(int)

md = smf.mixedlm("score ~ persona", edu_df, groups=edu_df["modelDataset"])

mdf = md.fit()
fit_summary = mdf.summary()
display(fit_summary.tables[0])
coefs = fit_summary.tables[1].astype({"Coef.": "float"})
display(coefs.sort_values("Coef."))

marg, cond = compute_r2(mdf, edu_df)

marg, cond, marg/(1- cond + marg)

### Instruction

In [None]:
prefix = "./results/instruction"

In [None]:
# Reload the result artifacts, now from the directory currently held in `prefix`.
# Each file is opened in a `with` block so its handle is closed right after loading.
# NOTE: unpickling can execute arbitrary code; only load files produced by this project.
with open(f"{prefix}/all_pvalues.pkl", "rb") as fh:
    all_pvalues = pickle.load(fh)
with open(f"{prefix}/all_metrics.pkl", "rb") as fh:
    all_metrics = pickle.load(fh)
with open(f"{prefix}/all_results.pkl", "rb") as fh:
    all_results = pickle.load(fh)
with open(f"{prefix}/fidelity_significances.pkl", "rb") as fh:
    fidelity_significance = pickle.load(fh)
with open(f"{prefix}/fidelity_intervals.pkl", "rb") as fh:
    fidelity_intervals = pickle.load(fh)


In [None]:
# Expertise heatmaps (instruction prompt): one panel per expert-persona variant.
# Every cell is annotated with the score difference (x100). Cells whose p-value is
# above 0.05 are left uncoloured and covered with a faint gray overlay.
fig, axes = plt.subplots(1, 4, sharex=True, sharey=True, figsize=(20,15), gridspec_kw={})
name_mapping = {"static": "Static\n(e.g., expert in fact-checking)", "level1": "Broad\n(e.g., expert in math)", "level2": "Focused\n(e.g., expert in abstract algebra)", "level3": "Niche\n(e.g., expert in group theory)"}
for idx, expert in enumerate(["static", "level1", "level2", "level3"]):
    ax = axes[idx]
    op_df = pd.DataFrame(columns=MODEL_ORDER)
    pvalues =  pd.DataFrame(columns=MODEL_ORDER)
    if expert=="static":
        # Static panel: precomputed "OP" metric paired with the "in-expert" p-values.
        for dataset, metrics in all_results.items():
            op_df.loc[dataset] = all_metrics[dataset].loc["OP"]
            df = all_pvalues[dataset]
            pvalues.loc[dataset] = df.loc["in-expert"]
    else:
        # Level panels: persona row minus the "empty" row of the raw results.
        for dataset, metrics in all_results.items():
            op_df.loc[dataset] = metrics.loc[expert] - metrics.loc["empty"]
            df = all_pvalues[dataset]
            pvalues.loc[dataset] = df.loc[expert]
    # Blank out (NaN) the colour of every cell with p > 0.05; annotations still use op_df.
    op_masked = op_df.mask(pvalues > .05, np.nan)
    # Shorten labels: drop the last "-" suffix of model names and the dataset prefix.
    op_masked = op_masked.rename(columns=lambda x: x.rsplit("-", maxsplit=1)[0])
    op_masked = op_masked.rename(index=lambda x: x.replace("contextual_parametric_", ""))
    sns.heatmap(op_masked, cmap=sns.diverging_palette(260, 30, s=100, center='light', as_cmap=True), vmin=-.1, vmax=.1,center=0, cbar=False,annot=op_df*100, fmt=".1f", square=True, linewidth=1, linecolor="gray", ax=ax)
    x= np.arange(op_masked.shape[1]+1)
    y= np.arange(op_masked.shape[0]+1)
    # Gray overlay: every p-value that is not < 0.05 becomes 1, then values < 0.05
    # are masked so only the remaining cells are painted.
    pvalues = pvalues.map(lambda x: x if x <0.05 else 1)
    zm = np.ma.masked_less(pvalues.loc[op_df.index].values, 0.05)
    ax.pcolor(x, y, zm , cmap="Grays_r", alpha=.02)
    # Black separator lines at hard-coded positions.
    # NOTE(review): presumably model-family / dataset-group boundaries -- confirm
    # against MODEL_ORDER and the dataset order.
    ax.vlines([0, 3, 6, 9], *ax.get_ylim(), lw=1, color="black")
    # hlines takes (y, xmin, xmax), so the horizontal extent comes from the x-limits.
    ax.hlines([0, 1, 2, 16,20,27], *ax.get_xlim(), lw=1, color="black")
    ax.set_title(name_mapping[expert],fontproperties=bold_font,fontsize=18)

    # NOTE(review): the x axis is shared (sharex=True), so clearing ticks here may
    # affect every panel -- confirm this is intended for a single-row layout.
    if idx//2 == 0: ax.get_xaxis().set_ticks([])
fig.suptitle("Instruction", fontproperties=bold_font, y=.93,fontsize=20)
plt.subplots_adjust(wspace=.05, hspace=0)
#fig.tight_layout()

In [None]:
#ax.get_figure().savefig("../persona_performance_paper/media/expertise_inst_tasks_heatmap.pdf", bbox_inches="tight")

In [None]:
# Robustness heatmaps (instruction prompt): one panel per persona family ("color", "name").
# Cells show the "WU_<family>" metric (x100). The p-value of a cell is the one of the
# persona returned by worst_case_utility(..., return_persona=True) for that model.
fig, axes = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(10.5,15), gridspec_kw={})
category_mapping = {"color": COLOR_PERSONAS, "name": NAMES}
robs_df = {}  # metric table per family
for idx, rob in enumerate(["color", "name"]):
    ax = axes[idx]
    rob_df = pd.DataFrame(columns=MODEL_ORDER)
    pvalues =  pd.DataFrame(columns=MODEL_ORDER)
    for dataset, metrics in all_results.items():
        rob_df.loc[dataset] = all_metrics[dataset].loc[f"WU_{rob}"]
        df = all_pvalues[dataset]
        worst = worst_case_utility(all_results[dataset], category_mapping[rob], return_persona=True)[1]
        for model in MODEL_ORDER:
            pvalues.loc[dataset, model] = df.loc[worst[model], model]
    robs_df[rob] = rob_df
    # Blank out (NaN) the colour of every cell with p > 0.05; annotations still use rob_df.
    rob_masked = rob_df.mask(pvalues > .05, np.nan)
    # Shorten labels: drop the last "-" suffix of model names and the dataset prefix.
    rob_masked = rob_masked.rename(columns=lambda x: x.rsplit("-", maxsplit=1)[0])
    rob_masked = rob_masked.rename(index=lambda x: x.replace("contextual_parametric_", ""))
    sns.heatmap(rob_masked,cmap=sns.diverging_palette(260, 30, s=100, center='light', as_cmap=True), vmin=-.1, vmax=.1,center=0, cbar=False,annot=rob_df*100, fmt=".1f", square=True, linewidth=1, linecolor="gray", ax=ax)
    x= np.arange(rob_masked.shape[1]+1)
    y= np.arange(rob_masked.shape[0]+1)
    # Gray overlay: every p-value that is not < 0.05 becomes 1, then values < 0.05
    # are masked so only the remaining cells are painted.
    pvalues = pvalues.map(lambda x: x if x <0.05 else 1)
    zm = np.ma.masked_less(pvalues.loc[rob_df.index].values, 0.05)
    ax.pcolor(x, y, zm , cmap="Grays_r", alpha=.02)
    # Black separator lines at hard-coded positions.
    # NOTE(review): presumably model-family / dataset-group boundaries -- confirm.
    ax.vlines([0, 3, 6, 9], *ax.get_ylim(), lw=1, color="black")
    # hlines takes (y, xmin, xmax), so both calls take their extent from the x-limits.
    ax.hlines([0, 5], *ax.get_xlim(), lw=1, color="black")
    ax.hlines([0, 1, 2, 16,20,27], *ax.get_xlim(), lw=1, color="black")
    ax.set_title(rob.capitalize(),fontproperties=bold_font,fontsize=18)
fig.suptitle("Instruction", fontproperties=bold_font, y=.93,fontsize=20)
plt.subplots_adjust(wspace=.05, hspace=0)
#fig.tight_layout()

In [None]:
#ax.get_figure().savefig("../persona_performance_paper/media/robustness_inst_tasks_heatmap.pdf", bbox_inches="tight")

In [None]:
# Fidelity heatmaps (instruction prompt): one panel per fidelity metric.
# Cells show the "Fid_<key>" metric (x100); cells whose entry in
# fidelity_significance equals False are left uncoloured.
fig, axes = plt.subplots(1, 3, sharey=True, figsize=(15,15))
category_mapping = {"Exp. Domain": "Exp", "Exp. Specialization": "ExpLevel", "Education": "Ed"}
fid_dfs = {}  # metric table per panel
for idx, fid in enumerate(["Exp. Domain", "Exp. Specialization", "Education"]):
    ax = axes[idx]
    fid_df = pd.DataFrame(columns=MODEL_ORDER)
    pvalues =  pd.DataFrame(columns=MODEL_ORDER)
    for dataset, metrics in all_results.items():
        fid_df.loc[dataset] = all_metrics[dataset].loc[f"Fid_{category_mapping[fid]}"]
        pvalues.loc[dataset] = fidelity_significance[dataset].loc[f"Fid_{category_mapping[fid]}"]
    fid_dfs[fid]=fid_df
    fid_masked = fid_df.mask(pvalues ==False, np.nan)
    # Shorten labels: drop the last "-" suffix of model names and the dataset prefix.
    fid_masked = fid_masked.rename(columns=lambda x: x.rsplit("-", maxsplit=1)[0])
    fid_masked = fid_masked.rename(index=lambda x: x.replace("contextual_parametric_", ""))
    sns.heatmap(fid_masked,cmap=sns.diverging_palette(260, 30, s=100, center='light', as_cmap=True),center=0, cbar=False,annot=fid_df*100, fmt=".0f", square=True, linewidth=1, linecolor="gray", ax=ax)
    # No gray overlay is drawn in this figure, so no overlay grid or mask is built
    # here; the cell therefore does not depend on variables from other cells.
    # Black separator lines at hard-coded positions.
    # NOTE(review): presumably model-family / dataset-group boundaries -- confirm.
    ax.vlines([0, 3, 6, 9], *ax.get_ylim(), lw=1, color="black")
    # hlines takes (y, xmin, xmax), so the horizontal extent comes from the x-limits.
    ax.hlines([0, 1, 2, 16,20,27], *ax.get_xlim(), lw=1, color="black")
    ax.set_title(fid.capitalize(), fontproperties=bold_font, fontsize=18)
fig.suptitle("Instruction", fontproperties=bold_font, fontsize=20,y=.93)
plt.subplots_adjust(wspace=.05, hspace=0)
#fig.tight_layout()

In [None]:
#ax.get_figure().savefig("../persona_performance_paper/media/fidelity_inst_tasks_heatmap.pdf", bbox_inches="tight")

In [None]:
# Long-format table of raw scores: one row per (persona, model, dataset).
kept_personas = EDUCATION_PERSONAS + COLOR_PERSONAS + NAMES + ["empty", "in-expert", "experts", "out-expert", "level1", "level2", "level3" ]
per_dataset = []
for k, v in all_results.items():
    # stack() turns the persona x model table into one (persona, model, score) row per cell.
    df = v.stack().reset_index()
    df.columns = ["persona", "model", "score"]
    # .assign returns a new frame, so the dataset column is never written into a slice of df.
    df = df[df.persona.isin(kept_personas)].assign(dataset=k)
    per_dataset.append(df)
# Concatenate once instead of growing all_df inside the loop.
all_df = pd.concat(per_dataset, axis=0) if per_dataset else pd.DataFrame()

In [None]:
all_df["modelDataset"] = all_df["model"] + all_df["dataset"].astype(str)

In [None]:
cats = []
for p in all_df.persona.tolist():
    if p in COLOR_PERSONAS: cats.append("color")
    elif p in NAMES: cats.append("name")
    # elif p in EDUCATION_PERSONAS: cats.append("education")
    else: cats.append(p)

In [None]:
all_df["category"] = cats

In [None]:
# Linear mixed model: score explained by persona category (reference level 'empty'),
# grouped by the model+dataset key.
md = smf.mixedlm("score ~ C(category, Treatment(reference='empty'))", all_df, groups=all_df["modelDataset"])

In [None]:
# Fit, then show the model overview (tables[0]) and the coefficient table sorted by estimate.
mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
# Cast to float so the sort below is numeric.
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

In [None]:
# Fresh copy of the coefficient table, used for plotting.
table = mdf.summary().tables[1]

In [None]:
# Keep only rows whose label contains "reference", i.e. the category terms of the
# formula above, and cast every column to float.
table = table.loc[[x for x in table.index if "reference" in x]].astype("float")

In [None]:
# NOTE(review): plot_errorbars is a project helper; (6,4) is presumably the figure size — confirm.
ax = plot_errorbars(table, (6,4))

In [None]:
# NOTE(review): compute_r2 is a project helper; the names suggest marginal and
# conditional R^2 — confirm.
marg, cond = compute_r2(mdf, all_df)

In [None]:
# Show both values together with the ratio marg / (1 - cond + marg).
marg, cond, marg/(1- cond + marg)

In [None]:
# Restrict to two models: MODEL_ORDER[5] and the last entry of MODEL_ORDER.
large_only = all_df[all_df.model.isin([MODEL_ORDER[5], MODEL_ORDER[-1]])]

In [None]:
# Category model (reference level 'empty') fitted on the two-model subset only.
md = smf.mixedlm("score ~ C(category, Treatment(reference='empty'))", large_only, groups=large_only["modelDataset"])

In [None]:
# Fit, then show the model overview and the coefficient table sorted by estimate.
mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
# Cast to float so the sort below is numeric.
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

In [None]:
# Keep only the rows whose label contains "reference" (the category terms) as floats.
table = mdf.summary().tables[1]
table = table.loc[[x for x in table.index if "reference" in x]].astype("float")

In [None]:
# NOTE(review): plot_errorbars is a project helper; (6,4) is presumably the figure size — confirm.
ax = plot_errorbars(table, (6,4))

#### Scale effect on OP

In [None]:
# Ordinal size rank for each model, listed in MODEL_ORDER order.
# zip() stops at the shorter input, so sizes must have one entry per model.
sizes =  [1, 2, 3, 1, 2, 4, 1, 2, 4]
size_map = {k:v  for v, k in zip(sizes, MODEL_ORDER)}

In [None]:
# Attach the size rank to every row; an unknown model name raises KeyError here.
all_df["size"] = all_df.model.apply(lambda x: size_map[x]).astype("int")

In [None]:
families = ["gemma", "llama", "qwen"]

In [None]:
# Family is derived from position: each consecutive block of three MODEL_ORDER
# entries maps to one entry of `families`.
all_df["family"] = all_df.model.apply(lambda x: families[MODEL_ORDER.index(x) // 3])

In [None]:
# Grouping key for the scale models: family name concatenated with dataset name.
all_df["familyDataset"] = all_df["family"] + all_df["dataset"].astype(str)

In [None]:
# Only the two personas needed for the in-expert vs. empty comparison.
expert_df = all_df[all_df["persona"].isin(["in-expert", "empty"])]

In [None]:
over_df = expert_df[expert_df["persona"] == "in-expert"].copy()

In [None]:
# Overwrite score with (in-expert - empty). `.values` drops the index, so the
# subtraction is positional; assumes both selections list the same
# (dataset, model) pairs in the same order — TODO confirm.
over_df.score = expert_df[expert_df["persona"] == "in-expert"].score-  expert_df[expert_df["persona"] == "empty"].score.values

In [None]:
# Mixed model of the score difference on the size rank, grouped by family+dataset.
md = smf.mixedlm("score ~ size", over_df, groups=over_df["familyDataset"])

In [None]:
# Fit, then show the model overview and the coefficient table sorted by estimate.
mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

#### Scale effect on Robustness (color)

In [None]:
# One row per model+dataset key. .copy() makes this an independent frame, so the
# score overwrite below does not act on a slice of all_df (SettingWithCopyWarning).
color_rob = all_df.drop_duplicates("modelDataset").copy()

# Replace the raw persona score with the color-robustness value for that dataset/model.
color_rob.loc[:,"score"] = [robs_df["color"].loc[dataset, model] for dataset, model in zip(color_rob["dataset"], color_rob["model"])]

# Mixed model of robustness on the size rank, grouped by family+dataset.
md = smf.mixedlm("score ~ size", color_rob, groups=color_rob["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Pass the frame the model was fitted on (all_df was passed before), as the
# "Effect of ..." cells do with their own frames.
# NOTE(review): compute_r2 is a project helper — confirm it expects the fitted frame.
marg, cond = compute_r2(mdf, color_rob)

marg, cond, marg/(1- cond + marg)

#### Scale effect on Robustness (name)

In [None]:
# One row per model+dataset key. .copy() makes this an independent frame, so the
# score overwrite below does not act on a slice of all_df (SettingWithCopyWarning).
name_rob = all_df.drop_duplicates("modelDataset").copy()

# Replace the raw persona score with the name-robustness value for that dataset/model.
name_rob.loc[:,"score"] = [robs_df["name"].loc[dataset, model] for dataset, model in zip(name_rob["dataset"], name_rob["model"])]

# Mixed model of robustness on the size rank, grouped by family+dataset.
md = smf.mixedlm("score ~ size", name_rob, groups=name_rob["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Pass the frame the model was fitted on (all_df was passed before), as the
# "Effect of ..." cells do with their own frames.
# NOTE(review): compute_r2 is a project helper — confirm it expects the fitted frame.
marg, cond = compute_r2(mdf, name_rob)

marg, cond, marg/(1- cond + marg)

#### Scale effect on Fidelity (education)

In [None]:
# One row per model+dataset key. .copy() makes this an independent frame, so the
# score overwrite below does not act on a slice of all_df (SettingWithCopyWarning).
education_fid = all_df.drop_duplicates("modelDataset").copy()

# Replace the raw persona score with the education-fidelity value for that dataset/model.
education_fid.loc[:,"score"] = [fid_dfs["Education"].loc[dataset, model] for dataset, model in zip(education_fid["dataset"], education_fid["model"])]

# Mixed model of fidelity on the size rank, grouped by family+dataset.
md = smf.mixedlm("score ~ size", education_fid, groups=education_fid["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Pass the frame the model was fitted on (all_df was passed before), as the
# "Effect of ..." cells do with their own frames.
# NOTE(review): compute_r2 is a project helper — confirm it expects the fitted frame.
marg, cond = compute_r2(mdf, education_fid)

marg, cond, marg/(1- cond + marg)

#### Scale effect on Fidelity (expertise)

In [None]:
# One row per model+dataset key. .copy() makes this an independent frame, so the
# score overwrite below does not act on a slice of all_df (SettingWithCopyWarning).
expert_fid = all_df.drop_duplicates("modelDataset").copy()

# Replace the raw persona score with the expertise-domain fidelity for that dataset/model.
expert_fid.loc[:,"score"] = [fid_dfs["Exp. Domain"].loc[dataset, model] for dataset, model in zip(expert_fid["dataset"], expert_fid["model"])]

# Mixed model of fidelity on the size rank, grouped by family+dataset.
md = smf.mixedlm("score ~ size", expert_fid, groups=expert_fid["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Pass the frame the model was fitted on (all_df was passed before), as the
# "Effect of ..." cells do with their own frames.
# NOTE(review): compute_r2 is a project helper — confirm it expects the fitted frame.
marg, cond = compute_r2(mdf, expert_fid)

marg, cond, marg/(1- cond + marg)

#### Scale effect on Fidelity (expertise level)

In [None]:
# One row per model+dataset key. .copy() makes this an independent frame, so the
# score overwrite below does not act on a slice of all_df (SettingWithCopyWarning).
expert_level_fid = all_df.drop_duplicates("modelDataset").copy()

# Replace the raw persona score with the expertise-specialization fidelity for that dataset/model.
expert_level_fid.loc[:,"score"] = [fid_dfs["Exp. Specialization"].loc[dataset, model] for dataset, model in zip(expert_level_fid["dataset"], expert_level_fid["model"])]

# Mixed model of fidelity on the size rank, grouped by family+dataset.
# NOTE(review): "0+size" drops the fixed intercept, unlike the other scale-effect
# models ("score ~ size") — confirm this is intended.
md = smf.mixedlm("score ~ 0+size", expert_level_fid, groups=expert_level_fid["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Pass the frame the model was fitted on (all_df was passed before), as the
# "Effect of ..." cells do with their own frames.
# NOTE(review): compute_r2 is a project helper — confirm it expects the fitted frame.
marg, cond = compute_r2(mdf, expert_level_fid)

marg, cond, marg/(1- cond + marg)

#### Effect of domain matchiness

In [None]:
# Ordinal code per expert persona (1 = out-expert, 2 = experts, 3 = in-expert).
levels = {"out-expert": 1, "experts": 2, "in-expert": 3}

domain_df = all_df[all_df["persona"].isin(list(levels))].copy()

# Every remaining persona is a key of `levels`, so the mapping is total.
domain_df["persona"] = domain_df["persona"].map(levels)

np.unique(domain_df["persona"])

# Mixed model of score on the ordinal code (no fixed intercept), grouped by model+dataset.
md = smf.mixedlm("score ~ 0 + persona", data=domain_df, groups=domain_df["modelDataset"])

mdf = md.fit()
summary_tables = mdf.summary().tables
display(summary_tables[0])
coefs = summary_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values(by="Coef."))

marg, cond = compute_r2(mdf, domain_df)

marg, cond, marg/(1- cond + marg)

#### Effect of expertise-level

In [None]:
# Persona names "level1" .. "level3".
levels = [f"level{i}" for i in (1, 2, 3)]

level_df = all_df[all_df["persona"].isin(levels)].copy()

# The trailing character of the persona name is its numeric level.
level_df["persona"] = level_df["persona"].str.get(-1).astype(int)

# Mixed model of score on the numeric level (no fixed intercept), grouped by model+dataset.
md = smf.mixedlm("score ~ 0+persona", data=level_df, groups=level_df["modelDataset"])

mdf = md.fit()
summary_tables = mdf.summary().tables
display(summary_tables[0])
coefs = summary_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values(by="Coef."))

marg, cond = compute_r2(mdf, level_df)

marg, cond, marg/(1- cond + marg)

#### Effect of education

In [None]:
# Rows whose persona name contains "educ".
is_education = all_df["persona"].str.contains("educ")
edu_df = all_df[is_education].copy()

# Ordinal code = 1-based position of the persona in EDUCATION_PERSONAS.
edu_df["persona"] = edu_df["persona"].apply(lambda name: EDUCATION_PERSONAS.index(name) + 1).astype(int)

# Mixed model of score on the ordinal code (no fixed intercept), grouped by model+dataset.
md = smf.mixedlm("score ~ 0 + persona", data=edu_df, groups=edu_df["modelDataset"])

mdf = md.fit()
summary_tables = mdf.summary().tables
display(summary_tables[0])
coefs = summary_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values(by="Coef."))

marg, cond = compute_r2(mdf, edu_df)

marg, cond, marg/(1- cond + marg)

### Refine

In [None]:
# Point the pickle loads that follow at the "refine" results directory.
prefix = "./results/refine"

In [None]:
# Reload every result object from the directory in `prefix`. Each file is opened in
# a `with` block so its handle is closed after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(f"{prefix}/all_pvalues.pkl", "rb") as fh:
    all_pvalues = pickle.load(fh)
with open(f"{prefix}/all_metrics.pkl", "rb") as fh:
    all_metrics = pickle.load(fh)
with open(f"{prefix}/all_results.pkl", "rb") as fh:
    all_results = pickle.load(fh)
with open(f"{prefix}/fidelity_significances.pkl", "rb") as fh:
    fidelity_significance = pickle.load(fh)
with open(f"{prefix}/fidelity_intervals.pkl", "rb") as fh:
    fidelity_intervals = pickle.load(fh)


In [None]:
# Expertise heatmaps: one panel per expert persona type, datasets as rows, models as columns.
fig, axes = plt.subplots(1, 4, sharex=True, sharey=True, figsize=(20,15), gridspec_kw={})
name_mapping = {"static": "Static\n(e.g., expert in fact-checking)", "level1": "Broad\n(e.g., expert in math)", "level2": "Focused\n(e.g., expert in abstract algebra)", "level3": "Niche\n(e.g., expert in group theory)"}
for idx, expert in enumerate(["static", "level1", "level2", "level3"]):
    ax = axes[idx]
    op_df = pd.DataFrame(columns=MODEL_ORDER)
    pvalues = pd.DataFrame(columns=MODEL_ORDER)
    for dataset, metrics in all_results.items():
        if expert == "static":
            # Static expert: precomputed "OP" metric row and the "in-expert" p-values.
            op_df.loc[dataset] = all_metrics[dataset].loc["OP"]
            pvalues.loc[dataset] = all_pvalues[dataset].loc["in-expert"]
        else:
            # Level personas: score difference to the "empty" persona, computed here.
            op_df.loc[dataset] = metrics.loc[expert] - metrics.loc["empty"]
            pvalues.loc[dataset] = all_pvalues[dataset].loc[expert]
    # Blank the colour of cells with p > .05; the annotations (values x 100) still
    # show every cell.
    op_masked = op_df.mask(pvalues > .05, np.nan)
    # Shorten labels: drop the last "-"-separated token of the model name and the dataset prefix.
    op_masked = op_masked.rename(
        columns=lambda x: x.rsplit("-", maxsplit=1)[0],
        index=lambda x: x.replace("contextual_parametric_", ""),
    )
    sns.heatmap(op_masked, cmap=sns.diverging_palette(260, 30, s=100, center='light', as_cmap=True), vmin=-.1, vmax=.1,center=0, cbar=False,annot=op_df*100, fmt=".1f", square=True, linewidth=1, linecolor="gray", ax=ax)
    # Faint overlay on the cells with p >= 0.05 (cells below 0.05 are masked out of the overlay).
    x= np.arange(op_masked.shape[1]+1)
    y= np.arange(op_masked.shape[0]+1)
    pvalues = pvalues.map(lambda x: x if x <0.05 else 1)
    zm = np.ma.masked_less(pvalues.loc[op_df.index].values, 0.05)
    ax.pcolor(x, y, zm , cmap="Grays_r", alpha=.02)
    # Black separators: vertical every three model columns, horizontal at fixed row offsets.
    ax.vlines([0, 3, 6, 9], *ax.get_ylim(), lw=1, color="black")
    # hlines takes (y, xmin, xmax): span the x-limits (the y-limits were passed before).
    ax.hlines([0, 1, 2, 16,20,27], *ax.get_xlim(), lw=1, color="black")
    ax.set_title(name_mapping[expert],fontproperties=bold_font,fontsize=18)

    # NOTE(review): in a 1x4 grid this targets panels 0 and 1 only; looks like a
    # leftover from a 2x2 layout — confirm.
    if idx//2 == 0: ax.get_xaxis().set_ticks([])
fig.suptitle("Refine + Instruction", fontproperties=bold_font, y=.93,fontsize=20)
plt.subplots_adjust(wspace=.05, hspace=0)

In [None]:
#ax.get_figure().savefig("../persona_performance_paper/media/expertise_refine_tasks_heatmap.pdf", bbox_inches="tight")

In [None]:
# Robustness heatmaps: one panel per irrelevant-attribute persona group (color, name),
# datasets as rows, models as columns.
fig, axes = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(10.5,15), gridspec_kw={})
category_mapping = {"color": COLOR_PERSONAS, "name": NAMES}
# Unmasked robustness table per group; the scale-effect cells further down read from this dict.
robs_df = {}
for idx, rob in enumerate(["color", "name"]):
    ax = axes[idx]
    rob_df = pd.DataFrame(columns=MODEL_ORDER)
    pvalues = pd.DataFrame(columns=MODEL_ORDER)
    for dataset, metrics in all_results.items():
        rob_df.loc[dataset] = all_metrics[dataset].loc[f"WU_{rob}"]
        dataset_pvalues = all_pvalues[dataset]
        # Second return value is indexed by model and yields a persona label
        # (presumably the worst-scoring persona of the group — confirm in worst_case_utility).
        worst = worst_case_utility(metrics, category_mapping[rob], return_persona=True)[1]
        for model in MODEL_ORDER:
            pvalues.loc[dataset, model] = dataset_pvalues.loc[worst[model], model]
    robs_df[rob] = rob_df
    # Blank the colour of cells with p > .05; the annotations (values x 100) still
    # show every cell.
    rob_masked = rob_df.mask(pvalues > .05, np.nan)
    # Shorten labels: drop the last "-"-separated token of the model name and the dataset prefix.
    rob_masked = rob_masked.rename(
        columns=lambda x: x.rsplit("-", maxsplit=1)[0],
        index=lambda x: x.replace("contextual_parametric_", ""),
    )
    sns.heatmap(rob_masked,cmap=sns.diverging_palette(260, 30, s=100, center='light', as_cmap=True), vmin=-.1, vmax=.1,center=0, cbar=False,annot=rob_df*100, fmt=".1f", square=True, linewidth=1, linecolor="gray", ax=ax)
    # Faint overlay on the cells with p >= 0.05 (cells below 0.05 are masked out of the overlay).
    x= np.arange(rob_masked.shape[1]+1)
    y= np.arange(rob_masked.shape[0]+1)
    pvalues = pvalues.map(lambda x: x if x <0.05 else 1)
    zm = np.ma.masked_less(pvalues.loc[rob_df.index].values, 0.05)
    ax.pcolor(x, y, zm , cmap="Grays_r", alpha=.02)
    # Black separators: vertical every three model columns, horizontal at fixed row offsets.
    ax.vlines([0, 3, 6, 9], *ax.get_ylim(), lw=1, color="black")
    # NOTE(review): the rule at y=5 has no counterpart in the other heatmaps — confirm it is intended.
    ax.hlines([0, 5], *ax.get_xlim(), lw=1, color="black")
    # hlines takes (y, xmin, xmax): span the x-limits (the y-limits were passed before).
    ax.hlines([0, 1, 2, 16,20,27], *ax.get_xlim(), lw=1, color="black")
    ax.set_title(rob.capitalize(),fontproperties=bold_font,fontsize=18)
fig.suptitle("Refine + Instruction", fontproperties=bold_font, y=.93,fontsize=20)
plt.subplots_adjust(wspace=.05, hspace=0)

In [None]:
#ax.get_figure().savefig("../persona_performance_paper/media/robustness_refine_tasks_heatmap.pdf", bbox_inches="tight")

In [None]:
# Fidelity heatmaps: one panel per fidelity metric, datasets as rows, models as columns.
fig, axes = plt.subplots(1, 3, sharey=True, figsize=(15,15))
# Panel title -> suffix of the matching `Fid_*` row in all_metrics / fidelity_significance.
category_mapping = {"Exp. Domain": "Exp", "Exp. Specialization": "ExpLevel", "Education": "Ed"}
# Unmasked fidelity table per panel; the scale-effect cells further down read from this dict.
fid_dfs = {}
for idx, fid in enumerate(["Exp. Domain", "Exp. Specialization", "Education"]):
    ax = axes[idx]
    metric_row = f"Fid_{category_mapping[fid]}"
    fid_df = pd.DataFrame(columns=MODEL_ORDER)
    pvalues = pd.DataFrame(columns=MODEL_ORDER)
    for dataset in all_results:
        fid_df.loc[dataset] = all_metrics[dataset].loc[metric_row]
        pvalues.loc[dataset] = fidelity_significance[dataset].loc[metric_row]
    fid_dfs[fid] = fid_df
    # Blank the colour of cells flagged False in fidelity_significance
    # (presumably boolean significance flags — confirm); annotations still show every value.
    fid_masked = fid_df.mask(pvalues == False, np.nan)
    # Shorten labels: drop the last "-"-separated token of the model name and the dataset prefix.
    fid_masked = fid_masked.rename(
        columns=lambda x: x.rsplit("-", maxsplit=1)[0],
        index=lambda x: x.replace("contextual_parametric_", ""),
    )
    sns.heatmap(fid_masked,cmap=sns.diverging_palette(260, 30, s=100, center='light', as_cmap=True),center=0, cbar=False,annot=fid_df*100, fmt=".0f", square=True, linewidth=1, linecolor="gray", ax=ax)
    # The overlay arrays previously built here were never drawn and read `rob_masked`
    # from a different cell, so they have been removed.
    # Black separators: vertical every three model columns, horizontal at fixed row offsets.
    ax.vlines([0, 3, 6, 9], *ax.get_ylim(), lw=1, color="black")
    # hlines takes (y, xmin, xmax): span the x-limits (the y-limits were passed before).
    ax.hlines([0, 1, 2, 16,20,27], *ax.get_xlim(), lw=1, color="black")
    ax.set_title(fid.capitalize(), fontproperties=bold_font, fontsize=18)
fig.suptitle("Refine + Instruction", fontproperties=bold_font, fontsize=20,y=.93)
plt.subplots_adjust(wspace=.05, hspace=0)

In [None]:
#ax.get_figure().savefig("../persona_performance_paper/media/fidelity_refine_tasks_heatmap.pdf", bbox_inches="tight")

In [None]:
# Long-format table of raw scores: one row per (persona, model, dataset).
kept_personas = EDUCATION_PERSONAS + COLOR_PERSONAS + NAMES + ["empty", "in-expert", "experts", "out-expert", "level1", "level2", "level3" ]
per_dataset = []
for k, v in all_results.items():
    # stack() turns the persona x model table into one (persona, model, score) row per cell.
    df = v.stack().reset_index()
    df.columns = ["persona", "model", "score"]
    # .assign returns a new frame, so the dataset column is never written into a slice of df.
    df = df[df.persona.isin(kept_personas)].assign(dataset=k)
    per_dataset.append(df)
# Concatenate once instead of growing all_df inside the loop.
all_df = pd.concat(per_dataset, axis=0) if per_dataset else pd.DataFrame()

In [None]:
all_df["modelDataset"] = all_df["model"] + all_df["dataset"].astype(str)

In [None]:
cats = []
for p in all_df.persona.tolist():
    if p in COLOR_PERSONAS: cats.append("color")
    elif p in NAMES: cats.append("name")
    # elif p in EDUCATION_PERSONAS: cats.append("education")
    else: cats.append(p)

In [None]:
all_df["category"] = cats

In [None]:
# Linear mixed model: score explained by persona category (reference level 'empty'),
# grouped by the model+dataset key.
md = smf.mixedlm("score ~ C(category, Treatment(reference='empty'))", all_df, groups=all_df["modelDataset"])

In [None]:
# Fit, then show the model overview (tables[0]) and the coefficient table sorted by estimate.
mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
# Cast to float so the sort below is numeric.
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

In [None]:
# Fresh copy of the coefficient table, used for plotting.
table = mdf.summary().tables[1]

In [None]:
# Keep only rows whose label contains "reference", i.e. the category terms of the
# formula above, and cast every column to float.
table = table.loc[[x for x in table.index if "reference" in x]].astype("float")

In [None]:
# NOTE(review): plot_errorbars is a project helper; (6,4) is presumably the figure size — confirm.
ax = plot_errorbars(table, (6,4))

In [None]:
# NOTE(review): compute_r2 is a project helper; the names suggest marginal and
# conditional R^2 — confirm.
marg, cond = compute_r2(mdf, all_df)

In [None]:
# Show both values together with the ratio marg / (1 - cond + marg).
marg, cond, marg/(1- cond + marg)

In [None]:
# Restrict to two models: MODEL_ORDER[5] and the last entry of MODEL_ORDER.
large_only = all_df[all_df.model.isin([MODEL_ORDER[5], MODEL_ORDER[-1]])]

In [None]:
# Category model (reference level 'empty') fitted on the two-model subset only.
md = smf.mixedlm("score ~ C(category, Treatment(reference='empty'))", large_only, groups=large_only["modelDataset"])

In [None]:
# Fit, then show the model overview and the coefficient table sorted by estimate.
mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
# Cast to float so the sort below is numeric.
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

In [None]:
# Keep only the rows whose label contains "reference" (the category terms) as floats.
table = mdf.summary().tables[1]
table = table.loc[[x for x in table.index if "reference" in x]].astype("float")

In [None]:
# NOTE(review): plot_errorbars is a project helper; (6,4) is presumably the figure size — confirm.
ax = plot_errorbars(table, (6,4))

#### Scale effect on OP

In [None]:
# Ordinal size rank for each model, listed in MODEL_ORDER order.
# zip() stops at the shorter input, so sizes must have one entry per model.
sizes =  [1, 2, 3, 1, 2, 4, 1, 2, 4]
size_map = {k:v  for v, k in zip(sizes, MODEL_ORDER)}

In [None]:
# Attach the size rank to every row; an unknown model name raises KeyError here.
all_df["size"] = all_df.model.apply(lambda x: size_map[x]).astype("int")

In [None]:
families = ["gemma", "llama", "qwen"]

In [None]:
# Family is derived from position: each consecutive block of three MODEL_ORDER
# entries maps to one entry of `families`.
all_df["family"] = all_df.model.apply(lambda x: families[MODEL_ORDER.index(x) // 3])

In [None]:
# Grouping key for the scale models: family name concatenated with dataset name.
all_df["familyDataset"] = all_df["family"] + all_df["dataset"].astype(str)

In [None]:
# Only the two personas needed for the in-expert vs. empty comparison.
expert_df = all_df[all_df["persona"].isin(["in-expert", "empty"])]

In [None]:
over_df = expert_df[expert_df["persona"] == "in-expert"].copy()

In [None]:
# Overwrite score with (in-expert - empty). `.values` drops the index, so the
# subtraction is positional; assumes both selections list the same
# (dataset, model) pairs in the same order — TODO confirm.
over_df.score = expert_df[expert_df["persona"] == "in-expert"].score-  expert_df[expert_df["persona"] == "empty"].score.values

In [None]:
# Mixed model of the score difference on the size rank, grouped by family+dataset.
md = smf.mixedlm("score ~ size", over_df, groups=over_df["familyDataset"])

In [None]:
# Fit, then show the model overview and the coefficient table sorted by estimate.
mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

#### Scale effect on Robustness (color)

In [None]:
# One row per model+dataset key. .copy() makes this an independent frame, so the
# score overwrite below does not act on a slice of all_df (SettingWithCopyWarning).
color_rob = all_df.drop_duplicates("modelDataset").copy()

# Replace the raw persona score with the color-robustness value for that dataset/model.
color_rob.loc[:,"score"] = [robs_df["color"].loc[dataset, model] for dataset, model in zip(color_rob["dataset"], color_rob["model"])]

# Mixed model of robustness on the size rank, grouped by family+dataset.
md = smf.mixedlm("score ~ size", color_rob, groups=color_rob["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Pass the frame the model was fitted on (all_df was passed before), as the
# "Effect of ..." cells do with their own frames.
# NOTE(review): compute_r2 is a project helper — confirm it expects the fitted frame.
marg, cond = compute_r2(mdf, color_rob)

marg, cond, marg/(1- cond + marg)

#### Scale effect on Robustness (name)

In [None]:
# One row per model+dataset key. .copy() makes this an independent frame, so the
# score overwrite below does not act on a slice of all_df (SettingWithCopyWarning).
name_rob = all_df.drop_duplicates("modelDataset").copy()

# Replace the raw persona score with the name-robustness value for that dataset/model.
name_rob.loc[:,"score"] = [robs_df["name"].loc[dataset, model] for dataset, model in zip(name_rob["dataset"], name_rob["model"])]

# Mixed model of robustness on the size rank, grouped by family+dataset.
md = smf.mixedlm("score ~ size", name_rob, groups=name_rob["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Pass the frame the model was fitted on (all_df was passed before), as the
# "Effect of ..." cells do with their own frames.
# NOTE(review): compute_r2 is a project helper — confirm it expects the fitted frame.
marg, cond = compute_r2(mdf, name_rob)

marg, cond, marg/(1- cond + marg)

#### Scale effect on Fidelity (education)

In [None]:
# One row per model+dataset key. .copy() makes this an independent frame, so the
# score overwrite below does not act on a slice of all_df (SettingWithCopyWarning).
education_fid = all_df.drop_duplicates("modelDataset").copy()

# Replace the raw persona score with the education-fidelity value for that dataset/model.
education_fid.loc[:,"score"] = [fid_dfs["Education"].loc[dataset, model] for dataset, model in zip(education_fid["dataset"], education_fid["model"])]

# Mixed model of fidelity on the size rank, grouped by family+dataset.
md = smf.mixedlm("score ~ size", education_fid, groups=education_fid["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Pass the frame the model was fitted on (all_df was passed before), as the
# "Effect of ..." cells do with their own frames.
# NOTE(review): compute_r2 is a project helper — confirm it expects the fitted frame.
marg, cond = compute_r2(mdf, education_fid)

marg, cond, marg/(1- cond + marg)

#### Scale effect on Fidelity (expertise)

In [None]:
# One row per model+dataset key. .copy() makes this an independent frame, so the
# score overwrite below does not act on a slice of all_df (SettingWithCopyWarning).
expert_fid = all_df.drop_duplicates("modelDataset").copy()

# Replace the raw persona score with the expertise-domain fidelity for that dataset/model.
expert_fid.loc[:,"score"] = [fid_dfs["Exp. Domain"].loc[dataset, model] for dataset, model in zip(expert_fid["dataset"], expert_fid["model"])]

# Mixed model of fidelity on the size rank, grouped by family+dataset.
md = smf.mixedlm("score ~ size", expert_fid, groups=expert_fid["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Pass the frame the model was fitted on (all_df was passed before), as the
# "Effect of ..." cells do with their own frames.
# NOTE(review): compute_r2 is a project helper — confirm it expects the fitted frame.
marg, cond = compute_r2(mdf, expert_fid)

marg, cond, marg/(1- cond + marg)

#### Scale effect on Fidelity (expertise level)

In [None]:
# One row per model+dataset key. .copy() makes this an independent frame, so the
# score overwrite below does not act on a slice of all_df (SettingWithCopyWarning).
expert_level_fid = all_df.drop_duplicates("modelDataset").copy()

# Replace the raw persona score with the expertise-specialization fidelity for that dataset/model.
expert_level_fid.loc[:,"score"] = [fid_dfs["Exp. Specialization"].loc[dataset, model] for dataset, model in zip(expert_level_fid["dataset"], expert_level_fid["model"])]

# Mixed model of fidelity on the size rank, grouped by family+dataset.
# NOTE(review): "0+size" drops the fixed intercept, unlike the other scale-effect
# models ("score ~ size") — confirm this is intended.
md = smf.mixedlm("score ~ 0+size", expert_level_fid, groups=expert_level_fid["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Pass the frame the model was fitted on (all_df was passed before), as the
# "Effect of ..." cells do with their own frames.
# NOTE(review): compute_r2 is a project helper — confirm it expects the fitted frame.
marg, cond = compute_r2(mdf, expert_level_fid)

marg, cond, marg/(1- cond + marg)

#### Effect of domain matchiness

In [None]:
# Ordinal code per expert persona (1 = out-expert, 2 = experts, 3 = in-expert).
levels = {"out-expert": 1, "experts": 2, "in-expert": 3}

domain_df = all_df[all_df["persona"].isin(list(levels))].copy()

# Every remaining persona is a key of `levels`, so the mapping is total.
domain_df["persona"] = domain_df["persona"].map(levels)

np.unique(domain_df["persona"])

# Mixed model of score on the ordinal code (no fixed intercept), grouped by model+dataset.
md = smf.mixedlm("score ~ 0 + persona", data=domain_df, groups=domain_df["modelDataset"])

mdf = md.fit()
summary_tables = mdf.summary().tables
display(summary_tables[0])
coefs = summary_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values(by="Coef."))

marg, cond = compute_r2(mdf, domain_df)

marg, cond, marg/(1- cond + marg)

#### Effect of expertise-level

In [None]:
# Persona names "level1" .. "level3".
levels = [f"level{i}" for i in (1, 2, 3)]

level_df = all_df[all_df["persona"].isin(levels)].copy()

# The trailing character of the persona name is its numeric level.
level_df["persona"] = level_df["persona"].str.get(-1).astype(int)

# Mixed model of score on the numeric level (no fixed intercept), grouped by model+dataset.
md = smf.mixedlm("score ~ 0+persona", data=level_df, groups=level_df["modelDataset"])

mdf = md.fit()
summary_tables = mdf.summary().tables
display(summary_tables[0])
coefs = summary_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values(by="Coef."))

marg, cond = compute_r2(mdf, level_df)

marg, cond, marg/(1- cond + marg)

#### Effect of education

In [None]:
# Rows whose persona name contains "educ".
is_education = all_df["persona"].str.contains("educ")
edu_df = all_df[is_education].copy()

# Ordinal code = 1-based position of the persona in EDUCATION_PERSONAS.
edu_df["persona"] = edu_df["persona"].apply(lambda name: EDUCATION_PERSONAS.index(name) + 1).astype(int)

# Mixed model of score on the ordinal code (no fixed intercept), grouped by model+dataset.
md = smf.mixedlm("score ~ 0 + persona", data=edu_df, groups=edu_df["modelDataset"])

mdf = md.fit()
summary_tables = mdf.summary().tables
display(summary_tables[0])
coefs = summary_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values(by="Coef."))

marg, cond = compute_r2(mdf, edu_df)

marg, cond, marg/(1- cond + marg)

### Refine basic

In [None]:
# Point the pickle loads that follow at the "refine_basic" results directory.
prefix = "./results/refine_basic"

In [None]:
# Reload every result object from the directory in `prefix`. Each file is opened in
# a `with` block so its handle is closed after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(f"{prefix}/all_pvalues.pkl", "rb") as fh:
    all_pvalues = pickle.load(fh)
with open(f"{prefix}/all_metrics.pkl", "rb") as fh:
    all_metrics = pickle.load(fh)
with open(f"{prefix}/all_results.pkl", "rb") as fh:
    all_results = pickle.load(fh)
with open(f"{prefix}/fidelity_significances.pkl", "rb") as fh:
    fidelity_significance = pickle.load(fh)
with open(f"{prefix}/fidelity_intervals.pkl", "rb") as fh:
    fidelity_intervals = pickle.load(fh)


In [None]:
# Expertise heatmaps: one panel per expert persona type, datasets as rows, models as columns.
fig, axes = plt.subplots(1, 4, sharex=True, sharey=True, figsize=(20,15), gridspec_kw={})
name_mapping = {"static": "Static\n(e.g., expert in fact-checking)", "level1": "Broad\n(e.g., expert in math)", "level2": "Focused\n(e.g., expert in abstract algebra)", "level3": "Niche\n(e.g., expert in group theory)"}
for idx, expert in enumerate(["static", "level1", "level2", "level3"]):
    ax = axes[idx]
    op_df = pd.DataFrame(columns=MODEL_ORDER)
    pvalues = pd.DataFrame(columns=MODEL_ORDER)
    for dataset, metrics in all_results.items():
        if expert == "static":
            # Static expert: precomputed "OP" metric row and the "in-expert" p-values.
            op_df.loc[dataset] = all_metrics[dataset].loc["OP"]
            pvalues.loc[dataset] = all_pvalues[dataset].loc["in-expert"]
        else:
            # Level personas: score difference to the "empty" persona, computed here.
            op_df.loc[dataset] = metrics.loc[expert] - metrics.loc["empty"]
            pvalues.loc[dataset] = all_pvalues[dataset].loc[expert]
    # Blank the colour of cells with p > .05; the annotations (values x 100) still
    # show every cell.
    op_masked = op_df.mask(pvalues > .05, np.nan)
    # Shorten labels: drop the last "-"-separated token of the model name and the dataset prefix.
    op_masked = op_masked.rename(
        columns=lambda x: x.rsplit("-", maxsplit=1)[0],
        index=lambda x: x.replace("contextual_parametric_", ""),
    )
    sns.heatmap(op_masked, cmap=sns.diverging_palette(260, 30, s=100, center='light', as_cmap=True), vmin=-.1, vmax=.1,center=0, cbar=False,annot=op_df*100, fmt=".1f", square=True, linewidth=1, linecolor="gray", ax=ax)
    # Faint overlay on the cells with p >= 0.05 (cells below 0.05 are masked out of the overlay).
    x= np.arange(op_masked.shape[1]+1)
    y= np.arange(op_masked.shape[0]+1)
    pvalues = pvalues.map(lambda x: x if x <0.05 else 1)
    zm = np.ma.masked_less(pvalues.loc[op_df.index].values, 0.05)
    ax.pcolor(x, y, zm , cmap="Grays_r", alpha=.02)
    # Black separators: vertical every three model columns, horizontal at fixed row offsets.
    ax.vlines([0, 3, 6, 9], *ax.get_ylim(), lw=1, color="black")
    # hlines takes (y, xmin, xmax): span the x-limits (the y-limits were passed before).
    ax.hlines([0, 1, 2, 16,20,27], *ax.get_xlim(), lw=1, color="black")
    ax.set_title(name_mapping[expert],fontproperties=bold_font,fontsize=18)

    # NOTE(review): in a 1x4 grid this targets panels 0 and 1 only; looks like a
    # leftover from a 2x2 layout — confirm.
    if idx//2 == 0: ax.get_xaxis().set_ticks([])
fig.suptitle("Refine", fontproperties=bold_font, y=.93,fontsize=20)
plt.subplots_adjust(wspace=.05, hspace=0)

In [None]:
#ax.get_figure().savefig("../persona_performance_paper/media/expertise_refine_basic_tasks_heatmap.pdf", bbox_inches="tight")

In [None]:
# Robustness heatmaps: one panel per irrelevant-attribute persona group (color, name),
# datasets as rows, models as columns.
fig, axes = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(10.5,15), gridspec_kw={})
category_mapping = {"color": COLOR_PERSONAS, "name": NAMES}
# Unmasked robustness table per group; the scale-effect cells further down read from this dict.
robs_df = {}
for idx, rob in enumerate(["color", "name"]):
    ax = axes[idx]
    rob_df = pd.DataFrame(columns=MODEL_ORDER)
    pvalues = pd.DataFrame(columns=MODEL_ORDER)
    for dataset, metrics in all_results.items():
        rob_df.loc[dataset] = all_metrics[dataset].loc[f"WU_{rob}"]
        dataset_pvalues = all_pvalues[dataset]
        # Second return value is indexed by model and yields a persona label
        # (presumably the worst-scoring persona of the group — confirm in worst_case_utility).
        worst = worst_case_utility(metrics, category_mapping[rob], return_persona=True)[1]
        for model in MODEL_ORDER:
            pvalues.loc[dataset, model] = dataset_pvalues.loc[worst[model], model]
    robs_df[rob] = rob_df
    # Blank the colour of cells with p > .05; the annotations (values x 100) still
    # show every cell.
    rob_masked = rob_df.mask(pvalues > .05, np.nan)
    # Shorten labels: drop the last "-"-separated token of the model name and the dataset prefix.
    rob_masked = rob_masked.rename(
        columns=lambda x: x.rsplit("-", maxsplit=1)[0],
        index=lambda x: x.replace("contextual_parametric_", ""),
    )
    sns.heatmap(rob_masked,cmap=sns.diverging_palette(260, 30, s=100, center='light', as_cmap=True), vmin=-.1, vmax=.1,center=0, cbar=False,annot=rob_df*100, fmt=".1f", square=True, linewidth=1, linecolor="gray", ax=ax)
    # Faint overlay on the cells with p >= 0.05 (cells below 0.05 are masked out of the overlay).
    x= np.arange(rob_masked.shape[1]+1)
    y= np.arange(rob_masked.shape[0]+1)
    pvalues = pvalues.map(lambda x: x if x <0.05 else 1)
    zm = np.ma.masked_less(pvalues.loc[rob_df.index].values, 0.05)
    ax.pcolor(x, y, zm , cmap="Grays_r", alpha=.02)
    # Black separators: vertical every three model columns, horizontal at fixed row offsets.
    ax.vlines([0, 3, 6, 9], *ax.get_ylim(), lw=1, color="black")
    # NOTE(review): the rule at y=5 has no counterpart in the other heatmaps — confirm it is intended.
    ax.hlines([0, 5], *ax.get_xlim(), lw=1, color="black")
    # hlines takes (y, xmin, xmax): span the x-limits (the y-limits were passed before).
    ax.hlines([0, 1, 2, 16,20,27], *ax.get_xlim(), lw=1, color="black")
    ax.set_title(rob.capitalize(),fontproperties=bold_font,fontsize=18)
fig.suptitle("Refine", fontproperties=bold_font, y=.93,fontsize=20)
plt.subplots_adjust(wspace=.05, hspace=0)

In [None]:
#ax.get_figure().savefig("../persona_performance_paper/media/robustness_refine_basic_tasks_heatmap.pdf", bbox_inches="tight")

In [None]:
# Fidelity heatmaps: one panel per fidelity metric, datasets as rows, models as columns.
fig, axes = plt.subplots(1, 3, sharey=True, figsize=(15,15))
# Panel title -> suffix of the matching `Fid_*` row in all_metrics / fidelity_significance.
category_mapping = {"Exp. Domain": "Exp", "Exp. Specialization": "ExpLevel", "Education": "Ed"}
# Unmasked fidelity table per panel; the scale-effect cells further down read from this dict.
fid_dfs = {}
for idx, fid in enumerate(["Exp. Domain", "Exp. Specialization", "Education"]):
    ax = axes[idx]
    metric_row = f"Fid_{category_mapping[fid]}"
    fid_df = pd.DataFrame(columns=MODEL_ORDER)
    pvalues = pd.DataFrame(columns=MODEL_ORDER)
    for dataset in all_results:
        fid_df.loc[dataset] = all_metrics[dataset].loc[metric_row]
        pvalues.loc[dataset] = fidelity_significance[dataset].loc[metric_row]
    fid_dfs[fid] = fid_df
    # Blank the colour of cells flagged False in fidelity_significance
    # (presumably boolean significance flags — confirm); annotations still show every value.
    fid_masked = fid_df.mask(pvalues == False, np.nan)
    # Shorten labels: drop the last "-"-separated token of the model name and the dataset prefix.
    fid_masked = fid_masked.rename(
        columns=lambda x: x.rsplit("-", maxsplit=1)[0],
        index=lambda x: x.replace("contextual_parametric_", ""),
    )
    sns.heatmap(fid_masked,cmap=sns.diverging_palette(260, 30, s=100, center='light', as_cmap=True),center=0, cbar=False,annot=fid_df*100, fmt=".0f", square=True, linewidth=1, linecolor="gray", ax=ax)
    # The overlay arrays previously built here were never drawn and read `rob_masked`
    # from a different cell, so they have been removed.
    # Black separators: vertical every three model columns, horizontal at fixed row offsets.
    ax.vlines([0, 3, 6, 9], *ax.get_ylim(), lw=1, color="black")
    # hlines takes (y, xmin, xmax): span the x-limits (the y-limits were passed before).
    ax.hlines([0, 1, 2, 16,20,27], *ax.get_xlim(), lw=1, color="black")
    ax.set_title(fid.capitalize(), fontproperties=bold_font, fontsize=18)
fig.suptitle("Refine", fontproperties=bold_font, fontsize=20,y=.93)
plt.subplots_adjust(wspace=.05, hspace=0)

In [None]:
#ax.get_figure().savefig("../persona_performance_paper/media/fidelity_refine_basic_tasks_heatmap.pdf", bbox_inches="tight")

In [None]:
# Long-format score table: one row per (persona, model, dataset), restricted to
# the personas analysed below. The per-dataset row index is kept as-is (not reset).
kept_personas = EDUCATION_PERSONAS + COLOR_PERSONAS + NAMES + ["empty", "in-expert", "experts", "out-expert", "level1", "level2", "level3" ]
dataset_frames = []
for k, v in all_results.items():
    df = v.stack().reset_index()
    df.columns = ["persona", "model", "score"]
    # .copy() so the column added below lands on an independent frame, not on a slice.
    df = df[df.persona.isin(kept_personas)].copy()
    df["dataset"] = k
    dataset_frames.append(df)
# Concatenate once instead of growing the frame inside the loop.
all_df = pd.concat(dataset_frames, axis=0) if dataset_frames else pd.DataFrame()

In [None]:
# Grouping key for the mixed models: model name concatenated with the dataset name.
dataset_as_text = all_df["dataset"].astype(str)
all_df["modelDataset"] = all_df["model"] + dataset_as_text

In [None]:
def _persona_category(persona):
    """Pool colour and name personas into one category each; keep any other persona as its own category."""
    if persona in COLOR_PERSONAS:
        return "color"
    if persona in NAMES:
        return "name"
    return persona


cats = [_persona_category(p) for p in all_df.persona.tolist()]

In [None]:
# Attach the pooled persona category computed above (same row order as all_df).
all_df = all_df.assign(category=cats)

In [None]:
# Mixed model: persona category (reference level "empty") as fixed effect,
# random intercept per model-dataset pair.
md = smf.mixedlm(
    formula="score ~ C(category, Treatment(reference='empty'))",
    data=all_df,
    groups=all_df["modelDataset"],
)

In [None]:
# Fit, then show the fit overview followed by the coefficient table ordered by estimate.
mdf = md.fit()
fit_tables = mdf.summary().tables
display(fit_tables[0])
coefs = fit_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

In [None]:
# Fresh copy of the coefficient table from the current fit.
fit_summary = mdf.summary()
table = fit_summary.tables[1]

In [None]:
# Keep only the treatment-contrast rows (their labels contain "reference") and cast them to float.
contrast_rows = [label for label in table.index if "reference" in label]
table = table.loc[contrast_rows].astype("float")

In [None]:
# Error-bar plot of the contrast coefficients.
# NOTE(review): the second positional argument is presumably figsize (other calls pass figsize= by keyword) -- confirm.
ax = plot_errorbars(table, (6,4))

In [None]:
# compute_r2 is a project helper; presumably returns (marginal, conditional) R^2 for the fit -- confirm.
marg, cond = compute_r2(mdf, all_df)

In [None]:
# Display both values plus the ratio marg / (1 - cond + marg).
marg, cond, marg/(1- cond + marg)

In [None]:
# Rows for the two models at positions 5 and -1 of MODEL_ORDER only.
largest_models = [MODEL_ORDER[5], MODEL_ORDER[-1]]
large_only = all_df.loc[all_df["model"].isin(largest_models)]

In [None]:
# Same category model as for all models, restricted to the large-model rows.
md = smf.mixedlm(
    formula="score ~ C(category, Treatment(reference='empty'))",
    data=large_only,
    groups=large_only["modelDataset"],
)

In [None]:
# Fit, then show the fit overview followed by the coefficient table ordered by estimate.
mdf = md.fit()
fit_tables = mdf.summary().tables
display(fit_tables[0])
coefs = fit_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

In [None]:
# Contrast rows (labels containing "reference") of the current fit, as floats.
full_table = mdf.summary().tables[1]
contrast_rows = [label for label in full_table.index if "reference" in label]
table = full_table.loc[contrast_rows].astype("float")

In [None]:
# Error-bar plot of the contrast coefficients for the large-model fit.
# NOTE(review): the second positional argument is presumably figsize -- confirm.
ax = plot_errorbars(table, (6,4))

#### Scale effect on OP

In [None]:
# Ordinal size rank per model, listed in MODEL_ORDER order.
sizes = [1, 2, 3, 1, 2, 4, 1, 2, 4]
size_map = dict(zip(MODEL_ORDER, sizes))

In [None]:
# Size rank of each row's model (KeyError for a model missing from size_map).
size_ranks = all_df["model"].map(lambda model_name: size_map[model_name])
all_df["size"] = size_ranks.astype("int")

In [None]:
# Family labels; the next cell assigns one label to each consecutive block of three MODEL_ORDER entries.
families = ["gemma", "llama", "qwen"]

In [None]:
def _family_of(model_name):
    """Family label for a model: its position in MODEL_ORDER, in blocks of three."""
    return families[MODEL_ORDER.index(model_name) // 3]


all_df["family"] = all_df["model"].apply(_family_of)

In [None]:
# Grouping key for the scale models: family label concatenated with the dataset name.
dataset_as_text = all_df["dataset"].astype(str)
all_df["familyDataset"] = all_df["family"] + dataset_as_text

In [None]:
# Rows for the in-domain expert persona and the no-persona ("empty") baseline.
is_expert_or_empty = all_df["persona"].isin(["in-expert", "empty"])
expert_df = all_df.loc[is_expert_or_empty]

In [None]:
# Independent copy of the in-expert rows; its score column is overwritten in the next cell.
is_in_expert = expert_df["persona"].eq("in-expert")
over_df = expert_df.loc[is_in_expert].copy()

In [None]:
# Score difference in-expert minus empty for the same (model, dataset).
# The baseline is looked up by the modelDataset key rather than by row position,
# so a missing or reordered baseline row cannot silently pair the wrong values.
in_expert_rows = expert_df[expert_df["persona"] == "in-expert"]
empty_by_key = expert_df[expert_df["persona"] == "empty"].set_index("modelDataset")["score"]
baseline_scores = in_expert_rows["modelDataset"].map(empty_by_key).to_numpy(dtype=float)
if np.isnan(baseline_scores).any():
    raise ValueError("some in-expert rows have no matching 'empty' baseline score")
# over_df holds the in-expert rows in the same order, so positional assignment is safe.
over_df["score"] = in_expert_rows["score"].to_numpy(dtype=float) - baseline_scores

In [None]:
# Linear effect of the size rank on the score in over_df, random intercept per family-dataset pair.
md = smf.mixedlm(
    formula="score ~ size",
    data=over_df,
    groups=over_df["familyDataset"],
)

In [None]:
# Fit, then show the fit overview followed by the coefficient table ordered by estimate.
mdf = md.fit()
fit_tables = mdf.summary().tables
display(fit_tables[0])
coefs = fit_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

#### Scale effect on Robustness (color)

In [None]:
# One row per (model, dataset); the score is replaced by the colour-robustness value
# looked up in robs_df["color"] (robs_df must already exist from an earlier cell).
# .copy() so the assignment below writes to an independent frame, not a slice of all_df.
color_rob = all_df.drop_duplicates("modelDataset").copy()

color_rob["score"] = [robs_df["color"].loc[row["dataset"], row["model"]] for _, row  in color_rob.iterrows()]

md = smf.mixedlm("score ~ size", color_rob, groups=color_rob["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Pass the frame the model was fitted on (was all_df, which holds raw persona scores).
marg, cond = compute_r2(mdf, color_rob)

marg, cond, marg/(1- cond + marg)

#### Scale effect on Robustness (name)

In [None]:
# One row per (model, dataset); the score is replaced by the name-robustness value
# looked up in robs_df["name"] (robs_df must already exist from an earlier cell).
# .copy() so the assignment below writes to an independent frame, not a slice of all_df.
name_rob = all_df.drop_duplicates("modelDataset").copy()

name_rob["score"] = [robs_df["name"].loc[row["dataset"], row["model"]] for _, row  in name_rob.iterrows()]

md = smf.mixedlm("score ~ size", name_rob, groups=name_rob["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Pass the frame the model was fitted on (was all_df, which holds raw persona scores).
marg, cond = compute_r2(mdf, name_rob)

marg, cond, marg/(1- cond + marg)

#### Scale effect on Fidelity (education)

In [None]:
# One row per (model, dataset); the score is replaced by the education-fidelity value
# from fid_dfs["Education"] (filled by the fidelity heatmap cell).
# .copy() so the assignment below writes to an independent frame, not a slice of all_df.
education_fid = all_df.drop_duplicates("modelDataset").copy()

education_fid["score"] = [fid_dfs["Education"].loc[row["dataset"], row["model"]] for _, row  in education_fid.iterrows()]

md = smf.mixedlm("score ~ size", education_fid, groups=education_fid["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Pass the frame the model was fitted on (was all_df, which holds raw persona scores).
marg, cond = compute_r2(mdf, education_fid)

marg, cond, marg/(1- cond + marg)

#### Scale effect on Fidelity (expertise)

In [None]:
# One row per (model, dataset); the score is replaced by the domain-fidelity value
# from fid_dfs["Exp. Domain"] (filled by the fidelity heatmap cell).
# .copy() so the assignment below writes to an independent frame, not a slice of all_df.
expert_fid = all_df.drop_duplicates("modelDataset").copy()

expert_fid["score"] = [fid_dfs["Exp. Domain"].loc[row["dataset"], row["model"]] for _, row  in expert_fid.iterrows()]

md = smf.mixedlm("score ~ size", expert_fid, groups=expert_fid["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Pass the frame the model was fitted on (was all_df, which holds raw persona scores).
marg, cond = compute_r2(mdf, expert_fid)

marg, cond, marg/(1- cond + marg)

#### Scale effect on Fidelity (expertise level)

In [None]:
# One row per (model, dataset); the score is replaced by the specialization-fidelity value
# from fid_dfs["Exp. Specialization"] (filled by the fidelity heatmap cell).
# .copy() so the assignment below writes to an independent frame, not a slice of all_df.
expert_level_fid = all_df.drop_duplicates("modelDataset").copy()

expert_level_fid["score"] = [fid_dfs["Exp. Specialization"].loc[row["dataset"], row["model"]] for _, row  in expert_level_fid.iterrows()]

# NOTE(review): this formula drops the intercept ("0+size") while the sibling scale-effect
# cells use "score ~ size" -- confirm this difference is intentional.
md = smf.mixedlm("score ~ 0+size", expert_level_fid, groups=expert_level_fid["familyDataset"])

mdf = md.fit()
display(mdf.summary().tables[0])
coefs = mdf.summary().tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

# Pass the frame the model was fitted on (was all_df, which holds raw persona scores).
marg, cond = compute_r2(mdf, expert_level_fid)

marg, cond, marg/(1- cond + marg)

#### Effect of domain match

In [None]:
# Encode the three expert personas as an ordinal 1..3 and regress the score on it
# (no intercept), with a random intercept per model-dataset pair.
levels = {"out-expert": 1, "experts": 2, "in-expert": 3}

is_domain_persona = all_df["persona"].isin(levels.keys())
domain_df = all_df.loc[is_domain_persona].copy()

domain_df["persona"] = domain_df["persona"].map(lambda name: levels[name])

np.unique(domain_df["persona"])

md = smf.mixedlm(
    "score ~ 0 + persona",
    domain_df,
    groups=domain_df["modelDataset"],
)

mdf = md.fit()
fit_tables = mdf.summary().tables
display(fit_tables[0])
coefs = fit_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

marg, cond = compute_r2(mdf, domain_df)

marg, cond, marg/(1- cond + marg)

#### Effect of expertise-level

In [None]:
# Use the trailing digit of "level1".."level3" as an ordinal predictor (no intercept),
# with a random intercept per model-dataset pair.
levels = ["level1", "level2", "level3"]

is_level_persona = all_df["persona"].isin(levels)
level_df = all_df.loc[is_level_persona].copy()

level_df["persona"] = level_df["persona"].str.get(-1).astype(int)

md = smf.mixedlm(
    "score ~ 0+persona",
    level_df,
    groups=level_df["modelDataset"],
)

mdf = md.fit()
fit_tables = mdf.summary().tables
display(fit_tables[0])
coefs = fit_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

marg, cond = compute_r2(mdf, level_df)

marg, cond, marg/(1- cond + marg)

#### Effect of education

In [None]:
# Personas whose name contains "educ", encoded by their 1-based position in
# EDUCATION_PERSONAS and used as an ordinal predictor (no intercept).
has_educ = all_df["persona"].str.contains("educ")
edu_df = all_df.loc[has_educ].copy()

education_rank = edu_df["persona"].map(lambda name: EDUCATION_PERSONAS.index(name) + 1)
edu_df["persona"] = education_rank.astype(int)

md = smf.mixedlm(
    "score ~ 0 + persona",
    edu_df,
    groups=edu_df["modelDataset"],
)

mdf = md.fit()
fit_tables = mdf.summary().tables
display(fit_tables[0])
coefs = fit_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))

marg, cond = compute_r2(mdf, edu_df)

marg, cond, marg/(1- cond + marg)

In [None]:
# Results directory -> method label written into the "method" column of the mitigation tables.
# NOTE(review): the last key has no leading "./" unlike the others; the loading loops look
# directories up in this dict by the exact same string, so the spellings must stay in sync.
name_map = {
    "./results": "baseline",
    "./results/instruction": "instruction",
    "./results/refine": "refine",
    "results/refine_basic": "refine_basic"
}

In [None]:
# Long-format metric table across methods: one row per (model, metric, method, task).
# Iterates name_map directly (same directories, same order as the former hard-coded list)
# and uses its own loop names so the notebook-level `prefix` and `metrics` are not overwritten.
# NOTE: pickle.load executes arbitrary code from the file; only load trusted result files.
method_frames = []
for results_dir, method_label in name_map.items():
    with open(f"{results_dir}/all_metrics.pkl", "rb") as metrics_file:
        method_metrics = pickle.load(metrics_file)
    for task, df in method_metrics.items():
        # Transpose, then stack: level 0 becomes the model column, level 1 the metric column.
        ms = df.T.stack().reset_index()
        ms["method"] = method_label
        ms["task"] = task
        ms.columns = ["model", "metric", "score", "method", "task"]
        method_frames.append(ms)
# Concatenate once instead of growing the frame inside the loop.
mitigation_metrics_df = pd.concat(method_frames, axis=0) if method_frames else pd.DataFrame()

In [None]:
# Grouping key for the mixed models: model name concatenated with the task name.
task_as_text = mitigation_metrics_df["task"].astype(str)
mitigation_metrics_df["modelTask"] = mitigation_metrics_df["model"] + task_as_text

In [None]:
# Distinct metric names in first-seen order, skipping the first one.
# NOTE(review): which metric gets skipped depends on the row order of the metric tables -- confirm.
distinct_metrics = mitigation_metrics_df["metric"].drop_duplicates().values
metrics = distinct_metrics[1:]
metrics

In [None]:
# Display the long-format metric table for a visual check.
mitigation_metrics_df

In [None]:
# For every metric: fit method-vs-baseline mixed model (random intercept per model-task pair),
# show the fit tables, plot the method contrasts and print the R^2 values.
for m in metrics:
    print(f"Analyzing {m}.")
    data = mitigation_metrics_df[mitigation_metrics_df.metric==m]
    print(data.groupby("method").score.mean())
    md = smf.mixedlm("score ~ C(method, Treatment(reference='baseline'))", data, groups=data["modelTask"])
    mdf = md.fit()
    # Build the summary once and reuse both of its tables.
    summary_tables = mdf.summary().tables
    display(summary_tables[0])
    coefs = summary_tables[1]
    coefs["Coef."] = coefs["Coef."].astype("float")
    display(coefs.sort_values("Coef."))
    # Contrast rows only (their labels contain "reference"), as floats, for plotting.
    table = coefs.loc[[x for x in coefs.index if "reference" in x]].astype("float")
    display(plot_errorbars(table,figsize=(8,4)))
    marg, cond = compute_r2(mdf, data)
    print(marg, cond, marg/(1- cond + marg))
    print("=======================================================")
    print()

In [None]:
# The two models at positions 5 and -1 of MODEL_ORDER.
large_models = [MODEL_ORDER[5], MODEL_ORDER[-1]]

In [None]:
# Metric rows restricted to the large models.
is_large_model = mitigation_metrics_df["model"].isin(large_models)
mitigation_metrics_from_large = mitigation_metrics_df.loc[is_large_model]

In [None]:
# Same per-metric analysis as for all models, restricted to the large-model rows.
for m in metrics:
    print(f"Analyzing {m}.")
    data = mitigation_metrics_from_large[mitigation_metrics_from_large.metric==m]
    print(data.groupby("method").score.mean())
    md = smf.mixedlm("score ~ C(method, Treatment(reference='baseline'))", data, groups=data["modelTask"])
    mdf = md.fit()
    # Build the summary once and reuse both of its tables.
    summary_tables = mdf.summary().tables
    display(summary_tables[0])
    coefs = summary_tables[1]
    coefs["Coef."] = coefs["Coef."].astype("float")
    display(coefs.sort_values("Coef."))
    # Contrast rows only (their labels contain "reference"), as floats, for plotting.
    table = coefs.loc[[x for x in coefs.index if "reference" in x]].astype("float")
    display(plot_errorbars(table,figsize=(8,4)))
    marg, cond = compute_r2(mdf, data)
    print(marg, cond, marg/(1- cond + marg))
    print("=======================================================")
    print()

### Effect on dynamic personas

In [None]:
# For each method and task: score of the level1/level2/level3 persona minus the score
# of the "empty" persona, per model, in long format (one frame per level).
# Loop names are local to this cell so the notebook-level `prefix` and `expert_df`
# are not overwritten; directories come from name_map (same order as the former list).
# NOTE: pickle.load executes arbitrary code from the file; only load trusted result files.
frames_by_level = {"level1": [], "level2": [], "level3": []}
for results_dir, method_label in name_map.items():
    with open(f"{results_dir}/all_results.pkl", "rb") as results_file:
        results = pickle.load(results_file)
    for task, df in results.items():
        empty_scores = df.loc[["empty"]].values
        for level, level_frames in frames_by_level.items():
            advantage = (df.loc[[level]] - empty_scores).T.stack().reset_index()
            advantage["method"] = method_label
            advantage["task"] = task
            advantage.columns = ["model", "persona", "score", "method", "task"]
            level_frames.append(advantage)
# Concatenate once per level instead of growing three frames inside the loop.
expert1_performance, expert2_performance, expert3_performance = (
    pd.concat(level_frames, axis=0) if level_frames else pd.DataFrame()
    for level_frames in frames_by_level.values()
)

In [None]:
# Add the model+task grouping key to each per-level frame (in place).
for level_frame in (expert1_performance, expert2_performance, expert3_performance):
    level_frame["modelTask"] = level_frame["model"] + level_frame["task"].astype(str)

In [None]:
# Method-vs-baseline mixed model on the level1 score differences (random intercept per model-task pair).
data = expert1_performance
method_means = data.groupby("method").score.mean()
print(method_means)
md = smf.mixedlm(
    "score ~ C(method, Treatment(reference='baseline'))",
    data,
    groups=data["modelTask"],
)
mdf = md.fit()
fit_tables = mdf.summary().tables
display(fit_tables[0])
coefs = fit_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))
# Contrast rows only (labels containing "reference"), as floats, for the error-bar plot.
contrast_rows = [label for label in coefs.index if "reference" in label]
table = coefs.loc[contrast_rows].astype("float")
display(plot_errorbars(table,figsize=(8,4)))
marg, cond = compute_r2(mdf, data)
r2_ratio = marg/(1- cond + marg)
print(marg, cond, r2_ratio)
print("=======================================================")
print()

In [None]:
# Method-vs-baseline mixed model on the level2 score differences (random intercept per model-task pair).
data = expert2_performance
method_means = data.groupby("method").score.mean()
print(method_means)
md = smf.mixedlm(
    "score ~ C(method, Treatment(reference='baseline'))",
    data,
    groups=data["modelTask"],
)
mdf = md.fit()
fit_tables = mdf.summary().tables
display(fit_tables[0])
coefs = fit_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))
# Contrast rows only (labels containing "reference"), as floats, for the error-bar plot.
contrast_rows = [label for label in coefs.index if "reference" in label]
table = coefs.loc[contrast_rows].astype("float")
display(plot_errorbars(table,figsize=(8,4)))
marg, cond = compute_r2(mdf, data)
r2_ratio = marg/(1- cond + marg)
print(marg, cond, r2_ratio)
print("=======================================================")
print()

In [None]:
# Method-vs-baseline mixed model on the level3 score differences (random intercept per model-task pair).
data = expert3_performance
method_means = data.groupby("method").score.mean()
print(method_means)
md = smf.mixedlm(
    "score ~ C(method, Treatment(reference='baseline'))",
    data,
    groups=data["modelTask"],
)
mdf = md.fit()
fit_tables = mdf.summary().tables
display(fit_tables[0])
coefs = fit_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))
# Contrast rows only (labels containing "reference"), as floats, for the error-bar plot.
contrast_rows = [label for label in coefs.index if "reference" in label]
table = coefs.loc[contrast_rows].astype("float")
display(plot_errorbars(table,figsize=(8,4)))
marg, cond = compute_r2(mdf, data)
r2_ratio = marg/(1- cond + marg)
print(marg, cond, r2_ratio)
print("=======================================================")
print()

In [None]:
# Per-level frames restricted to the large models.
expert1_performance_large, expert2_performance_large, expert3_performance_large = (
    level_frame[level_frame.model.isin(large_models)]
    for level_frame in (expert1_performance, expert2_performance, expert3_performance)
)

In [None]:
# Method-vs-baseline mixed model on the level1 score differences, large models only.
data = expert1_performance_large
method_means = data.groupby("method").score.mean()
print(method_means)
md = smf.mixedlm(
    "score ~ C(method, Treatment(reference='baseline'))",
    data,
    groups=data["modelTask"],
)
mdf = md.fit()
fit_tables = mdf.summary().tables
display(fit_tables[0])
coefs = fit_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))
# Contrast rows only (labels containing "reference"), as floats, for the error-bar plot.
contrast_rows = [label for label in coefs.index if "reference" in label]
table = coefs.loc[contrast_rows].astype("float")
display(plot_errorbars(table,figsize=(8,4)))
marg, cond = compute_r2(mdf, data)
r2_ratio = marg/(1- cond + marg)
print(marg, cond, r2_ratio)
print("=======================================================")
print()

In [None]:
# Method-vs-baseline mixed model on the level2 score differences, large models only.
data = expert2_performance_large
method_means = data.groupby("method").score.mean()
print(method_means)
md = smf.mixedlm(
    "score ~ C(method, Treatment(reference='baseline'))",
    data,
    groups=data["modelTask"],
)
mdf = md.fit()
fit_tables = mdf.summary().tables
display(fit_tables[0])
coefs = fit_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))
# Contrast rows only (labels containing "reference"), as floats, for the error-bar plot.
contrast_rows = [label for label in coefs.index if "reference" in label]
table = coefs.loc[contrast_rows].astype("float")
display(plot_errorbars(table,figsize=(8,4)))
marg, cond = compute_r2(mdf, data)
r2_ratio = marg/(1- cond + marg)
print(marg, cond, r2_ratio)
print("=======================================================")
print()

In [None]:
# Method-vs-baseline mixed model on the level3 score differences, large models only.
data = expert3_performance_large
method_means = data.groupby("method").score.mean()
print(method_means)
md = smf.mixedlm(
    "score ~ C(method, Treatment(reference='baseline'))",
    data,
    groups=data["modelTask"],
)
mdf = md.fit()
fit_tables = mdf.summary().tables
display(fit_tables[0])
coefs = fit_tables[1]
coefs["Coef."] = coefs["Coef."].astype("float")
display(coefs.sort_values("Coef."))
# Contrast rows only (labels containing "reference"), as floats, for the error-bar plot.
contrast_rows = [label for label in coefs.index if "reference" in label]
table = coefs.loc[contrast_rows].astype("float")
display(plot_errorbars(table,figsize=(8,4)))
marg, cond = compute_r2(mdf, data)
r2_ratio = marg/(1- cond + marg)
print(marg, cond, r2_ratio)
print("=======================================================")
print()

## Combined mitigation effect

In [None]:
# One slice of the metric table per metric name.
# NOTE: these names were bound to different frames in the scale-effect cells above.
metric_column = mitigation_metrics_df["metric"]
over_df = mitigation_metrics_df[metric_column == "OP"]
name_rob = mitigation_metrics_df[metric_column == "WU_name"]
color_rob = mitigation_metrics_df[metric_column == "WU_color"]
expert_fid = mitigation_metrics_df[metric_column == "Fid_Exp"]
education_fid = mitigation_metrics_df[metric_column == "Fid_Ed"]
expert_level_fid = mitigation_metrics_df[metric_column == "Fid_ExpLevel"]

In [None]:
# Frames to model, in the same order as the labels in metric_names (the fitting loop pairs them by position).
mitigation_dfs = [over_df, expert1_performance, expert2_performance, expert3_performance, color_rob, name_rob, expert_fid, education_fid, expert_level_fid]

In [None]:
# Display label per frame in mitigation_dfs (same order). The text before "\n" is later
# split off as the metric category; the full label is used as the x tick label.
metric_names = ["Exp. Advant.\n(static)", "Exp. Advant.\n(broad)", "Exp. Advant.\n(focused)", "Exp. Advant.\n(niche)", "Robustness\n(color)", "Robustness\n(name)", "Fidelity\n(domain match)", "Fidelity\n(education)", "Fidelity\n(expertise level)"]

In [None]:
# Raw method key -> display label. Insertion order matters: it defines the ordered
# categorical used for sorting, and the first entry is the reference level that
# later cells skip with `[1:]`.
mitigation_map = {
    "baseline": "Base prompt",
    "instruction": "Instruction",
    "refine_basic": "Refine",
    "refine": "Refine + Instruction"
}

In [None]:
# Replace raw method keys by display labels, stored as an ordered categorical in
# mitigation_map order, and sort each frame by method.
# Values that are already display labels pass through unchanged, so re-running this
# cell no longer turns every method into NaN.
method_label_dtype = pd.CategoricalDtype(categories=list(mitigation_map.values()), ordered=True)
for idx, df in enumerate(mitigation_dfs):
    df = df.copy()
    display_labels = df["method"].astype(object).map(lambda key: mitigation_map.get(key, key))
    df["method"] = display_labels.astype(method_label_dtype)
    df = df.sort_values("method")
    mitigation_dfs[idx] = df

In [None]:
# Fit one method-vs-"Base prompt" mixed model per frame and collect the contrast rows
# of every non-reference method, together with a parallel list of metric labels.
# NOTE: `all_metrics` rebinds the name that held the metrics dict loaded from all_metrics.pkl;
# cells that read that dict must be run before this one.
contrast_labels = list(mitigation_map.values())[1:]
coef_frames = []
all_metrics = []
for idx, df in enumerate(mitigation_dfs):
    md = smf.mixedlm("score ~  C(method, Treatment(reference='Base prompt'))", df, groups=df["modelTask"])
    mdf = md.fit()
    coefs = mdf.summary().tables[1]
    coefs["Coef."] = coefs["Coef."].astype("float")
    # "C(...)[T.Instruction]" -> "Instruction"; other row labels are left unchanged.
    coefs = coefs.rename(index=lambda x: x.split("T.")[-1].rstrip("]"))
    display(coefs)
    # One metric label per collected row (was a hard-coded 3).
    all_metrics.extend(len(contrast_labels)*[metric_names[idx]])
    coef_frames.append(coefs.loc[contrast_labels])
# Concatenate once instead of growing the frame inside the loop.
all_coefs = pd.concat(coef_frames, axis=0) if coef_frames else pd.DataFrame()

In [None]:
# Cast every column of the collected coefficient rows to float.
table = all_coefs.astype(float)

In [None]:
# Attach the metric label of each row (list built alongside all_coefs, same order).
table = table.assign(metric=all_metrics)

In [None]:
# Ordered categorical so sorting follows the order of metric_names.
metric_dtype = pd.CategoricalDtype(categories=metric_names, ordered=True)
table["metric"] = table["metric"].astype(metric_dtype)

In [None]:
# Category = first line of the metric label, as an ordered categorical over metric_categories.
category_dtype = pd.CategoricalDtype(categories=metric_categories, ordered=True)
first_label_line = table["metric"].map(lambda label: label.split("\n")[0])
table["metric_category"] = first_label_line.astype(category_dtype)

In [None]:
# Order rows by category, then by metric (both ordered categoricals).
sort_keys = ["metric_category", "metric"]
table = table.sort_values(by=sort_keys)

In [None]:
# Move the method labels out of the index into a regular column named "index".
table = table.reset_index(drop=False)

In [None]:
# Name the former index column "Method".
table = table.rename(columns={"index": "Method"})

In [None]:
# Grouped error-bar plot of the method contrasts (all models): one axes row per metric
# category, one x position per metric, one point with 95% interval per method.
table['metric_clean'] = table['metric']

# Group by metric category
categories = table['metric_category'].unique()
n_categories = len(categories)

# Set up the figure
fig, axes = plt.subplots(n_categories, 1, figsize=(6.5, 6), sharey=False)

if n_categories == 1:
    axes = [axes]  # ensure axes is always iterable

methods = list(mitigation_map.values())[1:]
width = 0.2
color_map = {'Instruction': 'tab:blue', 'Refine': 'tab:orange', 'Refine + Instruction': 'tab:green'}

for ax, category in zip(axes, categories):
    subset = table[table['metric_category'] == category]

    # Local name on purpose: the notebook-level `metrics` is read by other cells.
    category_metrics = subset['metric_clean'].unique()

    for i, method in enumerate(methods):
        method_data = subset[subset['Method'] == method]
        # Centre the group of methods on the tick, whatever the number of methods.
        x_shift = (i - (len(methods) - 1) / 2) * width
        for j, metric in enumerate(category_metrics):
            point = method_data[method_data['metric_clean'] == metric]
            if point.empty:
                continue
            coef = point['Coef.'].values[0]
            lower = point['[0.025'].values[0]
            upper = point['0.975]'].values[0]
            # Asymmetric error bars: distance from the estimate to each interval bound.
            error = [[coef - lower], [upper - coef]]

            ax.errorbar(
                j + x_shift,
                coef,
                yerr=error,
                fmt='o',
                capsize=4,
                label=method if j == 0 else "",  # only add label once per method
                color=color_map.get(method, 'gray')
            )

    ax.set_xticks(range(len(category_metrics)))
    # Centred tick labels, consistent with the other coefficient plots (was ha='right').
    ax.set_xticklabels(category_metrics, rotation=0, ha='center')
    ax.set_ylabel("")
    ax.axhline(0, color='black', linewidth=0.8, linestyle='--')
    if category== "Exp. Advant.": 
        ax.legend()
        sns.move_legend(ax, "upper left", bbox_to_anchor=(-.05, 1.4),ncols=3)

fig.supylabel("Prompt effect (all models)")
plt.subplots_adjust(wspace=0., hspace=.45, left=.2)
plt.show()


In [None]:
#fig.savefig("../persona_performance_paper/media/mitigation_coefs.pdf", bbox_inches="tight")

In [None]:
# Fit one no-intercept model per frame, so each method gets its own coefficient
# (including "Base prompt"), and collect all method rows with a parallel list of metric labels.
# NOTE: `all_metrics` rebinds the name that held the metrics dict loaded from all_metrics.pkl;
# cells that read that dict must be run before this one.
method_labels = list(mitigation_map.values())
coef_frames = []
all_metrics = []
for idx, df in enumerate(mitigation_dfs):
    md = smf.mixedlm("score ~  0 + C(method)", df, groups=df["modelTask"])
    mdf = md.fit()
    coefs = mdf.summary().tables[1]
    coefs["Coef."] = coefs["Coef."].astype("float")
    # "C(method)[Instruction]" -> "Instruction"; labels without "[" are left unchanged.
    coefs = coefs.rename(index=lambda x: x.split("[")[-1].rstrip("]"))
    display(coefs)
    # One metric label per collected row (was a hard-coded 4).
    all_metrics.extend(len(method_labels)*[metric_names[idx]])
    coef_frames.append(coefs.loc[method_labels])
# Concatenate once instead of growing the frame inside the loop.
all_coefs = pd.concat(coef_frames, axis=0) if coef_frames else pd.DataFrame()

In [None]:
# Cast every column of the collected coefficient rows to float.
table = all_coefs.astype(float)

In [None]:
# Attach the metric label of each row (list built alongside all_coefs, same order).
table = table.assign(metric=all_metrics)

In [None]:
# Ordered categorical so sorting follows the order of metric_names.
metric_dtype = pd.CategoricalDtype(categories=metric_names, ordered=True)
table["metric"] = table["metric"].astype(metric_dtype)

In [None]:
# Category = first line of the metric label, as an ordered categorical over metric_categories.
category_dtype = pd.CategoricalDtype(categories=metric_categories, ordered=True)
first_label_line = table["metric"].map(lambda label: label.split("\n")[0])
table["metric_category"] = first_label_line.astype(category_dtype)

In [None]:
# Order rows by category, then by metric (both ordered categoricals).
sort_keys = ["metric_category", "metric"]
table = table.sort_values(by=sort_keys)

In [None]:
# Move the method labels out of the index into a regular column named "index".
table = table.reset_index(drop=False)

In [None]:
# Name the former index column "Method".
table = table.rename(columns={"index": "Method"})

In [None]:
# Grouped error-bar plot of the per-method coefficients (all models, base prompt included):
# one axes row per metric category, one x position per metric, one marker per method.
table['metric_clean'] = table['metric']

# Group by metric category
categories = table['metric_category'].unique()
n_categories = len(categories)

# Set up the figure
fig, axes = plt.subplots(n_categories, 1, figsize=(6.5, 3.5), sharey=False)

if n_categories == 1:
    axes = [axes]  # ensure axes is always iterable

methods = list(mitigation_map.values())
width = 0.2
color_map = {"Base prompt": "black",'Instruction': 'tab:blue', 'Refine': 'tab:orange', 'Refine + Instruction': 'tab:green'}
marker_map = {
        "Base prompt": "o",      # circle
        "Instruction": "s",      # square
        "Refine": "D",           # diamond
        "Refine + Instruction": "^"  # triangle_up
    }

for ax, category in zip(axes, categories):
    subset = table[table['metric_category'] == category]

    # Local name on purpose: the notebook-level `metrics` is read by other cells.
    category_metrics = subset['metric_clean'].unique()

    for i, method in enumerate(methods):
        method_data = subset[subset['Method'] == method]
        # Centre the group on the tick; the former "i * width - width" offset only
        # centred three methods and pushed this four-method group to the right.
        x_shift = (i - (len(methods) - 1) / 2) * width
        for j, metric in enumerate(category_metrics):
            point = method_data[method_data['metric_clean'] == metric]
            if point.empty:
                continue
            coef = point['Coef.'].values[0]
            lower = point['[0.025'].values[0]
            upper = point['0.975]'].values[0]
            # Asymmetric error bars: distance from the estimate to each interval bound.
            error = [[coef - lower], [upper - coef]]

            ax.errorbar(
                j + x_shift,
                coef,
                yerr=error,
                fmt=marker_map.get(method, 'o'),
                capsize=6,
                label=method if j == 0 else "",  # only add label once per method
                color=color_map.get(method, 'gray')
            )

    ax.set_xticks(range(len(category_metrics)))
    ax.set_xticklabels(category_metrics, rotation=0, ha='center')
    ax.set_ylabel("")
    ax.axhline(0, color='black', linewidth=0.8, linestyle='--')
    # if category== "Exp. Advant.": 
    #     ax.legend()
    #     sns.move_legend(ax, "upper left", bbox_to_anchor=(-.05, 1.65),ncols=2)

fig.supylabel("Prompt effect (all models)")
plt.subplots_adjust(wspace=0., hspace=.95, left=.15)
plt.show()


In [None]:
# Write the current figure to the paper's media folder (path is relative to the working directory).
nopersona_pdf_path = "../persona_performance_paper/media/mitigation_coefs_nopersona.pdf"
fig.savefig(nopersona_pdf_path, bbox_inches="tight")

In [None]:
# One slice of the large-model metric table per metric name (rebinds the all-model slices).
large_metric_column = mitigation_metrics_from_large["metric"]
over_df = mitigation_metrics_from_large[large_metric_column == "OP"]
name_rob = mitigation_metrics_from_large[large_metric_column == "WU_name"]
color_rob = mitigation_metrics_from_large[large_metric_column == "WU_color"]
expert_fid = mitigation_metrics_from_large[large_metric_column == "Fid_Exp"]
education_fid = mitigation_metrics_from_large[large_metric_column == "Fid_Ed"]
expert_level_fid = mitigation_metrics_from_large[large_metric_column == "Fid_ExpLevel"]

In [None]:
# Large-model frames to model, in the same order as the metric labels (paired by position in the fitting loop).
mitigation_dfs = [over_df, expert1_performance_large, expert2_performance_large, expert3_performance_large, color_rob, name_rob, expert_fid, education_fid, expert_level_fid]

In [None]:
# Metric labels for the large-model fits. Derived from metric_names instead of a
# hand-copied duplicate: the labels are later cast to a categorical over metric_names,
# so any divergence between the two lists would silently produce NaN.
metrics = list(metric_names)

In [None]:
# Replace raw method keys by display labels, stored as an ordered categorical in
# mitigation_map order, and sort each frame by method.
# Values that are already display labels pass through unchanged, so re-running this
# cell no longer turns every method into NaN.
method_label_dtype = pd.CategoricalDtype(categories=list(mitigation_map.values()), ordered=True)
for idx, df in enumerate(mitigation_dfs):
    df = df.copy()
    display_labels = df["method"].astype(object).map(lambda key: mitigation_map.get(key, key))
    df["method"] = display_labels.astype(method_label_dtype)
    df = df.sort_values("method")
    mitigation_dfs[idx] = df

In [None]:
# Fit one method-vs-"Base prompt" mixed model per large-model frame and collect the
# contrast rows of every non-reference method, with a parallel list of metric labels.
# Labels come from metric_names: the notebook-level `metrics` is overwritten by the
# plotting cells, so indexing it here depended on cell execution order.
# NOTE: `all_metrics` rebinds the name that held the metrics dict loaded from all_metrics.pkl.
contrast_labels = list(mitigation_map.values())[1:]
coef_frames = []
all_metrics = []
for idx, df in enumerate(mitigation_dfs):
    md = smf.mixedlm("score ~  C(method, Treatment(reference='Base prompt'))", df, groups=df["modelTask"])

    mdf = md.fit()
    coefs = mdf.summary().tables[1]
    coefs["Coef."] = coefs["Coef."].astype("float")
    # "C(...)[T.Instruction]" -> "Instruction"; other row labels are left unchanged.
    coefs = coefs.rename(index=lambda x: x.split("T.")[-1].rstrip("]"))
    display(coefs)
    # One metric label per collected row (was a hard-coded 3).
    all_metrics.extend(len(contrast_labels)*[metric_names[idx]])
    coef_frames.append(coefs.loc[contrast_labels])
# Concatenate once instead of growing the frame inside the loop.
all_coefs = pd.concat(coef_frames, axis=0) if coef_frames else pd.DataFrame()

In [None]:
# Cast every column of the collected coefficient rows to float.
table = all_coefs.astype(float)

In [None]:
# Attach the metric label of each row (list built alongside all_coefs, same order).
table = table.assign(metric=all_metrics)

In [None]:
# Ordered categorical so sorting follows the order of metric_names.
metric_dtype = pd.CategoricalDtype(categories=metric_names, ordered=True)
table["metric"] = table["metric"].astype(metric_dtype)

In [None]:
# Category = first line of the metric label, as an ordered categorical over metric_categories.
category_dtype = pd.CategoricalDtype(categories=metric_categories, ordered=True)
first_label_line = table["metric"].map(lambda label: label.split("\n")[0])
table["metric_category"] = first_label_line.astype(category_dtype)

In [None]:
# Order rows by category, then by metric (both ordered categoricals).
sort_keys = ["metric_category", "metric"]
table = table.sort_values(by=sort_keys)

In [None]:
# Move the method labels out of the index into a regular column named "index".
table = table.reset_index(drop=False)

In [None]:
# Name the former index column "Method".
table = table.rename(columns={"index": "Method"})

In [None]:
# Grouped error-bar plot of the method contrasts (large models): one axes row per metric
# category, one x position per metric, one point with 95% interval per method.
table['metric_clean'] = table['metric']

# Group by metric category
categories = table['metric_category'].unique()
n_categories = len(categories)

# Set up the figure
fig, axes = plt.subplots(n_categories, 1, figsize=(6.5, 6), sharey=False)

if n_categories == 1:
    axes = [axes]  # ensure axes is always iterable

methods = list(mitigation_map.values())[1:]
width = 0.2
color_map = {'Instruction': 'tab:blue', 'Refine': 'tab:orange', 'Refine + Instruction': 'tab:green'}

for ax, category in zip(axes, categories):
    subset = table[table['metric_category'] == category]

    # Local name on purpose: the notebook-level `metrics` holds the metric labels that
    # other cells index by position, and must not be overwritten here.
    category_metrics = subset['metric_clean'].unique()

    for i, method in enumerate(methods):
        method_data = subset[subset['Method'] == method]
        # Centre the group of methods on the tick, whatever the number of methods.
        x_shift = (i - (len(methods) - 1) / 2) * width
        for j, metric in enumerate(category_metrics):
            point = method_data[method_data['metric_clean'] == metric]
            if point.empty:
                continue
            coef = point['Coef.'].values[0]
            lower = point['[0.025'].values[0]
            upper = point['0.975]'].values[0]
            # Asymmetric error bars: distance from the estimate to each interval bound.
            error = [[coef - lower], [upper - coef]]

            ax.errorbar(
                j + x_shift,
                coef,
                yerr=error,
                fmt='o',
                capsize=4,
                label=method if j == 0 else "",  # only add label once per method
                color=color_map.get(method, 'gray')
            )

    ax.set_xticks(range(len(category_metrics)))
    ax.set_xticklabels(category_metrics, rotation=0, ha='center')
    ax.set_ylabel("")
    ax.axhline(0, color='black', linewidth=0.8, linestyle='--')
    if category== "Exp. Advant.": 
        ax.legend()
        sns.move_legend(ax, "upper left", bbox_to_anchor=(-.05, 1.4),ncols=3)

fig.supylabel("Mitigation effect (large models)")
plt.subplots_adjust(wspace=0., hspace=.45, left=.15)
plt.show()


In [None]:
#fig.savefig("../persona_performance_paper/media/mitigation_coefs_large.pdf", bbox_inches="tight")

In [None]:
# Display label for each entry of `mitigation_dfs`, in the same order
# (the model-fitting cell looks labels up by position).
# Format: "<category>\n(<variant>)".
metrics = [
    # Expertise advantage
    "Exp. Advant.\n(static)",
    "Exp. Advant.\n(broad)",
    "Exp. Advant.\n(focused)",
    "Exp. Advant.\n(niche)",
    # Robustness
    "Robustness\n(color)",
    "Robustness\n(name)",
    # Fidelity
    "Fidelity\n(domain match)",
    "Fidelity\n(education)",
    "Fidelity\n(expertise level)",
]

In [None]:
# Standard error of the mean of `score` per method, for the last mitigation
# data frame. Kept as the final expression so the notebook displays it.
(
    mitigation_dfs[-1]
    .groupby("method")["score"]
    .sem()
)

In [None]:
# Show the `scale` attribute of `mdf`, the fitted mixed-model result.
# NOTE(review): within this part of the notebook `mdf` is only assigned by the
# model-fitting loop in a later cell, so this cell appears to depend on that
# cell (or an earlier fit not visible here) having been run -- confirm the
# intended execution order.
mdf.scale

In [None]:
# Show the diagonal of the matrix returned by `mdf.cov_params()`
# (presumably the per-parameter variances of the fitted model -- confirm).
# NOTE(review): like the cell above, this relies on `mdf` left over from a
# model fit executed elsewhere in the notebook.
np.diag(mdf.cov_params())

In [None]:
# Fit one mixed-effects model per mitigation data frame and gather the
# per-method coefficient rows of every fit into `all_coefs`, with a parallel
# list of metric labels (`all_metrics`, one label per collected row).
#
# NOTE(review): `all_metrics` rebinds the name that the loading cell at the top
# of the notebook assigned from all_metrics.pkl. Cells that need the pickled
# object have to be re-run from the loading cell once this cell has executed.
method_order = list(mitigation_map.values())
coef_frames = []  # one coefficient frame per metric, concatenated once below
all_metrics = []
for idx, df in enumerate(mitigation_dfs):
    # "0 +" removes the intercept so every method level gets its own
    # coefficient; observations are grouped by `modelTask`.
    md = smf.mixedlm("score ~ 0+ C(method)", df, groups=df["modelTask"])

    # `mdf` is left bound to the last fit after the loop; the inspection cells
    # (`mdf.scale`, `mdf.cov_params()`) read it.
    mdf = md.fit()
    coefs = mdf.summary().tables[1]
    coefs["Coef."] = coefs["Coef."].astype("float")
    # Keep only the text inside the last "[...]" of each row label; labels
    # without brackets are left unchanged.
    coefs = coefs.rename(index=lambda x: x.split("[")[-1].rstrip("]"))
    display(coefs)

    method_rows = coefs.loc[method_order]
    # Repeat the metric label once per selected row (instead of a hard-coded
    # 4) so labels and rows stay aligned whatever the number of methods.
    all_metrics.extend(len(method_rows) * [metrics[idx]])
    coef_frames.append(method_rows)

# Single concatenation instead of growing a DataFrame inside the loop;
# fall back to an empty frame when there was nothing to fit.
all_coefs = pd.concat(coef_frames, axis=0) if coef_frames else pd.DataFrame()

In [None]:
# Working copy of the collected coefficients with every column cast to float.
table = all_coefs.astype(float)

In [None]:
# Attach the metric label collected for each coefficient row.
table = table.assign(metric=all_metrics)

In [None]:
# Turn the metric labels into an ordered categorical following `metric_names`,
# so that sorting uses that order rather than alphabetical order.
metric_dtype = pd.CategoricalDtype(categories=metric_names, ordered=True)
table["metric"] = table["metric"].astype(metric_dtype)

In [None]:
# The category is the first line of the metric label (the text before "\n"),
# stored as an ordered categorical following `metric_categories`.
category_labels = table["metric"].map(lambda label: label.split("\n")[0])
category_dtype = pd.CategoricalDtype(categories=metric_categories, ordered=True)
table["metric_category"] = category_labels.astype(category_dtype)

In [None]:
# Order rows by category first, then by metric within each category.
table = table.sort_values(by=["metric_category", "metric"])

In [None]:
# Move the row index into a regular column (the next cell renames it).
table = table.reset_index(drop=False)

In [None]:
# The column created from the former index is named "index"; call it "Method".
table = table.rename(columns={"index": "Method"})

In [None]:
# Coefficient plot: one stacked panel per metric category, one marker with a
# confidence-interval error bar per (metric, method) pair.

# Clean up the metric names
table['metric_clean'] = table['metric']

# Group by metric category
categories = table['metric_category'].unique()
n_categories = len(categories)

# Set up the figure (independent y-axis per panel)
fig, axes = plt.subplots(n_categories, 1, figsize=(6.5, 3.5), sharey=False)

if n_categories == 1:
    axes = [axes]  # ensure axes is always iterable

# Settings shared by every panel, built once instead of once per category.
methods = list(mitigation_map.values())
width = 0.2
# Horizontal offsets that centre each group of methods on its x tick for any
# number of methods (three methods give -width, 0, +width; four give
# -1.5, -0.5, +0.5, +1.5 times width).
method_offsets = [(i - (len(methods) - 1) / 2) * width for i in range(len(methods))]

# For color/legend
color_map = {"Base prompt": "black",'Instruction': 'tab:blue', 'Refine': 'tab:orange', 'Refine + Instruction': 'tab:green'}
marker_map = {
    "Base prompt": "o",      # circle
    "Instruction": "s",      # square
    "Refine": "D",           # diamond
    "Refine + Instruction": "^"  # triangle_up
}

for ax, category in zip(axes, categories):
    subset = table[table['metric_category'] == category]

    # Panel-local name: rebinding the notebook-level `metrics` list here would
    # break the model-fitting cell, which looks labels up as `metrics[idx]`.
    panel_metrics = subset['metric_clean'].unique()

    for offset, method in zip(method_offsets, methods):
        method_data = subset[subset['Method'] == method]
        for j, metric in enumerate(panel_metrics):
            point = method_data[method_data['metric_clean'] == metric]
            if point.empty:
                continue

            coef = point['Coef.'].values[0]
            lower = point['[0.025'].values[0]
            upper = point['0.975]'].values[0]
            # Asymmetric error bar: distances from the estimate to each bound.
            error = [[coef - lower], [upper - coef]]

            ax.errorbar(
                j + offset,  # metric position, shifted per method
                coef,
                yerr=error,
                fmt=marker_map.get(method, 'o'),
                capsize=6,
                label=method if j == 0 else "",  # only add label once per method
                color=color_map.get(method, 'gray')
            )

    #ax.set_title(f"{category}")
    ax.set_xticks(range(len(panel_metrics)))
    ax.set_xticklabels(panel_metrics, rotation=0, ha='center')
    ax.set_ylabel("")
    ax.axhline(0, color='black', linewidth=0.8, linestyle='--')
    # Legend is currently disabled for this figure; to re-enable:
    # if category== "Exp. Advant.": 
    #     ax.legend()
    #     sns.move_legend(ax, "upper left", bbox_to_anchor=(-.05, 1.65),ncols=2)

fig.supylabel("Prompt effect (large only)")
plt.subplots_adjust(wspace=0., hspace=.95, left=.15)
plt.show()


In [None]:
# Export the most recently built figure as a PDF, cropped tightly to its
# contents. The target directory is relative to the notebook's working
# directory.
fig.savefig(
    "../persona_performance_paper/media/mitigation_coefs_large_nopersona.pdf",
    bbox_inches="tight",
)