In [None]:
import pickle
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, HTML
from scipy.stats import hmean
from scipy.stats import kendalltau, pearsonr

## Fine-grained results image

In [None]:
def get_results(task):
    """Load one task's fine-grained results and derive the heatmap metadata.

    Reads ``./data/{task}/results_fine.csv`` and ``./data/{task}/func_dic.pkl``
    relative to the current working directory.

    Parameters
    ----------
    task : str
        Sub-directory of ``./data`` to read from. The value ``"qqp"``
        additionally swaps two functionality columns (see the inline comment).

    Returns
    -------
    results : pandas.DataFrame
        The CSV indexed by ``(method, config, score)``, with method names
        mapped to their display form and a leading ``"avg"`` column holding
        the row-wise mean of the remaining columns.
    y_labels : list of str
        One label per row whose ``score`` level is ``"standard"`` or
        ``"seen"``: the method name, or ``"<config>: Vanilla"`` for Vanilla rows.
    type_colors : list of str
        Matplotlib colour codes: ``"k"`` for the average column followed by
        one colour per ``func_dic`` entry, chosen by its test type.
    capabilities_idx : numpy.ndarray
        Positions at which the capability differs from the previous column
        (the ``"avg"`` column counts as its own capability).
    n_funcs : int
        Number of numeric columns in ``results``, including ``"avg"``.
    """
    # Ordered (substring, display name) pairs. The order matters because the
    # replacement is substring based: "dropout" has to be rewritten before
    # "dro", otherwise it would turn into "G-DROpout".
    method_names = (
        ("default", "Vanilla"),
        ("l2", "L2"),
        ("dropout", "Dropout"),
        ("freeze", "LP"),
        ("lp-ft", "LP-FT"),
        ("irm", "IRM"),
        ("dro", "G-DRO"),
        ("fish", "Fish"),
    )

    results = pd.read_csv(f"./data/{task}/results_fine.csv")
    for raw_name, display_name in method_names:
        # regex=False: plain substring replacement regardless of the pandas
        # version's default for the `regex` argument.
        results["method"] = results["method"].str.replace(
            raw_name, display_name, regex=False
        )
    results = results.set_index(["method", "config", "score"])

    # Row-wise mean over all functionality columns, moved to the front.
    results["avg"] = results.mean(axis=1)
    cols = results.columns.tolist()
    cols = [cols[-1]] + cols[:-1]
    results = results[cols]

    n_funcs = len(results.select_dtypes(include=np.number).columns.tolist())

    # NOTE(review): pickle.load can execute arbitrary code stored in the file;
    # only point this at trusted data.
    with open(f"./data/{task}/func_dic.pkl", "rb") as handle:
        func_dic = pickle.load(handle)

    # Each func_dic value is read as (capability, test type, ...).
    # NOTE(review): assumes func_dic iterates in the same order as the CSV's
    # functionality columns - TODO confirm.
    capabilities = np.array([v[0] for v in func_dic.values()])
    test_types = np.array([v[1] for v in func_dic.values()])

    if task == "qqp":  # Reorder to group the vocabulary functionalities
        test_types[7], test_types[10] = test_types[10], test_types[7]
        capabilities[7], capabilities[10] = capabilities[10], capabilities[7]
        # Offset by one with respect to the arrays above: cols[0] is "avg".
        cols[8], cols[11] = cols[11], cols[8]
        results = results[cols]

    capabilities = np.concatenate([["avg"], capabilities])

    # Index of every column whose capability differs from its left neighbour.
    capabilities_idx = np.where(capabilities[:-1] != capabilities[1:])[0] + 1

    type_to_color = {"mft": "b", "inv": "r", "dir": "g"}
    type_colors = ["k"] + [type_to_color[t] for t in test_types]

    labelled_rows = results[results.index.isin(['standard', "seen"], level=2)]
    y_labels = [
        m if m != "Vanilla" else f"{c}: Vanilla"
        for m, c, s in list(labelled_rows.index)
    ]

    return results, y_labels, type_colors, capabilities_idx, n_funcs

In [None]:
def create_plot(results, score, ax):
    """Draw the heatmap of one score type onto ``ax``.

    Shows the rows of ``results`` whose third index level is ``'standard'``
    or ``score``, on a fixed 0-100 colour scale and without a colour bar.

    Besides its arguments the function reads the notebook-level names
    ``n_funcs``, ``y_labels``, ``capabilities_idx`` and ``type_colors``; they
    must have been set (by unpacking ``get_results``) for the same task
    before this is called.

    Parameters
    ----------
    results : pandas.DataFrame
        Frame with a three-level row index, as returned by ``get_results``.
    score : str
        Score type to show next to the ``'standard'`` rows. Row labels are
        only drawn when this is ``"seen"``.
    ax : matplotlib.axes.Axes
        Axes to draw into.
    """
    shown_rows = results[results.index.isin(['standard', score], level=2)]
    column_labels = ["Avg"] + list(range(1, n_funcs))
    if score == "seen":
        row_labels = y_labels
    else:
        row_labels = []

    sns.heatmap(
        shown_rows,
        ax=ax,
        linewidths=1,
        xticklabels=column_labels,
        yticklabels=row_labels,
        vmin=0,
        vmax=100,
        cbar=False,
    )

    # Minor ticks carry the separator grid: vertical lines where the
    # capability changes, horizontal lines at fixed row positions.
    # NOTE(review): 1, 9 and 17 presumably mark the boundaries between
    # configuration groups - confirm against the row layout.
    ax.set_xticks(capabilities_idx, minor=True)
    ax.set_yticks([1, 9, 17], minor=True)
    ax.grid(True, which='minor', linewidth=.5, color="blue")

    ax.set_ylabel("")

    # Colour each x tick label by the test type of its column.
    for label, colour in zip(ax.get_xticklabels(), type_colors):
        label.set_color(colour)

In [None]:
# Global matplotlib text sizes: 18 pt by default, 12 pt for x tick labels.
plt.rcParams.update({'font.size': 18, 'xtick.labelsize': 12})

In [None]:
# Grid of one row per task: four equally wide heatmap columns (one per score
# type) plus a narrow fifth column that holds the colour bar.
fig, axs = plt.subplots(figsize=(32,24), ncols=5, nrows=3, gridspec_kw=dict(width_ratios=[1, 1, 1, 1, 0.05], wspace=0.1, hspace=0.1))
for row, task in enumerate(["sa", "qqp", "squad"]):
    # These five names are rebound at notebook level on purpose: create_plot
    # reads y_labels, type_colors, capabilities_idx and n_funcs as globals.
    results, y_labels, type_colors, capabilities_idx, n_funcs = get_results(task)
    for col, score in enumerate(["seen", "funcOut", "classOut", "aspectOut"]):
        create_plot(results, score, axs[row, col])
    # The colour bar is always built from the top-left heatmap; create_plot
    # draws every heatmap with vmin=0 and vmax=100, so the scale is shared.
    fig.colorbar(axs[0,0].collections[0], cax=axs[row, 4])
# Stretch the subplot grid over the whole figure area.
# NOTE(review): wspace/hspace are set to 0 here but to 0.1 in gridspec_kw
# above - confirm which spacing is intended.
fig.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0, hspace=0)

## Aggregated results table

In [None]:
# Task identifiers; each one names a sub-directory of ./data.
tasks = ["sa", "qqp", "squad"]
# One tab-separated results table per task, in the order of `tasks`; rows
# containing missing values are dropped.
# NOTE: this rebinds `results`, which the figure cell above used for a single
# DataFrame - from here on it is a list of DataFrames.
results = [pd.read_csv(f"./data/{task}/results.csv", sep="\t").dropna() for task in tasks]

In [None]:
# Replace each of the four split scores by its element-wise harmonic mean
# with the "iid score" column of the same table.
# NOTE: the columns are overwritten in place, so running this cell a second
# time applies the harmonic mean again.
for result in results:
    iid = result["iid score"]
    for split in ("Seen", "Func", "Class", "Aspect"):
        result[split] = hmean([result[split], iid])

In [None]:
# Start the summary table from the Method/Config columns of the first task.
# NOTE(review): presumably every task lists the same (Method, Config) rows
# under the same index labels - confirm, because the score columns added
# later are matched to these rows by index label.
table = pd.DataFrame()
table[["Method", "Config"]] = results[0][["Method", "Config"]]

In [None]:
# Two separate passes keep the column layout: first the i.i.d. score of
# every task, then the four split scores of every task. The "Avg" cell below
# selects columns by position and relies on this layout.
for task, result in zip(tasks, results):
    table[f"iid score {task}"] = result["iid score"]
for task, result in zip(tasks, results):
    table[[f"Seen {task}", f"Func {task}", f"Class {task}", f"Aspect {task}"]] = result[["Seen", "Func", "Class", "Aspect"]]

In [None]:
# Average over the four split scores of every task. Selecting the columns by
# name (instead of "everything from position 5 on") leaves out Method, Config
# and the i.i.d. scores explicitly, and keeps "Avg" itself out of the mean if
# the cell is executed again before the index is set.
generalisation_columns = [
    f"{split} {task}"
    for task in tasks
    for split in ("Seen", "Func", "Class", "Aspect")
]
table["Avg"] = table[generalisation_columns].mean(axis=1)

In [None]:
# Move Config and Method from columns into a two-level row index.
# NOTE: not re-runnable as is - a second execution no longer finds the
# "Config" and "Method" columns.
table = table.set_index(["Config", "Method"])

In [None]:
# P-value table (presumably one column per dataset-level score); the first
# two CSV columns become a two-level row index.
dataset_pvalues = pd.read_csv("./data/dataset_pvalues.csv", index_col=[0,1])

In [None]:
# Second p-value table (presumably for the suite scores and their average),
# read with the same two-level row index.
suite_pvalues = pd.read_csv("./data/pvalues_suite_avg.csv", index_col=[0,1])

In [None]:
# Put both p-value tables side by side; rows are matched on the shared index.
pvalues = pd.concat([dataset_pvalues, suite_pvalues], axis=1)

In [None]:
# Boolean mask: True where the p-value is below the 0.05 threshold.
significant = pvalues < .05

In [None]:
# Append an all-False row keyed "iid". The row length is taken from the frame
# itself rather than being hard-coded.
# NOTE(review): with a two-level row index the exact label pandas stores for
# this new row is not obvious; it does not matter here because the index is
# overwritten with the table's index further down.
significant.loc["iid"] = significant.shape[1] * [False]

In [None]:
# Snapshot of the current row labels; the "iid" row added above is the last one.
idxs = significant.index.to_list()

In [None]:
# Rotate the last label (the "iid" row) to the front, keeping the rest in order.
order = [idxs[-1]] + idxs[:-1]

In [None]:
# Reorder the rows accordingly, so the "iid" row comes first.
significant = significant.reindex(order)

In [None]:
# Adopt the table's row labels positionally; no label alignment takes place.
# NOTE(review): assumes `significant` already has the same number and order
# of rows as `table` - confirm.
significant.index = table.index

In [None]:
# Same for the columns: relabelled by position, not matched by name.
# NOTE(review): assumes the p-value columns are in the table's column order - confirm.
significant.columns = table.columns

In [None]:
# Show the relabelled significance mask for a visual check.
significant

In [None]:
# Mark the maximum of every column. The `textbf:--rwrap;` property is the
# Styler-to-LaTeX spelling of wrapping the value in \textbf{...}.
l = table.style.highlight_max(axis=0,
                           props='textbf:--rwrap;')

In [None]:
# Display floating-point values with two decimals.
l = l.format(precision=2)

In [None]:
def color_values(df, significant):
    """Build the per-cell LaTeX colour styles for a Styler.

    Every cell is compared with the value in the first row of its column.
    Cells flagged in ``significant`` get the PineGreen style when they are
    greater than or equal to that value and the red style when they are
    smaller; all other cells get an empty style.

    Parameters
    ----------
    df : pandas.DataFrame
        Values to style; its first row is the reference row.
    significant : pandas.DataFrame
        Boolean frame with the same index and columns as ``df``.

    Returns
    -------
    pandas.DataFrame
        Style strings, shaped and labelled like ``df``.
    """
    reference = df.iloc[0]
    styles = pd.DataFrame('', index=df.index, columns=df.columns)
    styles = styles.mask(df.ge(reference), "textcolor: {PineGreen} --rwrap;")
    styles = styles.mask(df.lt(reference), "textcolor: {red} --rwrap;")
    # Keep a colour only where the difference is flagged as significant.
    return styles.where(significant, '')

In [None]:
# Bind the significance mask so that the Styler can call the function with
# the data frame as its only argument. (`partial` is imported in the
# notebook's import cell.)
color = partial(color_values, significant=significant)

In [None]:
# axis=None hands the whole DataFrame to `color` in one call, which it needs
# in order to compare every row with the first one.
l = l.apply(color, axis=None)

In [None]:
# Print the LaTeX source of the styled table so it can be copied out;
# multirow_align="t" top-aligns the index cells that span several rows.
# NOTE(review): the colour styles use \textcolor with PineGreen, so the target
# document presumably needs xcolor with the dvipsnames option - confirm.
print(l.to_latex(multirow_align="t"))