In [5]:
%load_ext autoreload
%autoreload 2

In [19]:
import numpy as np
import pickle
import pandas as pd

import statsmodels.stats.api as sms

In [20]:
# Load the pickled evaluation records into `evals`.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# open snapshot files that come from a trusted source.
with open("mldb_2021-07-16.pickle", 'rb') as f:
    evals = pickle.load(f)

In [21]:
# Wrap the loaded evaluation records in a DataFrame so they can be filtered
# and extended column-by-column.
df = pd.DataFrame(evals)

In [22]:
# Ignore checkpoints for now: keep only the rows whose epoch is -1.
# Take an explicit copy so that the column assignments made on `df` afterwards
# operate on an independent frame rather than on a slice of the unfiltered one
# (which would emit pandas' SettingWithCopyWarning).
df = df[df.epoch == -1].copy()

In [23]:
# Add accuracy metric and ci.
# Each `stats` entry stores `num_correct_and_size` as (num_correct, size); the
# interval is the 95% "beta" (Clopper-Pearson) interval from statsmodels.
num_correct_vals = df.stats.apply(lambda s: s['num_correct_and_size'][0]).values
size_vals = df.stats.apply(lambda s: s['num_correct_and_size'][1]).values

df['accuracy'] = df.stats.apply(
    lambda s: s['num_correct_and_size'][0] / s['num_correct_and_size'][1]
)

cis = sms.proportion_confint(
    num_correct_vals,
    size_vals,
    alpha=0.05,
    method='beta',
)
# `cis` is (lower_bounds, upper_bounds); store one (lower, upper) pair per row.
df['accuracy_ci'] = list(zip(cis[0], cis[1]))

In [38]:
# Add macro_f1 metric and ci
def confint(acc, n, alpha=0.05, method="beta"):
    """Return the binomial proportion confidence interval for an accuracy.

    Args:
        acc: Observed accuracy (fraction of successes).
        n: Number of trials the accuracy was measured on.
        alpha: Significance level forwarded to ``sms.proportion_confint``.
        method: Interval method forwarded to ``sms.proportion_confint``.

    Returns:
        Whatever ``sms.proportion_confint`` returns for ``acc * n`` successes
        out of ``n`` trials.
    """
    num_successes = acc * n
    return sms.proportion_confint(num_successes, n, alpha=alpha, method=method)

def worst_region_acc_ci(ev):
    """Compute Clopper-Pearson CI for the worst-region subgroup.

    Args:
        ev: Evaluation stats mapping. Its optional ``wilds_metrics`` entry is
            read for ``acc_worst_region`` and, per region, for the
            ``acc_region:<name>`` and ``count_region:<name>`` values.

    Returns:
        The ``(lower, upper)`` result of ``sms.proportion_confint`` with
        ``alpha=0.05`` and ``method="beta"``, or ``(0., 0.)`` when ``ev`` has
        no ``wilds_metrics`` / ``acc_worst_region`` entry.

    Raises:
        AssertionError: If no region's accuracy is close to the worst-region
            accuracy.
    """
    if 'wilds_metrics' not in ev or 'acc_worst_region' not in ev['wilds_metrics']:
        return (0., 0.)
    metrics = ev['wilds_metrics']
    regions = ["Asia", "Europe", "Africa", "Americas", "Oceania", "Other"]
    worst_acc = metrics["acc_worst_region"]

    # Find the number of points in the worst region: the first region whose
    # accuracy matches the reported worst-region accuracy.
    worst_region_size = None
    for region in regions:
        if np.isclose(worst_acc, metrics[f"acc_region:{region}"]):
            worst_region_size = metrics[f"count_region:{region}"]
            break
    assert worst_region_size is not None

    # Note: This confidence interval isn't exactly correct because we took
    # a max over the worst-region first...
    # Round (rather than truncate) when recovering the success count:
    # size * accuracy can land just below the true integer because of
    # floating-point error (e.g. 100 * 0.29 == 28.999999999999996).
    num_correct = int(round(worst_region_size * worst_acc))
    return sms.proportion_confint(
        num_correct, worst_region_size, alpha=0.05, method="beta"
    )

# Macro-F1 and worst-region accuracy are read from the optional
# `wilds_metrics` dict; rows without the entry get None.
df['macro_f1'] = df.stats.apply(
    lambda s: s.get('wilds_metrics', {}).get('F1-macro_all', None)
)
# TEMPORARY: take the CI stored under `iwc_f1_approx_ci_95` (presumably a
# precomputed approximate 95% interval), defaulting to (0., 0.) when absent.
df['macro_f1_ci'] = df.stats.apply(
    lambda s: s.get('iwc_f1_approx_ci_95', (0., 0.))
)

df['worst_region_accuracy'] = df.stats.apply(
    lambda s: s.get('wilds_metrics', {}).get('acc_worst_region', None)
)
df['worst_region_accuracy_ci'] = df.stats.apply(worst_region_acc_ci)



In [39]:
# Pairs of (id-train set name, id-test set name); "id" presumably stands for
# in-distribution. A training set may be listed with several id-test sets;
# each pair is processed on its own.
ID_PAIRS = [
    ("cifar10-train", "cifar10-test"),
    ("cifar10-train", "cifar10-STL10classes"),
    ("FMoW-train", "FMoW-id_test"),
    ("FMoW-train", "FMoW-id_val"),
    ("Camelyon17-train", "Camelyon17-id_val"),
    ("Camelyon17-train", "Camelyon17-id_test"),
    ("IWildCamOfficialV2-train", "IWildCamOfficialV2-id_val"),
    ("IWildCamOfficialV2-train", "IWildCamOfficialV2-id_test"),
]

In [40]:
def reformat(_df, train, test):
    """Pair the evaluation on `test` with every other evaluation in `_df`.

    Args:
        _df: Evaluation rows (the caller passes the rows of a single model).
        train: Name of the training set; rows evaluated on it are excluded
            from the shifted rows.
        test: Name of the reference test set.

    Returns:
        An empty DataFrame when `_df` has no row for `test`. Otherwise one row
        per evaluation whose test set is neither `train` nor `test`, carrying
        the model columns (with `rule_params` renamed to `hyperparameters`),
        `train_set`, `test_set`, `shift_set`, and for each metric the
        `test_*` value/CI of the first `test` row next to the row's own
        `shift_*` value/CI.
    """
    on_test = _df[_df.test_set == test]
    if not len(on_test):
        return pd.DataFrame()
    # Only the first evaluation on the reference test set is used.
    reference = on_test.iloc[0]

    is_shift = ~_df.test_set.isin([train, test])
    shift_evals = _df[is_shift]

    newdf = shift_evals[["model_family", "model_id", "epoch", "rule_params"]].rename(
        columns={"rule_params": "hyperparameters"}
    )
    newdf["train_set"] = train
    newdf["test_set"] = test
    newdf["shift_set"] = shift_evals["test_set"]

    n_rows = len(newdf)
    for metric in ("accuracy", "macro_f1", "worst_region_accuracy"):
        ci_column = f"{metric}_ci"
        newdf[f"test_{metric}"] = reference[metric]
        # The CI is a tuple, so repeat it explicitly: one tuple per row.
        newdf[f"test_{metric}_ci"] = [reference[ci_column]] * n_rows
        newdf[f"shift_{metric}"] = shift_evals[metric]
        newdf[f"shift_{metric}_ci"] = shift_evals[ci_column]
    return newdf

In [41]:
# For every (train, test) pair, reshape each model's evaluations with
# `reformat` and collect the resulting per-model frames in a list.
new_df = []
for train, test in ID_PAIRS:
    # All evaluations of models trained on this training set.
    df_train = df[df.train_set == train]
    new_df.extend(
        reformat(modeldf, train, test)
        for _, modeldf in df_train.groupby("model_id")
    )

In [42]:
# Stack the per-model frames into a single table.
# NOTE(review): this rebinds `new_df` from a list of frames to a DataFrame, so
# re-running this cell on its own hands pd.concat a DataFrame instead of a
# list; re-run the list-building cell first.
new_df = pd.concat(new_df)

In [43]:
def rename_model_family(model_family):
    """Map a model-family name to its short display label.

    Names containing "RandFeatures" become "RandomFeatures", names containing
    "K_nearest_neighbors" become "KNN" (checked in that order); any other
    name is returned unchanged.
    """
    short_labels = (
        ("RandFeatures", "RandomFeatures"),
        ("K_nearest_neighbors", "KNN"),
    )
    for marker, label in short_labels:
        if marker in model_family:
            return label
    return model_family

# Replace each model-family name with the label returned by rename_model_family.
new_df["model_family"] = new_df.model_family.apply(rename_model_family)

In [44]:
def rename_test_set(ts):
    """Rename "cifar10-STL10classes" to "cifar10-test-STL10classes".

    Every other test-set name is returned unchanged.
    """
    return "cifar10-test-STL10classes" if ts == "cifar10-STL10classes" else ts
# Apply the test-set rename to the `test_set` column only (the `shift_set`
# column is left as is).
new_df["test_set"] = new_df.test_set.apply(rename_test_set)

In [45]:
# Write the final table to results.csv without the row index.
new_df.to_csv("results.csv", index=False)