# Analyse Results

In [None]:
from average_results import main
from src.data import get_dataset_options

import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import tabulate

from matplotlib.colors import LinearSegmentedColormap
from scipy.stats import kendalltau


# Diverging colormap running from green through a pale neutral to red.
# Edit this gradient at https://eltos.github.io/gradient/#00B20F-82FB39-EAF0E0-FF3540-AC1E25
my_gradient = LinearSegmentedColormap.from_list(
    name='my_gradient',
    colors=(
        # (position on the 0..1 scale, (red, green, blue))
        (0.000, (0.000, 0.698, 0.059)),  # 00B20F
        (0.250, (0.510, 0.984, 0.224)),  # 82FB39
        (0.500, (0.918, 0.941, 0.878)),  # EAF0E0
        (0.750, (1.000, 0.208, 0.251)),  # FF3540
        (1.000, (0.675, 0.118, 0.145)),  # AC1E25
    ),
)

In [None]:
# Make sure the folders written to by the analysis exist. ``exist_ok=True``
# replaces the separate existence check, so there is no window between
# checking for the folder and creating it.
for output_dir in ('analysis_images', 'selected_performances'):
    os.makedirs(output_dir, exist_ok=True)

First, we define several helper functions to simplify the analysis.

In [None]:
def load_results(experiment_paths, as_file=False, root_dir='.'):
    """Group experiment directories by label and pass each group to ``main``.

    The label of an experiment is its name truncated one character before the
    first ``v<digits>`` version tag, so runs that differ only in what follows
    share a label and are handed to ``main`` together.

    Args:
        experiment_paths: Whitespace-separated experiment directory names.
            Every name must contain a ``v<digits>`` tag and must exist as an
            entry of ``root_dir``.
        as_file: If true, ``main`` is called with a save path of the form
            ``./runs/<dataset>/<architecture>/averages/`` and nothing is
            returned. Otherwise the value returned by ``main`` is collected.
        root_dir: Directory whose entries are matched against the names.

    Returns:
        A dict mapping each label to the value returned by ``main`` when
        ``as_file`` is false, otherwise ``None``.

    Raises:
        ValueError: If a name contains no ``v<digits>`` version tag.
        FileNotFoundError: If a name is not an entry of ``root_dir``.
    """
    available_paths = set(os.listdir(root_dir))
    experiment_paths_dict = {}
    save_path_details = {}
    for experiment_path in experiment_paths.split():
        # re.search returns the first ``v<digits>`` occurrence in the name.
        version_match = re.search(r'v[0-9]+', experiment_path)
        if version_match is None:
            raise ValueError(
                "Experiment path {!r} does not contain a version tag such as 'v1'".format(experiment_path))
        # Drop the version tag together with the separator character before it.
        label = experiment_path[:version_match.start() - 1]

        # Directory names are matched exactly, so there is at most one candidate.
        if experiment_path not in available_paths:
            raise FileNotFoundError(
                "Experiment path {!r} was not found in {!r}".format(experiment_path, root_dir))
        full_path = os.path.join(root_dir, experiment_path)

        if label not in experiment_paths_dict:
            experiment_paths_dict[label] = [full_path]
            # NOTE(review): the pieces are split from the joined path, so the
            # 'dataset' entry still carries the ``root_dir`` prefix - confirm
            # this is what the save path below is meant to contain.
            path_pieces = full_path.split('-')
            save_path_details[label] = {'dataset': path_pieces[0],
                                        'architecture': path_pieces[1]}
        else:
            experiment_paths_dict[label].append(full_path)

    if as_file:
        for label, paths in experiment_paths_dict.items():
            details = save_path_details[label]
            save_path = './runs/' + details['dataset'] + '/' + details['architecture'] + '/' + 'averages/'
            main(paths, label, save_path)
        return None

    return {label: main(paths, label) for label, paths in experiment_paths_dict.items()}

In [None]:
def load_results_multiple_datasets(experiment_paths_groups, as_file=False, root_dir='.'):
    """Load several groups of experiments and normalise their 'top' labels.

    Each element of ``experiment_paths_groups`` is a whitespace-separated
    string of experiment names that is forwarded to ``load_results``. The
    group is stored under the text before the first '-' of its first name, so
    a later group with the same prefix replaces an earlier one.

    Labels containing 'top' are rewritten so that the variant becomes a single
    token ('top2-direct' -> 'top2direct', 'top2' -> 'top2opt', and likewise
    for 'top3'). A label that contains 'top' but none of these patterns is
    left out of the result. All other labels are kept unchanged.

    Returns:
        dict mapping dataset name -> {label: result}.
    """
    # Checked in order: the '-direct' variants come first because they also
    # contain the plain 'top2' / 'top3' substrings.
    top_renames = (
        ('top2-direct', 'top2direct'),
        ('top3-direct', 'top3direct'),
        ('top2', 'top2opt'),
        ('top3', 'top3opt'),
    )

    loaded = {}
    for group in experiment_paths_groups:
        group_name = group.split()[0].split('-')[0]
        loaded[group_name] = load_results(group, as_file=as_file, root_dir=root_dir)

    formatted = {}
    for group_name, group_results in loaded.items():
        renamed = {}
        for label in group_results:
            if 'top' not in label:
                renamed[label] = group_results[label]
                continue
            rename = next((pair for pair in top_renames if pair[0] in label), None)
            if rename is not None:
                renamed[label.replace(rename[0], rename[1])] = group_results[label]
            # Unrecognised 'top' labels are intentionally not copied over.
        formatted[group_name] = renamed

    return formatted

In [None]:
def print_table_multiple_datasets(group_results_dict, metric="error", show_std=True, num_dp=2, perc=True):
    """Print a LaTeX table comparing methods on the ID and OOD splits of each dataset.

    For every known dataset present in ``group_results_dict`` two columns are
    produced: the ``"test"`` split ("<dataset>: ID") and the ``"ood_test"``
    split ("<dataset>: OOD"). Rows are methods, ordered by a fixed preferred
    order; the row set is taken from the first present dataset.

    Args:
        group_results_dict: ``{dataset: {experiment_label: results}}`` where
            ``results[metric][split]`` is either a float or an indexable
            ``(mean, std)`` pair. Each experiment label must contain one of
            the known method keys (e.g. ``'vanilla'``).
        metric: Name of the metric to tabulate.
        show_std: If true, append the standard deviation to every cell.
        num_dp: Decimal places when ``show_std`` is true (2 -> two places,
            anything else -> one place). Without ``show_std`` two places are
            always used.
        perc: Accepted for backward compatibility only; the value is
            overridden so that "error" and "ece" are shown as percentages
            and every other metric is shown unscaled.

    Returns:
        None. The table is printed to stdout.

    Raises:
        ValueError: If no known dataset is present, or an experiment label
            matches none of the known method keys.
    """
    ordered_datasets = ["svhn", "cifar10", "cifar100", "tinyimagenet", "newsgroup", "sst", "rotated_cifar100", "wiki_face", "regression_energy", "regression_boston", "regression_wine", "regression_yacht", "regression_concrete", "classification_wine", "classification_toxicity", "classification_abalone", "classification_students", "classification_adult"]

    label_mapping = {
        'vanilla': 'No Noise',
        'target_smoothing': 'Label Smoothing',
        'input_ods': 'Input ODS',
        'input_augmix': 'Input AugMix',
        'input_target_mixup': 'Input-Target MixUp',
        'input_target_cmixup': 'Input-Target CMixUp',
        'activation_dropout': 'Activation Dropout',
        'gradient_gaussian': 'Gradient Gaussian',
        'model_sp': 'Model',
        'top2direct': 'Top-2 Direct',
        'top3direct': 'Top-3 Direct',
        'top2opt': 'Top-2 Optimised',
        'top3opt': 'Top-3 Optimised',
        'input_additive_gaussian': 'Input Gaussian',
        'activation_additive_gaussian': 'Activation Gaussian',
        'input_random_crop_horizontal_flip': 'Input Weak Aug.',
        'weight_additive_gaussian': 'Weight Gaussian',
        'weight_dropconnect': 'Weight DropConnect',
    }

    ordered_labels = ['No Noise', 'Input Weak Aug.', 'Input Gaussian', 'Input ODS', 'Input AugMix', 'Input-Target MixUp', 'Input-Target CMixUp', 'Label Smoothing', 'Activation Gaussian', 'Activation Dropout', 'Gradient Gaussian', 'Model', 'Weight Gaussian', 'Weight DropConnect', 'Top-2 Direct', 'Top-3 Direct', 'Top-2 Optimised', 'Top-3 Optimised']

    # The percentage scaling is decided by the metric, not by the argument.
    perc = metric == "error" or metric == "ece"

    def format_cell(datapoint, nan_as_na):
        """Render one (mean, std) datapoint as a LaTeX table cell."""
        if isinstance(datapoint, float):
            # This stands for no standard deviation, since the result was not averaged
            datapoint = (datapoint, 0.0)
        if perc:
            datapoint = (datapoint[0] * 100, datapoint[1] * 100)
        mean = datapoint[0]
        # NaN is the only value that differs from itself.
        if nan_as_na and mean != mean:
            return "N/A"
        if show_std and num_dp != 2:
            number_format, full_width = "{:.1f}", 4
        else:
            number_format, full_width = "{:.2f}", 5
        mean_text = number_format.format(mean)
        # Pad short numbers with an invisible digit so the column stays aligned.
        cell = (r"\phantom{0}" if len(mean_text) < full_width else "") + mean_text
        if show_std:
            cell += r"\tiny{$\pm$" + number_format.format(datapoint[1]) + "}"
        return cell

    present_datasets = []
    id_tables = {}
    ood_tables = {}

    for dataset in ordered_datasets:
        if dataset not in group_results_dict:
            continue
        results_dict = group_results_dict[dataset]
        # The experiment labels are expected to start with the dataset name.
        assert dataset == list(results_dict.keys())[0].split('-')[0]

        print()

        # Translate experiment labels into display names. When several method
        # keys occur in a label, the one listed last in label_mapping wins.
        relabeled_results = {}
        for key in results_dict:
            matching_keys = [label_key for label_key in label_mapping if label_key in key]
            if not matching_keys:
                raise ValueError(
                    "Experiment label {!r} does not match any known method".format(key))
            relabeled_results[label_mapping[matching_keys[-1]]] = results_dict[key]

        # maintain the preferred order of labels
        labels = [label for label in ordered_labels if label in relabeled_results]

        # Only the first dataset contributes the leading column of method names.
        is_first_dataset = not present_datasets
        id_tables[dataset] = [
            ([label] if is_first_dataset else [])
            + [format_cell(relabeled_results[label][metric]["test"], nan_as_na=False)]
            for label in labels
        ]
        ood_tables[dataset] = [
            [format_cell(relabeled_results[label][metric]["ood_test"], nan_as_na=True)]
            for label in labels
        ]
        present_datasets.append(dataset)

    if not present_datasets:
        raise ValueError("group_results_dict contains none of the known datasets")

    # Merge the tables: one row per method, ID and OOD cells of every dataset side by side.
    table = []
    for method_idx in range(len(id_tables[present_datasets[0]])):
        row = []
        for dataset in present_datasets:
            row += id_tables[dataset][method_idx] + ood_tables[dataset][method_idx]
        table.append(row)

    print('Metric: {}'.format(metric))
    headers = ["Method"]
    for dataset in present_datasets:
        headers += [dataset + ': ID', dataset + ': OOD']

    # Print the table
    print(tabulate.tabulate(table, headers=headers, tablefmt="latex_raw", floatfmt=".1f"))

In [None]:
def print_rank_table_multiple_datasets(group_results_dict, metric="error", no_print=False, raw_ranks=False):
    """Rank the methods per dataset and split, optionally relative to the first method.

    For every known dataset present in ``group_results_dict`` the metric on
    the ``"test"`` split ("<dataset>: ID") and on the ``"ood_test"`` split
    ("<dataset>: OOD") is ranked across methods in ascending order, so the
    smallest value receives rank 1. A NaN on the OOD split is replaced by
    9999.9 before ranking; ID values are ranked as they are.

    Args:
        group_results_dict: ``{dataset: {experiment_label: results}}`` where
            ``results[metric][split]`` is either a float or an indexable
            whose first element is the value to rank. Each experiment label
            must contain one of the known method keys (e.g. ``'vanilla'``).
        metric: Name of the metric to rank.
        no_print: Unless this is exactly ``False`` nothing is printed.
        raw_ranks: If true, return the rank table itself.

    Returns:
        With ``raw_ranks`` the DataFrame of ranks (column "Method" plus one
        column per dataset and split). Otherwise a DataFrame in which the
        first row (the first method in the preferred order, 'No Noise' when
        it is present) has been subtracted from every other row and removed,
        with two extra columns "ID" and "OOD" holding the row-wise mean of
        the corresponding columns rounded to one decimal place.

    Raises:
        ValueError: If no known dataset is present, or an experiment label
            matches none of the known method keys.
    """
    ordered_datasets = ["svhn", "cifar10", "cifar100", "tinyimagenet", "newsgroup", "sst", "rotated_cifar100", "wiki_face", "regression_energy", "regression_boston", "regression_wine", "regression_yacht", "regression_concrete", "classification_wine", "classification_toxicity", "classification_abalone", "classification_students", "classification_adult"]

    label_mapping = {
        'vanilla': 'No Noise',
        'target_smoothing': 'Label Smoothing',
        'input_ods': 'Input ODS',
        'input_augmix': 'Input AugMix',
        'input_target_mixup': 'Input-Target MixUp',
        'input_target_cmixup': 'Input-Target CMixUp',
        'activation_dropout': 'Activation Dropout',
        'gradient_gaussian': 'Gradient Gaussian',
        'model_sp': 'Model',
        'top2direct': 'Top-2 Direct',
        'top3direct': 'Top-3 Direct',
        'top2opt': 'Top-2 Optimised',
        'top3opt': 'Top-3 Optimised',
        'input_additive_gaussian': 'Input Gaussian',
        'activation_additive_gaussian': 'Activation Gaussian',
        'input_random_crop_horizontal_flip': 'Input Weak Aug.',
        'weight_additive_gaussian': 'Weight Gaussian',
        'weight_dropconnect': 'Weight DropConnect',
    }

    ordered_labels = ['No Noise', 'Input Weak Aug.', 'Input Gaussian', 'Input ODS', 'Input AugMix', 'Input-Target MixUp', 'Input-Target CMixUp', 'Label Smoothing', 'Activation Gaussian', 'Activation Dropout', 'Gradient Gaussian', 'Model', 'Weight Gaussian', 'Weight DropConnect', 'Top-2 Direct', 'Top-3 Direct', 'Top-2 Optimised', 'Top-3 Optimised']

    def metric_value(datapoint, nan_placeholder=None):
        """Return the value to rank, optionally swapping NaN for a placeholder."""
        if isinstance(datapoint, float):
            # This stands for no standard deviation, since the result was not averaged
            datapoint = (datapoint, 0.0)
        value = datapoint[0]
        # NaN is the only value that differs from itself.
        if nan_placeholder is not None and value != value:
            return nan_placeholder
        return value

    present_datasets = []
    id_tables = {}
    ood_tables = {}

    for dataset in ordered_datasets:
        if dataset not in group_results_dict:
            continue
        results_dict = group_results_dict[dataset]
        # The experiment labels are expected to start with the dataset name.
        assert dataset == list(results_dict.keys())[0].split('-')[0]

        # Translate experiment labels into display names. When several method
        # keys occur in a label, the one listed last in label_mapping wins.
        relabeled_results = {}
        for key in results_dict:
            matching_keys = [label_key for label_key in label_mapping if label_key in key]
            if not matching_keys:
                raise ValueError(
                    "Experiment label {!r} does not match any known method".format(key))
            relabeled_results[label_mapping[matching_keys[-1]]] = results_dict[key]

        # maintain the preferred order of labels
        labels = [label for label in ordered_labels if label in relabeled_results]

        # Only the first dataset contributes the leading column of method names.
        is_first_dataset = not present_datasets
        id_tables[dataset] = [
            ([label] if is_first_dataset else [])
            + [metric_value(relabeled_results[label][metric]["test"])]
            for label in labels
        ]
        ood_tables[dataset] = [
            [metric_value(relabeled_results[label][metric]["ood_test"], nan_placeholder=9999.9)]
            for label in labels
        ]
        present_datasets.append(dataset)

    if not present_datasets:
        raise ValueError("group_results_dict contains none of the known datasets")

    # Merge the tables: one row per method, ID and OOD values of every dataset side by side.
    table = []
    for method_idx in range(len(id_tables[present_datasets[0]])):
        row = []
        for dataset in present_datasets:
            row += id_tables[dataset][method_idx] + ood_tables[dataset][method_idx]
        table.append(row)

    headers = ["Method"]
    for dataset in present_datasets:
        headers += [dataset + ': ID', dataset + ': OOD']

    results_pd = pd.DataFrame(table, columns=headers)
    # calculate the rankings for each column
    for column in results_pd.columns[1:]:
        results_pd[column] = results_pd[column].astype(float).rank(ascending=True)

    if no_print is False:
        # Render without the index, strip the leading padding of each row and
        # turn every run of two or more spaces into a single tab.
        rendered_rows = []
        for row in results_pd.to_string(index=False).split('\n'):
            rendered_rows.append('\t'.join(word for word in row.strip().split('  ') if word != ''))
        print()
        print('Metric: {}'.format(metric))
        print('\n'.join(rendered_rows))

    if raw_ranks:
        return results_pd

    # Express every rank relative to the first row, keeping the Method column aside.
    methods = results_pd.iloc[:, 0]
    rank_values = results_pd.iloc[:, 1:]
    relative = rank_values - rank_values.iloc[0]
    relative.insert(0, 'Method', methods)

    # Drop the reference row; copy so the new columns are added to an
    # independent frame rather than to a slice of another one.
    relative = relative.iloc[1:].copy()

    # average relative rank over the ID columns, to one decimal place
    id_columns = [col for col in relative.columns if 'ID' in col]
    relative['ID'] = relative[id_columns].mean(axis=1)
    relative['ID'] = relative['ID'].apply(lambda x: round(x, 1))

    # average relative rank over the OOD columns, to one decimal place
    ood_columns = [col for col in relative.columns if 'OOD' in col]
    relative['OOD'] = relative[ood_columns].mean(axis=1)
    relative['OOD'] = relative['OOD'].apply(lambda x: round(x, 1))

    return relative

# Analysis of Results for TMLR Submission

In [None]:
# Every result summary except the transfer experiments.
result_summary_names = [
    entry for entry in os.listdir("result_summaries") if 'transfer' not in entry
]

# The dataset name is the text before the first '-' of a summary name.
dataset_names = {entry.split('-')[0] for entry in result_summary_names}

# Maps a dataset (or "<dataset>-<architecture>" for the two text datasets)
# to the list of its result summary names.
dataset_experiments_dict = {}
for dataset_name in dataset_names:
    if dataset_name in ("newsgroup", "sst"):
        # These datasets are split by architecture: names containing
        # "<dataset>-global_pooling_cnn" go to the CNN list, the remaining
        # names containing "<dataset>-transformer" go to the transformer list.
        cnn_tag = dataset_name + "-global_pooling_cnn"
        transformer_tag = dataset_name + "-transformer"
        experiments_list_cnn = [
            entry for entry in result_summary_names if cnn_tag in entry
        ]
        experiments_list_transformer = [
            entry for entry in result_summary_names
            if cnn_tag not in entry and transformer_tag in entry
        ]
        dataset_experiments_dict[cnn_tag] = experiments_list_cnn
        dataset_experiments_dict[transformer_tag] = experiments_list_transformer
    else:
        dataset_experiments_dict[dataset_name] = [
            entry for entry in result_summary_names
            if entry.split('-')[0] == dataset_name
        ]

In [None]:
# One newline-separated string of experiment names per image classification dataset.
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[name])
    for name in ("svhn", "cifar10", "cifar100", "tinyimagenet")
]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")
error_table = print_rank_table_multiple_datasets(results_dict_groups, metric="error", no_print=True)
ece_table = print_rank_table_multiple_datasets(results_dict_groups, metric="ece", no_print=True)
nll_table = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True)

# Method names label the heatmap columns; the rows are the three metrics.
titles = error_table['Method'].tolist()
metric_names = ['Error', 'ECE', 'NLL']
id_rows = [error_table['ID'], ece_table['ID'], nll_table['ID']]

fig, ax = plt.subplots(figsize=(8, 12))
im = ax.imshow(id_rows, cmap=my_gradient, vmin=-10.2, vmax=10.2)

# Show every tick and label it with the matching method / metric name.
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(3))
ax.set_xticklabels(titles)
ax.set_yticklabels(metric_names)

# Rotate the method names so that they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
            rotation_mode="anchor")

# Write the value of every cell on top of its colour.
cell_values = [row.tolist() for row in id_rows]
for i, row_values in enumerate(cell_values):
    for j in range(len(titles)):
        text = ax.text(j, i, row_values[j],
                       ha="center", va="center", color="black")

# A small fraction keeps the colorbar about as tall as the heatmap.
cbar = ax.figure.colorbar(im, ax=ax, fraction=0.009, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/id_classification_cv_top.pdf', bbox_inches='tight')
plt.show()
# The rank table has one row per method minus the reference row.
assert len(results_dict_groups['svhn'].keys()) - 1 == len(error_table['Method'].tolist())
overall_ranks_df = pd.DataFrame({
    'Method': error_table['Method'],
    'ID': error_table['ID'] + ece_table['ID'] + nll_table['ID'],
})
overall_ranks_df



In [None]:
# LaTeX tables for the image classification datasets, one per metric.
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[name])
    for name in ("svhn", "cifar10", "cifar100", "tinyimagenet")
]

results_dict_groups = load_results_multiple_datasets(
    experiment_paths_groups, root_dir="result_summaries")

error_table = print_table_multiple_datasets(results_dict_groups, metric="error")
ece_table = print_table_multiple_datasets(results_dict_groups, metric="ece")
nll_table = print_table_multiple_datasets(results_dict_groups, metric="nll")

In [None]:
# One newline-separated string of experiment names per tabular classification dataset.
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[name])
    for name in ("classification_wine", "classification_toxicity", "classification_abalone",
                 "classification_students", "classification_adult")
]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")
error_table = print_rank_table_multiple_datasets(results_dict_groups, metric="error", no_print=True)
ece_table = print_rank_table_multiple_datasets(results_dict_groups, metric="ece", no_print=True)
nll_table = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True)

# Method names label the heatmap columns; the rows are the three metrics.
titles = error_table['Method'].tolist()
metric_names = ['Error', 'ECE', 'NLL']
id_rows = [error_table['ID'], ece_table['ID'], nll_table['ID']]

fig, ax = plt.subplots(figsize=(8, 12))
im = ax.imshow(id_rows, cmap=my_gradient, vmin=-10.2, vmax=10.2)

# Show every tick and label it with the matching method / metric name.
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(3))
ax.set_xticklabels(titles)
ax.set_yticklabels(metric_names)

# Rotate the method names so that they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
            rotation_mode="anchor")

# Write the value of every cell on top of its colour.
cell_values = [row.tolist() for row in id_rows]
for i, row_values in enumerate(cell_values):
    for j in range(len(titles)):
        text = ax.text(j, i, row_values[j],
                       ha="center", va="center", color="black")

# A small fraction keeps the colorbar about as tall as the heatmap.
cbar = ax.figure.colorbar(im, ax=ax, fraction=0.01, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/id_classification_tab_top.pdf', bbox_inches='tight')
plt.show()
# The rank table has one row per method minus the reference row.
assert len(results_dict_groups['classification_wine'].keys()) - 1 == len(error_table['Method'].tolist())
overall_ranks_df = pd.DataFrame({
    'Method': error_table['Method'],
    'ID': error_table['ID'] + ece_table['ID'] + nll_table['ID'],
})
overall_ranks_df


In [None]:
# LaTeX tables for the tabular classification datasets, one per metric.
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[name])
    for name in ("classification_wine", "classification_toxicity", "classification_abalone",
                 "classification_students", "classification_adult")
]

results_dict_groups = load_results_multiple_datasets(
    experiment_paths_groups, root_dir="result_summaries")

error_table = print_table_multiple_datasets(results_dict_groups, metric="error")
ece_table = print_table_multiple_datasets(results_dict_groups, metric="ece")
nll_table = print_table_multiple_datasets(results_dict_groups, metric="nll")

In [None]:
# Rank tables for the text datasets trained with the global-pooling CNN ...
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[name])
    for name in ("newsgroup-global_pooling_cnn", "sst-global_pooling_cnn")
]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")
error_table = print_rank_table_multiple_datasets(results_dict_groups, metric="error")
ece_table = print_rank_table_multiple_datasets(results_dict_groups, metric="ece")
nll_table = print_rank_table_multiple_datasets(results_dict_groups, metric="nll")


# ... and for the same datasets trained with the transformer.
experiment_paths_groups_t = [
    "\n".join(dataset_experiments_dict[name])
    for name in ("newsgroup-transformer", "sst-transformer")
]

results_dict_groups_t = load_results_multiple_datasets(experiment_paths_groups_t, root_dir="result_summaries")
error_table_t = print_rank_table_multiple_datasets(results_dict_groups_t, metric="error")
ece_table_t = print_rank_table_multiple_datasets(results_dict_groups_t, metric="ece")
nll_table_t = print_rank_table_multiple_datasets(results_dict_groups_t, metric="nll")

# assert the methods are the same
assert error_table['Method'].tolist() == error_table_t['Method'].tolist()

# Average the summary columns of the two architectures, metric by metric.
for cnn_table, transformer_table in ((error_table, error_table_t),
                                     (ece_table, ece_table_t),
                                     (nll_table, nll_table_t)):
    for summary_column in ('ID', 'OOD'):
        cnn_table[summary_column] = (cnn_table[summary_column] + transformer_table[summary_column]) / 2


# Method names label the heatmap columns; the rows are the three metrics.
titles = error_table['Method'].tolist()
metric_names = ['Error', 'ECE', 'NLL']

fig, ax = plt.subplots(figsize=(8, 12))
im = ax.imshow([error_table['ID'], ece_table['ID'], nll_table['ID']], cmap=my_gradient, vmin=-10.2, vmax=10.2)

# Show every tick and label it with the matching method / metric name.
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(3))
ax.set_xticklabels(titles)
ax.set_yticklabels(metric_names)

# Rotate the method names so that they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
            rotation_mode="anchor")

# The averaged values are shown with one decimal place; the labels do not
# depend on the cell position, so they are formatted once up front.
error_table_id_list = ["{:.1f}".format(e) for e in error_table['ID'].tolist()]
ece_table_id_list = ["{:.1f}".format(e) for e in ece_table['ID'].tolist()]
nll_table_id_list = ["{:.1f}".format(e) for e in nll_table['ID'].tolist()]
cell_labels = [error_table_id_list, ece_table_id_list, nll_table_id_list]
for i, row_labels in enumerate(cell_labels):
    for j in range(len(titles)):
        text = ax.text(j, i, row_labels[j],
                       ha="center", va="center", color="black")

# A small fraction keeps the colorbar about as tall as the heatmap.
cbar = ax.figure.colorbar(im, ax=ax, fraction=0.01, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/id_classification_nlp_top.pdf', bbox_inches='tight')
plt.show()
overall_ranks_df = pd.DataFrame({
    'Method': error_table['Method'],
    'ID': error_table['ID'] + ece_table['ID'] + nll_table['ID'],
})
overall_ranks_df

In [None]:
# LaTeX tables for the text datasets trained with the transformer, one per metric.
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[name])
    for name in ("newsgroup-transformer", "sst-transformer")
]

results_dict_groups = load_results_multiple_datasets(
    experiment_paths_groups, root_dir="result_summaries")

error_table = print_table_multiple_datasets(results_dict_groups, metric="error")
ece_table = print_table_multiple_datasets(results_dict_groups, metric="ece")
nll_table = print_table_multiple_datasets(results_dict_groups, metric="nll")

In [None]:
# LaTeX tables for the text datasets trained with the global-pooling CNN, one per metric.
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[name])
    for name in ("newsgroup-global_pooling_cnn", "sst-global_pooling_cnn")
]

results_dict_groups = load_results_multiple_datasets(
    experiment_paths_groups, root_dir="result_summaries")

error_table = print_table_multiple_datasets(results_dict_groups, metric="error")
ece_table = print_table_multiple_datasets(results_dict_groups, metric="ece")
nll_table = print_table_multiple_datasets(results_dict_groups, metric="nll")

In [None]:
# One newline-separated string of experiment names per image regression dataset.
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[name])
    for name in ("rotated_cifar100", "wiki_face")
]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")
mse_table = print_rank_table_multiple_datasets(results_dict_groups, metric="mse", no_print=True)
nll_table = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True)

# Method names label the heatmap columns; the rows are the two metrics.
titles = mse_table['Method'].tolist()
metric_names = ['MSE', 'NLL']
id_rows = [mse_table['ID'], nll_table['ID']]


fig, ax = plt.subplots(figsize=(8, 12))
im = ax.imshow(id_rows, cmap=my_gradient, vmin=-10.2, vmax=10.2)

# Show every tick and label it with the matching method / metric name.
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(2))
ax.set_xticklabels(titles)
ax.set_yticklabels(metric_names)

# Rotate the method names so that they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
            rotation_mode="anchor")

# Write the value of every cell on top of its colour.
cell_values = [row.tolist() for row in id_rows]
for i, row_values in enumerate(cell_values):
    for j in range(len(titles)):
        text = ax.text(j, i, row_values[j],
                       ha="center", va="center", color="black")

# A small fraction keeps the colorbar about as tall as the heatmap.
cbar = ax.figure.colorbar(im, ax=ax, fraction=0.007, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/id_regression_cv_top.pdf', bbox_inches='tight')
plt.show()

overall_ranks_df = pd.DataFrame({
    'Method': mse_table['Method'],
    'ID': mse_table['ID'] + nll_table['ID'],
})
overall_ranks_df


In [None]:
# MSE / NLL tables for the rotated_cifar100 and wiki_face experiments.
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[key])
    for key in ("rotated_cifar100", "wiki_face")
]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")

# Both tables are requested with perc=False, MSE first and NLL second.
mse_table, nll_table = (
    print_table_multiple_datasets(results_dict_groups, metric=metric_name, perc=False)
    for metric_name in ("mse", "nll")
)

In [None]:
# Heatmap of the 'ID' column of the MSE / NLL rank tables for the
# tabular regression experiments.
tabular_regression_keys = (
    "regression_energy",
    "regression_boston",
    "regression_wine",
    "regression_yacht",
    "regression_concrete",
)
experiment_paths_groups = ["\n".join(dataset_experiments_dict[key]) for key in tabular_regression_keys]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")
# no_print=True: only the returned tables are used below.
mse_table = print_rank_table_multiple_datasets(results_dict_groups, metric="mse", no_print=True)
nll_table = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True)

# Column captions: one per method.
titles = mse_table['Method'].tolist()

# One heatmap row per metric.
row_labels = ['MSE', 'NLL']
heat_rows = [mse_table['ID'], nll_table['ID']]

fig, ax = plt.subplots(figsize=(8, 12))
im = ax.imshow(heat_rows, cmap=my_gradient, vmin=-10.2, vmax=10.2)

# A tick on every cell, captioned with the method (x) and the metric (y).
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(len(row_labels)))
ax.set_xticklabels(titles)
ax.set_yticklabels(row_labels)

# Slant the method names and anchor them at their right end.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write each cell's value on top of its colour.
cell_values = [series.tolist() for series in heat_rows]
for row_idx, row_values in enumerate(cell_values):
    for col_idx in range(len(titles)):
        text = ax.text(col_idx, row_idx, row_values[col_idx],
                       ha="center", va="center", color="black")

# The small `fraction` is meant to keep the colorbar as tall as the heatmap.
cbar = ax.figure.colorbar(im, ax=ax, fraction=0.008, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/id_regression_tab_top.pdf', bbox_inches='tight')
plt.show()

# Sum the per-metric 'ID' values and display them next to the method names.
combined_id = mse_table['ID'] + nll_table['ID']
overall_ranks_df = pd.DataFrame({'Method': mse_table['Method'], 'ID': combined_id})
overall_ranks_df


In [None]:
# MSE / NLL tables for the tabular regression experiments.
tabular_regression_keys = (
    "regression_energy",
    "regression_boston",
    "regression_wine",
    "regression_yacht",
    "regression_concrete",
)
experiment_paths_groups = ["\n".join(dataset_experiments_dict[key]) for key in tabular_regression_keys]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")

# MSE table first, NLL table second.
mse_table, nll_table = (
    print_table_multiple_datasets(results_dict_groups, metric=metric_name)
    for metric_name in ("mse", "nll")
)

OOD analysis

In [None]:
# Every entry of result_summaries/ whose name does not mention a transfer.
result_summary_names = [name for name in os.listdir("result_summaries") if 'transfer' not in name]

# The dataset name is whatever precedes the first '-' of an entry name.
dataset_names = {name.split('-')[0] for name in result_summary_names}

# newsgroup and sst are split further by the architecture tag in the name.
text_datasets = ("newsgroup", "sst")

dataset_experiments_dict = {}
for dataset_name in dataset_names:
    if dataset_name not in text_datasets:
        # Plain case: collect every entry whose prefix equals the dataset name.
        dataset_experiments_dict[dataset_name] = [
            experiment_name
            for experiment_name in result_summary_names
            if experiment_name.split('-')[0] == dataset_name
        ]
        continue

    cnn_key = dataset_name + "-global_pooling_cnn"
    transformer_key = dataset_name + "-transformer"
    dataset_experiments_dict[cnn_key] = [
        experiment_name
        for experiment_name in result_summary_names
        if cnn_key in experiment_name
    ]
    # An entry matching the CNN tag is never also listed as a transformer run.
    dataset_experiments_dict[transformer_key] = [
        experiment_name
        for experiment_name in result_summary_names
        if transformer_key in experiment_name and cnn_key not in experiment_name
    ]

In [None]:
# Heatmap of the 'OOD' column of the error / ECE / NLL rank tables for the
# svhn, cifar10, cifar100 and tinyimagenet experiments.
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[key])
    for key in ("svhn", "cifar10", "cifar100", "tinyimagenet")
]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")
# no_print=True: only the returned tables are used below.
error_table = print_rank_table_multiple_datasets(results_dict_groups, metric="error", no_print=True)
ece_table = print_rank_table_multiple_datasets(results_dict_groups, metric="ece", no_print=True)
nll_table = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True)

# Column captions: one per method.
titles = error_table['Method'].tolist()

# One heatmap row per metric, one column per method.
row_labels = ['Error', 'ECE', 'NLL']
heat_rows = [error_table['OOD'], ece_table['OOD'], nll_table['OOD']]

fig, ax = plt.subplots(figsize=(8, 12))
im = ax.imshow(heat_rows, cmap=my_gradient, vmin=-10.2, vmax=10.2)

# A tick on every cell, captioned with the method (x) and the metric (y).
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(len(row_labels)))
ax.set_xticklabels(titles)
ax.set_yticklabels(row_labels)

# Slant the method names and anchor them at their right end.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write each cell's value on top of its colour.
cell_values = [series.tolist() for series in heat_rows]
for row_idx, row_values in enumerate(cell_values):
    for col_idx in range(len(titles)):
        text = ax.text(col_idx, row_idx, row_values[col_idx],
                       ha="center", va="center", color="black")

# The small `fraction` is meant to keep the colorbar as tall as the heatmap.
cbar = ax.figure.colorbar(im, ax=ax, fraction=0.009, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/ood_classification_cv_top.pdf', bbox_inches='tight')
plt.show()

# Sanity check: the svhn results hold exactly one key more than there are
# methods in the table (presumably one non-method entry -- TODO confirm).
assert len(results_dict_groups['svhn'].keys()) - 1 == len(error_table['Method'].tolist())

# Sum the per-metric 'OOD' values and display them next to the method names.
combined_ood = error_table['OOD'] + ece_table['OOD'] + nll_table['OOD']
overall_ranks_df = pd.DataFrame({'Method': error_table['Method'], 'OOD': combined_ood})
overall_ranks_df



In [None]:
# Kendall's tau between the ID and OOD columns of every dataset, reported
# separately for error, ECE and NLL (svhn / cifar10 / cifar100 / tinyimagenet).
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[key])
    for key in ("svhn", "cifar10", "cifar100", "tinyimagenet")
]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")

# raw_ranks=True tables: a "Method" column followed by "<dataset>: ID" and
# "<dataset>: OOD" columns.
error_pd = print_rank_table_multiple_datasets(results_dict_groups, metric="error", no_print=True, raw_ranks=True)
ece_pd = print_rank_table_multiple_datasets(results_dict_groups, metric="ece", no_print=True, raw_ranks=True)
nll_pd = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True, raw_ranks=True)

for metric_label, rank_df in (('Error', error_pd), ('ECE', ece_pd), ('NLL', nll_pd)):
    print(metric_label)
    methods = rank_df["Method"].values
    # Strip the ": ID" / ": OOD" suffix and keep every dataset once, in
    # order of first appearance.
    dataset_names = list(dict.fromkeys(column.split(":")[0] for column in rank_df.columns[1:]))

    tau_list = []
    for dataset_name in dataset_names:
        id_values = rank_df[f"{dataset_name}: ID"].values
        ood_values = rank_df[f"{dataset_name}: OOD"].values
        tau, p_value = kendalltau(id_values, ood_values)
        print(f"{dataset_name}: {tau:.3f} ({p_value:.3f})")
        tau_list.append(tau)
    print(f"Average: {np.mean(tau_list):.3f}")

In [None]:
# Heatmap of the 'OOD' column of the error / ECE / NLL rank tables for the
# tabular classification experiments.
tabular_classification_keys = (
    "classification_wine",
    "classification_toxicity",
    "classification_abalone",
    "classification_students",
    "classification_adult",
)
experiment_paths_groups = ["\n".join(dataset_experiments_dict[key]) for key in tabular_classification_keys]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")

# no_print=True: only the returned tables are used below.
error_table = print_rank_table_multiple_datasets(results_dict_groups, metric="error", no_print=True)
ece_table = print_rank_table_multiple_datasets(results_dict_groups, metric="ece", no_print=True)
nll_table = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True)

# Column captions: one per method.
titles = error_table['Method'].tolist()

# One heatmap row per metric, one column per method.
row_labels = ['Error', 'ECE', 'NLL']
heat_rows = [error_table['OOD'], ece_table['OOD'], nll_table['OOD']]

fig, ax = plt.subplots(figsize=(8, 12))
im = ax.imshow(heat_rows, cmap=my_gradient, vmin=-10.2, vmax=10.2)

# A tick on every cell, captioned with the method (x) and the metric (y).
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(len(row_labels)))
ax.set_xticklabels(titles)
ax.set_yticklabels(row_labels)

# Slant the method names and anchor them at their right end.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write each cell's value on top of its colour.
cell_values = [series.tolist() for series in heat_rows]
for row_idx, row_values in enumerate(cell_values):
    for col_idx in range(len(titles)):
        text = ax.text(col_idx, row_idx, row_values[col_idx],
                       ha="center", va="center", color="black")

# The small `fraction` is meant to keep the colorbar as tall as the heatmap.
cbar = ax.figure.colorbar(im, ax=ax, fraction=0.01, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/ood_classification_tab_top.pdf', bbox_inches='tight')
plt.show()

# Sanity check: the classification_wine results hold exactly one key more
# than there are methods in the table (presumably one non-method entry --
# TODO confirm).
assert len(results_dict_groups['classification_wine'].keys()) - 1 == len(error_table['Method'].tolist())

# Sum the per-metric 'OOD' values and display them next to the method names.
combined_ood = error_table['OOD'] + ece_table['OOD'] + nll_table['OOD']
overall_ranks_df = pd.DataFrame({'Method': error_table['Method'], 'OOD': combined_ood})
overall_ranks_df


In [None]:
# Kendall's tau between the ID and OOD columns of every dataset, reported
# separately for error, ECE and NLL (tabular classification experiments).
tabular_classification_keys = (
    "classification_wine",
    "classification_toxicity",
    "classification_abalone",
    "classification_students",
    "classification_adult",
)
experiment_paths_groups = ["\n".join(dataset_experiments_dict[key]) for key in tabular_classification_keys]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")

# raw_ranks=True tables: a "Method" column followed by "<dataset>: ID" and
# "<dataset>: OOD" columns.
error_pd = print_rank_table_multiple_datasets(results_dict_groups, metric="error", no_print=True, raw_ranks=True)
ece_pd = print_rank_table_multiple_datasets(results_dict_groups, metric="ece", no_print=True, raw_ranks=True)
nll_pd = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True, raw_ranks=True)

for metric_label, rank_df in (('Error', error_pd), ('ECE', ece_pd), ('NLL', nll_pd)):
    print(metric_label)
    methods = rank_df["Method"].values
    # Strip the ": ID" / ": OOD" suffix and keep every dataset once, in
    # order of first appearance.
    dataset_names = list(dict.fromkeys(column.split(":")[0] for column in rank_df.columns[1:]))

    tau_list = []
    for dataset_name in dataset_names:
        id_values = rank_df[f"{dataset_name}: ID"].values
        ood_values = rank_df[f"{dataset_name}: OOD"].values
        tau, p_value = kendalltau(id_values, ood_values)
        print(f"{dataset_name}: {tau:.3f} ({p_value:.3f})")
        tau_list.append(tau)
    print(f"Average: {np.mean(tau_list):.3f}")

In [None]:
# Heatmap of the 'OOD' column of the MSE / NLL rank tables for the
# rotated_cifar100 and wiki_face experiments.
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[key])
    for key in ("rotated_cifar100", "wiki_face")
]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")
# no_print=True: only the returned tables are used below.
mse_table = print_rank_table_multiple_datasets(results_dict_groups, metric="mse", no_print=True)
nll_table = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True)

# Column captions: one per method.
titles = mse_table['Method'].tolist()

# One heatmap row per metric.
row_labels = ['MSE', 'NLL']
heat_rows = [mse_table['OOD'], nll_table['OOD']]

fig, ax = plt.subplots(figsize=(8, 12))
im = ax.imshow(heat_rows, cmap=my_gradient, vmin=-10.2, vmax=10.2)

# A tick on every cell, captioned with the method (x) and the metric (y).
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(len(row_labels)))
ax.set_xticklabels(titles)
ax.set_yticklabels(row_labels)

# Slant the method names and anchor them at their right end.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write each cell's value on top of its colour.
cell_values = [series.tolist() for series in heat_rows]
for row_idx, row_values in enumerate(cell_values):
    for col_idx in range(len(titles)):
        text = ax.text(col_idx, row_idx, row_values[col_idx],
                       ha="center", va="center", color="black")

# The small `fraction` is meant to keep the colorbar as tall as the heatmap.
cbar = ax.figure.colorbar(im, ax=ax, fraction=0.007, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/ood_regression_cv_top.pdf', bbox_inches='tight')
plt.show()

# Sum the per-metric 'OOD' values and display them next to the method names.
combined_ood = mse_table['OOD'] + nll_table['OOD']
overall_ranks_df = pd.DataFrame({'Method': mse_table['Method'], 'OOD': combined_ood})
overall_ranks_df


In [None]:
# Kendall's tau between the ID and OOD columns of every dataset, reported
# first for MSE and then for NLL (rotated_cifar100 / wiki_face).
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[key])
    for key in ("rotated_cifar100", "wiki_face")
]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")
# raw_ranks=True tables: a "Method" column followed by "<dataset>: ID" and
# "<dataset>: OOD" columns.
mse_pd = print_rank_table_multiple_datasets(results_dict_groups, metric="mse", no_print=True, raw_ranks=True)
nll_pd = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True, raw_ranks=True)

for metric_label, rank_df in (('MSE', mse_pd), ('NLL', nll_pd)):
    print(metric_label)
    methods = rank_df["Method"].values
    # Strip the ": ID" / ": OOD" suffix and keep every dataset once, in
    # order of first appearance.
    dataset_names = list(dict.fromkeys(column.split(":")[0] for column in rank_df.columns[1:]))

    tau_list = []
    for dataset_name in dataset_names:
        id_values = rank_df[f"{dataset_name}: ID"].values
        ood_values = rank_df[f"{dataset_name}: OOD"].values
        tau, p_value = kendalltau(id_values, ood_values)
        print(f"{dataset_name}: {tau:.3f} ({p_value:.3f})")
        tau_list.append(tau)
    print(f"Average: {np.mean(tau_list):.3f}")

In [None]:
# Heatmap of the 'OOD' column of the MSE / NLL rank tables for the
# tabular regression experiments.
tabular_regression_keys = (
    "regression_energy",
    "regression_boston",
    "regression_wine",
    "regression_yacht",
    "regression_concrete",
)
experiment_paths_groups = ["\n".join(dataset_experiments_dict[key]) for key in tabular_regression_keys]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")
# no_print=True: only the returned tables are used below.
mse_table = print_rank_table_multiple_datasets(results_dict_groups, metric="mse", no_print=True)
nll_table = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True)

# Column captions: one per method.
titles = mse_table['Method'].tolist()

# One heatmap row per metric.
row_labels = ['MSE', 'NLL']
heat_rows = [mse_table['OOD'], nll_table['OOD']]

fig, ax = plt.subplots(figsize=(8, 12))
im = ax.imshow(heat_rows, cmap=my_gradient, vmin=-10.2, vmax=10.2)

# A tick on every cell, captioned with the method (x) and the metric (y).
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(len(row_labels)))
ax.set_xticklabels(titles)
ax.set_yticklabels(row_labels)

# Slant the method names and anchor them at their right end.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write each cell's value on top of its colour.
cell_values = [series.tolist() for series in heat_rows]
for row_idx, row_values in enumerate(cell_values):
    for col_idx in range(len(titles)):
        text = ax.text(col_idx, row_idx, row_values[col_idx],
                       ha="center", va="center", color="black")

# The small `fraction` is meant to keep the colorbar as tall as the heatmap.
cbar = ax.figure.colorbar(im, ax=ax, fraction=0.008, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/ood_regression_tab_top.pdf', bbox_inches='tight')
plt.show()

# Sum the per-metric 'OOD' values and display them next to the method names.
combined_ood = mse_table['OOD'] + nll_table['OOD']
overall_ranks_df = pd.DataFrame({'Method': mse_table['Method'], 'OOD': combined_ood})
overall_ranks_df


In [None]:
# Kendall's tau between the ID and OOD columns of every dataset, reported
# first for MSE and then for NLL (tabular regression experiments).
tabular_regression_keys = (
    "regression_energy",
    "regression_boston",
    "regression_wine",
    "regression_yacht",
    "regression_concrete",
)
experiment_paths_groups = ["\n".join(dataset_experiments_dict[key]) for key in tabular_regression_keys]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")
# raw_ranks=True tables: a "Method" column followed by "<dataset>: ID" and
# "<dataset>: OOD" columns.
mse_pd = print_rank_table_multiple_datasets(results_dict_groups, metric="mse", no_print=True, raw_ranks=True)
nll_pd = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True, raw_ranks=True)

for metric_label, rank_df in (('MSE', mse_pd), ('NLL', nll_pd)):
    print(metric_label)
    methods = rank_df["Method"].values
    # Strip the ": ID" / ": OOD" suffix and keep every dataset once, in
    # order of first appearance.
    dataset_names = list(dict.fromkeys(column.split(":")[0] for column in rank_df.columns[1:]))

    tau_list = []
    for dataset_name in dataset_names:
        id_values = rank_df[f"{dataset_name}: ID"].values
        ood_values = rank_df[f"{dataset_name}: OOD"].values
        tau, p_value = kendalltau(id_values, ood_values)
        print(f"{dataset_name}: {tau:.3f} ({p_value:.3f})")
        tau_list.append(tau)
    print(f"Average: {np.mean(tau_list):.3f}")

Dataset and architecture transfer

Dataset transfer

In [None]:
# Only the entries of result_summaries/ that belong to the dataset-transfer runs.
result_summary_names = [name for name in os.listdir("result_summaries") if 'dataset-transfer' in name]

# The dataset name is whatever precedes the first '-' of an entry name.
dataset_names = {name.split('-')[0] for name in result_summary_names}

# Map each dataset to its entries, preserving the directory listing order.
dataset_experiments_dict = {
    dataset_key: [
        entry_name
        for entry_name in result_summary_names
        if entry_name.split('-')[0] == dataset_key
    ]
    for dataset_key in dataset_names
}

In [None]:
# Dataset transfer: heatmap of the 'ID' column of the error / ECE / NLL rank
# tables for cifar10, cifar100 and tinyimagenet, followed by the metric tables.
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[key])
    for key in ("cifar10", "cifar100", "tinyimagenet")
]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")

# no_print=True: only the returned tables are used for the figure.
error_table = print_rank_table_multiple_datasets(results_dict_groups, metric="error", no_print=True)
ece_table = print_rank_table_multiple_datasets(results_dict_groups, metric="ece", no_print=True)
nll_table = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True)

# Column captions: one per method.
titles = error_table['Method'].tolist()

# One heatmap row per metric, one column per method.
row_labels = ['Error', 'ECE', 'NLL']
heat_rows = [error_table['ID'], ece_table['ID'], nll_table['ID']]

fig, ax = plt.subplots(figsize=(6, 10))
im = ax.imshow(heat_rows, cmap=my_gradient, vmin=-10.2, vmax=10.2)

# A tick on every cell, captioned with the method (x) and the metric (y).
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(len(row_labels)))
ax.set_xticklabels(titles)
ax.set_yticklabels(row_labels)

# Slant the method names and anchor them at their right end.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write each cell's value on top of its colour.
cell_values = [series.tolist() for series in heat_rows]
for row_idx, row_values in enumerate(cell_values):
    for col_idx in range(len(titles)):
        text = ax.text(col_idx, row_idx, row_values[col_idx],
                       ha="center", va="center", color="black")

# The small `fraction` is meant to keep the colorbar as tall as the heatmap.
cbar = ax.figure.colorbar(im, ax=ax, fraction=0.012, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/id_classification_cv_dataset_transfer.pdf', bbox_inches='tight')
plt.show()

# Sanity check: the cifar100 results hold exactly one key more than there are
# methods in the table (presumably one non-method entry -- TODO confirm).
assert len(results_dict_groups['cifar100'].keys()) - 1 == len(error_table['Method'].tolist())

# Sum of the per-metric 'ID' values per method (stored, not displayed).
combined_id = error_table['ID'] + ece_table['ID'] + nll_table['ID']
overall_ranks_df = pd.DataFrame({'Method': error_table['Method'], 'ID': combined_id})

# The rank tables are no longer needed; reuse the names for the metric tables.
error_table = print_table_multiple_datasets(results_dict_groups, metric="error")
ece_table = print_table_multiple_datasets(results_dict_groups, metric="ece")
nll_table = print_table_multiple_datasets(results_dict_groups, metric="nll")

In [None]:
# Dataset transfer: heatmap of the 'ID' column of the MSE / NLL rank tables for
# regression_energy, regression_wine and regression_concrete, followed by the
# metric tables.
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[key])
    for key in ("regression_energy", "regression_wine", "regression_concrete")
]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")
# no_print=True: only the returned tables are used for the figure.
mse_table = print_rank_table_multiple_datasets(results_dict_groups, metric="mse", no_print=True)
nll_table = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True)

# Column captions: one per method.
titles = mse_table['Method'].tolist()

# One heatmap row per metric.
row_labels = ['MSE', 'NLL']
heat_rows = [mse_table['ID'], nll_table['ID']]

fig, ax = plt.subplots(figsize=(6, 10))
im = ax.imshow(heat_rows, cmap=my_gradient, vmin=-10.2, vmax=10.2)

# A tick on every cell, captioned with the method (x) and the metric (y).
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(len(row_labels)))
ax.set_xticklabels(titles)
ax.set_yticklabels(row_labels)

# Slant the method names and anchor them at their right end.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write each cell's value on top of its colour.
cell_values = [series.tolist() for series in heat_rows]
for row_idx, row_values in enumerate(cell_values):
    for col_idx in range(len(titles)):
        text = ax.text(col_idx, row_idx, row_values[col_idx],
                       ha="center", va="center", color="black")

# The small `fraction` is meant to keep the colorbar as tall as the heatmap.
cbar = ax.figure.colorbar(im, ax=ax, fraction=0.012, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/id_regression_tab_dataset_transfer.pdf', bbox_inches='tight')
plt.show()

# Sum of the per-metric 'ID' values per method (stored, not displayed).
combined_id = mse_table['ID'] + nll_table['ID']
overall_ranks_df = pd.DataFrame({'Method': mse_table['Method'], 'ID': combined_id})

# The rank tables are no longer needed; reuse the names for the metric tables.
mse_table = print_table_multiple_datasets(results_dict_groups, metric="mse")
nll_table = print_table_multiple_datasets(results_dict_groups, metric="nll")


In [None]:
# Dataset transfer: heatmap of the 'OOD' column of the error / ECE / NLL rank
# tables for cifar10, cifar100 and tinyimagenet, followed by the metric tables.
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[key])
    for key in ("cifar10", "cifar100", "tinyimagenet")
]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")

# no_print=True: only the returned tables are used for the figure.
error_table = print_rank_table_multiple_datasets(results_dict_groups, metric="error", no_print=True)
ece_table = print_rank_table_multiple_datasets(results_dict_groups, metric="ece", no_print=True)
nll_table = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True)

# Column captions: one per method.
titles = error_table['Method'].tolist()

# One heatmap row per metric, one column per method.
row_labels = ['Error', 'ECE', 'NLL']
heat_rows = [error_table['OOD'], ece_table['OOD'], nll_table['OOD']]

fig, ax = plt.subplots(figsize=(6, 10))
im = ax.imshow(heat_rows, cmap=my_gradient, vmin=-10.2, vmax=10.2)

# A tick on every cell, captioned with the method (x) and the metric (y).
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(len(row_labels)))
ax.set_xticklabels(titles)
ax.set_yticklabels(row_labels)

# Slant the method names and anchor them at their right end.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write each cell's value on top of its colour.
cell_values = [series.tolist() for series in heat_rows]
for row_idx, row_values in enumerate(cell_values):
    for col_idx in range(len(titles)):
        text = ax.text(col_idx, row_idx, row_values[col_idx],
                       ha="center", va="center", color="black")

# The small `fraction` is meant to keep the colorbar as tall as the heatmap.
cbar = ax.figure.colorbar(im, ax=ax, fraction=0.012, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/ood_classification_cv_dataset_transfer.pdf', bbox_inches='tight')
plt.show()

# Sanity check: the cifar100 results hold exactly one key more than there are
# methods in the table (presumably one non-method entry -- TODO confirm).
assert len(results_dict_groups['cifar100'].keys()) - 1 == len(error_table['Method'].tolist())

# Sum of the per-metric 'OOD' values per method (stored, not displayed).
combined_ood = error_table['OOD'] + ece_table['OOD'] + nll_table['OOD']
overall_ranks_df = pd.DataFrame({'Method': error_table['Method'], 'OOD': combined_ood})

# The rank tables are no longer needed; reuse the names for the metric tables.
error_table = print_table_multiple_datasets(results_dict_groups, metric="error")
ece_table = print_table_multiple_datasets(results_dict_groups, metric="ece")
nll_table = print_table_multiple_datasets(results_dict_groups, metric="nll")

In [None]:
# Dataset transfer: heatmap of the 'OOD' column of the MSE / NLL rank tables
# for regression_energy, regression_wine and regression_concrete, followed by
# the metric tables.
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[key])
    for key in ("regression_energy", "regression_wine", "regression_concrete")
]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")
# no_print=True: only the returned tables are used for the figure.
mse_table = print_rank_table_multiple_datasets(results_dict_groups, metric="mse", no_print=True)
nll_table = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True)

# Column captions: one per method.
titles = mse_table['Method'].tolist()

# One heatmap row per metric.
row_labels = ['MSE', 'NLL']
heat_rows = [mse_table['OOD'], nll_table['OOD']]

fig, ax = plt.subplots(figsize=(6, 10))
im = ax.imshow(heat_rows, cmap=my_gradient, vmin=-10.2, vmax=10.2)

# A tick on every cell, captioned with the method (x) and the metric (y).
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(len(row_labels)))
ax.set_xticklabels(titles)
ax.set_yticklabels(row_labels)

# Slant the method names and anchor them at their right end.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write each cell's value on top of its colour.
cell_values = [series.tolist() for series in heat_rows]
for row_idx, row_values in enumerate(cell_values):
    for col_idx in range(len(titles)):
        text = ax.text(col_idx, row_idx, row_values[col_idx],
                       ha="center", va="center", color="black")

# The small `fraction` is meant to keep the colorbar as tall as the heatmap.
cbar = ax.figure.colorbar(im, ax=ax, fraction=0.012, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/ood_regression_tab_dataset_transfer.pdf', bbox_inches='tight')
plt.show()

# Sum of the per-metric 'OOD' values per method (stored, not displayed).
combined_ood = mse_table['OOD'] + nll_table['OOD']
overall_ranks_df = pd.DataFrame({'Method': mse_table['Method'], 'OOD': combined_ood})

# The rank tables are no longer needed; reuse the names for the metric tables.
mse_table = print_table_multiple_datasets(results_dict_groups, metric="mse")
nll_table = print_table_multiple_datasets(results_dict_groups, metric="nll")


Architecture transfer

In [None]:
# Only the entries of result_summaries/ that belong to the architecture-transfer runs.
result_summary_names = [name for name in os.listdir("result_summaries") if 'architecture-transfer' in name]

# The dataset name is whatever precedes the first '-' of an entry name.
dataset_names = {name.split('-')[0] for name in result_summary_names}

# Map each dataset to its entries, preserving the directory listing order.
dataset_experiments_dict = {
    dataset_key: [
        entry_name
        for entry_name in result_summary_names
        if entry_name.split('-')[0] == dataset_key
    ]
    for dataset_key in dataset_names
}

In [None]:
# Architecture transfer: heatmap of the 'ID' column of the error / ECE / NLL
# rank tables for svhn, cifar100 and tinyimagenet, followed by the metric tables.
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[key])
    for key in ("svhn", "cifar100", "tinyimagenet")
]

results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")

# no_print=True: only the returned tables are used for the figure.
error_table = print_rank_table_multiple_datasets(results_dict_groups, metric="error", no_print=True)
ece_table = print_rank_table_multiple_datasets(results_dict_groups, metric="ece", no_print=True)
nll_table = print_rank_table_multiple_datasets(results_dict_groups, metric="nll", no_print=True)

# Column captions: one per method.
titles = error_table['Method'].tolist()

# One heatmap row per metric, one column per method.
row_labels = ['Error', 'ECE', 'NLL']
heat_rows = [error_table['ID'], ece_table['ID'], nll_table['ID']]

fig, ax = plt.subplots(figsize=(6, 10))
im = ax.imshow(heat_rows, cmap=my_gradient, vmin=-10.2, vmax=10.2)

# A tick on every cell, captioned with the method (x) and the metric (y).
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(len(row_labels)))
ax.set_xticklabels(titles)
ax.set_yticklabels(row_labels)

# Slant the method names and anchor them at their right end.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write each cell's value on top of its colour.
cell_values = [series.tolist() for series in heat_rows]
for row_idx, row_values in enumerate(cell_values):
    for col_idx in range(len(titles)):
        text = ax.text(col_idx, row_idx, row_values[col_idx],
                       ha="center", va="center", color="black")

# The small `fraction` is meant to keep the colorbar as tall as the heatmap.
cbar = ax.figure.colorbar(im, ax=ax, fraction=0.013, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/id_classification_cv_arch_transfer.pdf', bbox_inches='tight')
plt.show()

# Sanity check: the cifar100 results hold exactly one key more than there are
# methods in the table (presumably one non-method entry -- TODO confirm).
assert len(results_dict_groups['cifar100'].keys()) - 1 == len(error_table['Method'].tolist())

# Sum of the per-metric 'ID' values per method (stored, not displayed).
combined_id = error_table['ID'] + ece_table['ID'] + nll_table['ID']
overall_ranks_df = pd.DataFrame({'Method': error_table['Method'], 'ID': combined_id})

# The rank tables are no longer needed; reuse the names for the metric tables.
error_table = print_table_multiple_datasets(results_dict_groups, metric="error")
ece_table = print_table_multiple_datasets(results_dict_groups, metric="ece")
nll_table = print_table_multiple_datasets(results_dict_groups, metric="nll")

In [None]:
# Heatmap of the 'ID' column of the per-metric rank tables for the three
# tabular regression datasets.
# NOTE(review): 'ID' is presumably the in-distribution score -- confirm
# against print_rank_table_multiple_datasets.
regression_dataset_names = ["regression_concrete", "regression_boston", "regression_yacht"]
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[name]) for name in regression_dataset_names
]
results_dict_groups = load_results_multiple_datasets(
    experiment_paths_groups, root_dir="result_summaries")

# One rank table per metric, computed in the order MSE -> NLL.
mse_table, nll_table = (
    print_rank_table_multiple_datasets(results_dict_groups, metric=metric_name, no_print=True)
    for metric_name in ("mse", "nll")
)

# Methods become the heatmap columns, metrics the rows.
titles = mse_table['Method'].tolist()
row_labels = ['MSE', 'NLL']
id_columns = [mse_table['ID'], nll_table['ID']]

fig, ax = plt.subplots(figsize=(6, 10))
im = ax.imshow(id_columns, cmap=my_gradient, vmin=-10.2, vmax=10.2)

# One tick per method / metric, labelled with its name.
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(len(row_labels)))
ax.set_xticklabels(titles)
ax.set_yticklabels(row_labels)

# Slant the method names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write each value into its cell; the columns are converted to lists once
# rather than once per cell.
cell_values = [column.tolist() for column in id_columns]
for row_idx, row_values in enumerate(cell_values):
    for col_idx in range(len(titles)):
        ax.text(col_idx, row_idx, row_values[col_idx],
                ha="center", va="center", color="black")

# fraction/pad chosen so the colourbar is about as tall as the heatmap.
cbar = fig.colorbar(im, ax=ax, fraction=0.012, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/id_regression_tab_arch_transfer.pdf', bbox_inches='tight')
plt.show()

# Sum the two 'ID' columns per method before the tables are replaced below.
overall_ranks_df = pd.DataFrame({
    'Method': mse_table['Method'],
    'ID': mse_table['ID'] + nll_table['ID'],
})

# Finally produce the plain (non-rank) tables for the same metrics.
mse_table = print_table_multiple_datasets(results_dict_groups, metric="mse")
nll_table = print_table_multiple_datasets(results_dict_groups, metric="nll")


In [None]:
# Heatmap of the 'OOD' column of the per-metric rank tables for the three
# computer-vision classification datasets.
# NOTE(review): 'OOD' is presumably the out-of-distribution score -- confirm
# against print_rank_table_multiple_datasets.
cv_dataset_names = ["svhn", "cifar100", "tinyimagenet"]
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[name]) for name in cv_dataset_names
]
results_dict_groups = load_results_multiple_datasets(
    experiment_paths_groups, root_dir="result_summaries")

# One rank table per metric, computed in the order error -> ECE -> NLL.
error_table, ece_table, nll_table = (
    print_rank_table_multiple_datasets(results_dict_groups, metric=metric_name, no_print=True)
    for metric_name in ("error", "ece", "nll")
)

# Methods become the heatmap columns, metrics the rows.
titles = error_table['Method'].tolist()
row_labels = ['Error', 'ECE', 'NLL']
ood_columns = [error_table['OOD'], ece_table['OOD'], nll_table['OOD']]

fig, ax = plt.subplots(figsize=(6, 10))
im = ax.imshow(ood_columns, cmap=my_gradient, vmin=-10.2, vmax=10.2)

# One tick per method / metric, labelled with its name.
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(len(row_labels)))
ax.set_xticklabels(titles)
ax.set_yticklabels(row_labels)

# Slant the method names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write each value into its cell; the columns are converted to lists once
# rather than once per cell.
cell_values = [column.tolist() for column in ood_columns]
for row_idx, row_values in enumerate(cell_values):
    for col_idx in range(len(titles)):
        ax.text(col_idx, row_idx, row_values[col_idx],
                ha="center", va="center", color="black")

# fraction/pad chosen so the colourbar is about as tall as the heatmap.
cbar = fig.colorbar(im, ax=ax, fraction=0.013, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/ood_classification_cv_arch_transfer.pdf', bbox_inches='tight')
plt.show()

# The rank table is expected to hold one entry fewer than the loaded results.
assert len(results_dict_groups['cifar100']) - 1 == len(titles)

# Sum the three 'OOD' columns per method before the tables are replaced below.
overall_ranks_df = pd.DataFrame({
    'Method': error_table['Method'],
    'OOD': error_table['OOD'] + ece_table['OOD'] + nll_table['OOD'],
})

# Finally produce the plain (non-rank) tables for the same metrics.
error_table = print_table_multiple_datasets(results_dict_groups, metric="error")
ece_table = print_table_multiple_datasets(results_dict_groups, metric="ece")
nll_table = print_table_multiple_datasets(results_dict_groups, metric="nll")


In [None]:
# Heatmap of the 'OOD' column of the per-metric rank tables for the three
# tabular regression datasets.
# NOTE(review): 'OOD' is presumably the out-of-distribution score -- confirm
# against print_rank_table_multiple_datasets.
regression_dataset_names = ["regression_concrete", "regression_boston", "regression_yacht"]
experiment_paths_groups = [
    "\n".join(dataset_experiments_dict[name]) for name in regression_dataset_names
]
results_dict_groups = load_results_multiple_datasets(
    experiment_paths_groups, root_dir="result_summaries")

# One rank table per metric, computed in the order MSE -> NLL.
mse_table, nll_table = (
    print_rank_table_multiple_datasets(results_dict_groups, metric=metric_name, no_print=True)
    for metric_name in ("mse", "nll")
)

# Methods become the heatmap columns, metrics the rows.
titles = mse_table['Method'].tolist()
row_labels = ['MSE', 'NLL']
ood_columns = [mse_table['OOD'], nll_table['OOD']]

fig, ax = plt.subplots(figsize=(6, 10))
im = ax.imshow(ood_columns, cmap=my_gradient, vmin=-10.2, vmax=10.2)

# One tick per method / metric, labelled with its name.
ax.set_xticks(np.arange(len(titles)))
ax.set_yticks(np.arange(len(row_labels)))
ax.set_xticklabels(titles)
ax.set_yticklabels(row_labels)

# Slant the method names so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write each value into its cell; the columns are converted to lists once
# rather than once per cell.
cell_values = [column.tolist() for column in ood_columns]
for row_idx, row_values in enumerate(cell_values):
    for col_idx in range(len(titles)):
        ax.text(col_idx, row_idx, row_values[col_idx],
                ha="center", va="center", color="black")

# fraction/pad chosen so the colourbar is about as tall as the heatmap.
cbar = fig.colorbar(im, ax=ax, fraction=0.012, pad=0.04)

fig.tight_layout()
plt.savefig('analysis_images/ood_regression_tab_arch_transfer.pdf', bbox_inches='tight')
plt.show()

# Sum the two 'OOD' columns per method before the tables are replaced below.
overall_ranks_df = pd.DataFrame({
    'Method': mse_table['Method'],
    'OOD': mse_table['OOD'] + nll_table['OOD'],
})

# Finally produce the plain (non-rank) tables for the same metrics.
mse_table = print_table_multiple_datasets(results_dict_groups, metric="mse")
nll_table = print_table_multiple_datasets(results_dict_groups, metric="nll")

## Representative Dataset Visualizations

In [None]:
# Global matplotlib setting: hide the right and top axes spines on every
# figure drawn after this cell (the left and bottom spines stay visible).
import matplotlib as mpl

mpl.rcParams.update({
    'axes.spines.right': False,
    'axes.spines.top': False,
})


In [None]:
def create_plot(dataset_experiments_dict, dataset_name, save_name, ood=False):
    """Draw a two-panel bar chart comparing all methods on one dataset.

    The left panel shows the first metric and the right panel the second:
    error / ECE (multiplied by 100 and labelled as percentages) for
    classification datasets, MSE / NLL for regression datasets. Bars follow
    a fixed method order and a fixed colour per method. The figure is saved
    to ``selected_performances/<save_name>.pdf`` and then shown.

    Args:
        dataset_experiments_dict: Mapping from dataset name to the list of
            result-summary names belonging to that dataset.
        dataset_name: Key looked up in ``dataset_experiments_dict`` and in
            the mapping returned by ``load_results_multiple_datasets``. A
            name containing "regression", "wiki_face" or "rotated" is
            treated as a regression problem, anything else as
            classification.
        save_name: File stem of the PDF written to ``selected_performances``.
        ood: If True, plot the "ood_test" split, otherwise the "test" split.
    """
    import warnings

    # Fixed colour per display label.
    colour_mapping = {
        "No Noise": "#4285f4",
        "Input Weak Aug.": "#ea4335",
        "Input Gaussian": "#fbbc04",
        "Input ODS": "#34a853",
        "Input AugMix": "#ff6d01",
        "Input-Target MixUp": "#46bdc6",
        "Input-Target CMixUp": "#46bdc6",
        "Label Smoothing": "#7baaf7",
        "Activation Gaussian": "#f07b72",
        "Activation Dropout": "#fcd04f",
        "Gradient Gaussian": "#71c287",
        "Model": "#ff994d",
        "Weight Gaussian": "#7ed1d7",
        "Weight DropConnect": "#b3cefb",
        "Top-2 Direct": "#f7b4ae",
        "Top-3 Direct": "#fde49b",
        "Top-2 Optimised": "#aedcba",
        "Top-3 Optimised": "#ffc599",
    }

    # Substring of a result key -> display label.
    label_mapping = {
        'vanilla': 'No Noise',
        'target_smoothing': 'Label Smoothing',
        'input_ods': 'Input ODS',
        'input_augmix': 'Input AugMix',
        'input_target_mixup': 'Input-Target MixUp',
        'activation_dropout': 'Activation Dropout',
        'gradient_gaussian': 'Gradient Gaussian',
        'model_sp': 'Model',
        'top2direct': 'Top-2 Direct',
        'top3direct': 'Top-3 Direct',
        'top2opt': 'Top-2 Optimised',
        'top3opt': 'Top-3 Optimised',
        'input_additive_gaussian': 'Input Gaussian',
        'input_target_cmixup': 'Input-Target CMixUp',
        'activation_additive_gaussian': 'Activation Gaussian',
        'input_random_crop_horizontal_flip': 'Input Weak Aug.',
        'weight_additive_gaussian': 'Weight Gaussian',
        'weight_dropconnect': 'Weight DropConnect',
    }

    # Left-to-right order of the bars.
    ordered_labels = ['No Noise', 'Input Weak Aug.', 'Input Gaussian', 'Input ODS', 'Input AugMix', 'Input-Target MixUp', 'Input-Target CMixUp', 'Label Smoothing', 'Activation Gaussian', 'Activation Dropout', 'Gradient Gaussian', 'Model', 'Weight Gaussian', 'Weight DropConnect', 'Top-2 Direct', 'Top-3 Direct', 'Top-2 Optimised', 'Top-3 Optimised']

    is_regression = any(tag in dataset_name for tag in ("regression", "wiki_face", "rotated"))
    if is_regression:
        metrics = ["mse", "nll"]
        y_labels = ["MSE", "NLL"]
        scale = 1.0
    else:
        metrics = ["error", "ece"]
        y_labels = ["Error (%)", "ECE (%)"]
        scale = 100.0  # classification metrics are plotted as percentages

    split = "ood_test" if ood else "test"

    experiment_paths_groups = ["\n".join(dataset_experiments_dict[dataset_name])]
    results_dict_groups = load_results_multiple_datasets(experiment_paths_groups, root_dir="result_summaries")
    dataset_results = results_dict_groups[dataset_name]

    # Re-key the results by display label. The match is computed afresh for
    # every result key so that an unrecognised key can never inherit the
    # label of the previous one (which would overwrite that method's
    # results) or hit an undefined variable.
    relabeled_results = {}
    for key in dataset_results:
        matches = [label_key for label_key in label_mapping if label_key in key]
        if not matches:
            warnings.warn(f"create_plot: skipping result key {key!r}; it matches no known method")
            continue
        # When several substrings match, the last one in label_mapping wins.
        relabeled_results[label_mapping[matches[-1]]] = dataset_results[key]

    # Keep only the methods that are present, in the canonical order.
    labels = [label for label in ordered_labels if label in relabeled_results]
    selected_colours = [colour_mapping[label] for label in labels]

    # The first element of each result entry is the value that gets plotted.
    results_per_metric = {
        metric: {
            label: float(relabeled_results[label][metric][split][0]) * scale
            for label in labels
        }
        for metric in metrics
    }

    # One panel per metric, side by side.
    plt.figure(figsize=(8, 3))
    for panel, (metric, y_label) in enumerate(zip(metrics, y_labels), start=1):
        values = results_per_metric[metric]
        plt.subplot(1, 2, panel)
        plt.bar(range(len(values)), list(values.values()), align='center', color=selected_colours)
        plt.xticks([])  # methods are identified by colour, not by tick labels
        plt.yticks(fontsize=14)
        plt.ylabel(y_label, fontsize=16)

    plt.tight_layout()

    plt.savefig("selected_performances/" + save_name + ".pdf")
    plt.show()

In [None]:
# Every entry of result_summaries/ whose name does not contain 'transfer'.
result_summary_names = [
    summary_name
    for summary_name in os.listdir("result_summaries")
    if 'transfer' not in summary_name
]

# The dataset identifier is whatever precedes the first '-' in a name.
dataset_names = {summary_name.split('-')[0] for summary_name in result_summary_names}

# Group the summaries per dataset. newsgroup and sst are additionally split
# by architecture and stored under "<dataset>-<architecture>" keys.
dataset_experiments_dict = {}
for dataset_name in dataset_names:
    if dataset_name not in ("newsgroup", "sst"):
        dataset_experiments_dict[dataset_name] = [
            summary_name
            for summary_name in result_summary_names
            if summary_name.split('-')[0] == dataset_name
        ]
        continue

    cnn_key = dataset_name + "-global_pooling_cnn"
    transformer_key = dataset_name + "-transformer"
    dataset_experiments_dict[cnn_key] = [
        summary_name
        for summary_name in result_summary_names
        if cnn_key in summary_name
    ]
    # A name that already matched the CNN key is never counted as a
    # transformer run as well.
    dataset_experiments_dict[transformer_key] = [
        summary_name
        for summary_name in result_summary_names
        if cnn_key not in summary_name and transformer_key in summary_name
    ]

In [None]:
# CIFAR-10, "test" split (ood=False): error / ECE bars saved as
# selected_performances/id_selection_c10.pdf.
create_plot(dataset_experiments_dict, "cifar10", "id_selection_c10", ood=False)

In [None]:
# CIFAR-10, "ood_test" split (ood=True): error / ECE bars saved as
# selected_performances/ood_selection_c10.pdf.
create_plot(dataset_experiments_dict, "cifar10", "ood_selection_c10", ood=True)

In [None]:
# Newsgroup (global-pooling CNN runs only), "test" split. create_plot uses
# dataset_name to index both the experiments dict and the loaded results, so
# the CNN runs are re-keyed under the plain "newsgroup" name here.
# NOTE(review): presumably load_results_multiple_datasets keys its output by
# the dataset prefix of the file names -- confirm.
create_plot({'newsgroup': dataset_experiments_dict['newsgroup-global_pooling_cnn']}, "newsgroup", "id_selection_ng", ood=False)

In [None]:
# Adult (tabular classification), "test" split: error / ECE bars saved as
# selected_performances/id_selection_cls_adult.pdf.
create_plot(dataset_experiments_dict, "classification_adult", "id_selection_cls_adult", ood=False)

In [None]:
# Adult (tabular classification), "ood_test" split: error / ECE bars saved as
# selected_performances/ood_selection_cls_adult.pdf.
create_plot(dataset_experiments_dict, "classification_adult", "ood_selection_cls_adult", ood=True)

In [None]:
# wiki_face is handled as a regression dataset by create_plot, so this draws
# MSE / NLL bars for the "test" split
# (selected_performances/id_selection_wiki_face.pdf).
create_plot(dataset_experiments_dict, "wiki_face", "id_selection_wiki_face", ood=False)

In [None]:
# wiki_face is handled as a regression dataset by create_plot, so this draws
# MSE / NLL bars for the "ood_test" split
# (selected_performances/ood_selection_wiki_face.pdf).
create_plot(dataset_experiments_dict, "wiki_face", "ood_selection_wiki_face", ood=True)

In [None]:
# Yacht (tabular regression), "test" split: MSE / NLL bars saved as
# selected_performances/id_selection_regr_yacht.pdf.
create_plot(dataset_experiments_dict, "regression_yacht", "id_selection_regr_yacht", ood=False)

In [None]:
# Yacht (tabular regression), "ood_test" split: MSE / NLL bars saved as
# selected_performances/ood_selection_regr_yacht.pdf.
create_plot(dataset_experiments_dict, "regression_yacht", "ood_selection_regr_yacht", ood=True)