In [None]:
import collections
import matplotlib.pyplot as plt
import getpass
import glob
import itertools
import json
import numpy as np
import os
import openml
import pandas as pd
import scipy.stats

from plot_utils.style import style_dc
from plot_utils.common_plots import rank, average_plot
from plot_utils.common_tables import collect_data_for_final_table, do_wilcoxon_test
from plot_utils.common_loading import load_from_openml

import sys
sys.path.append("/home/eggenspk/Work/Project/2020_PoSH_Autosklearn/2020_IEEE_Autosklearn_experiments/experiment_scripts/")
sys.path.append("/home/feurerm/sync_dir/projects/2020_posh/2020_IEEE_Autosklearn_experiments/experiment_scripts")
from utils import openml_automl_benchmark, get_normalization_constants

In [None]:
# Root directory that holds all experiment results. Result paths further down
# are built by plain string concatenation (dir_ + model_name + ...), so the
# value must end with a path separator.
#
# Resolution order:
#   1. the POSH_RESULTS_DIR environment variable (lets anybody run the notebook),
#   2. the per-user defaults of the original authors.
username = getpass.getuser()
_known_result_dirs = {
    'eggenspk': "/media/eggenspk/04a9389c-b7e2-474a-a9de-c66d5345f407/2020_posh/",
    'feurerm': "/home/feurerm/projects/2020_posh/",
}
_env_result_dir = os.environ.get("POSH_RESULTS_DIR")
if _env_result_dir:
    # os.path.join(path, "") appends the trailing separator only if it is missing.
    dir_ = os.path.join(_env_result_dir, "")
elif username in _known_result_dirs:
    dir_ = _known_result_dirs[username]
else:
    # Still a KeyError (as before), but now it tells the reader how to fix it.
    raise KeyError(
        "No result directory configured for user %r. Set the POSH_RESULTS_DIR "
        "environment variable to the root directory of the experiment results."
        % username
    )

def _pretty_names_for(minutes):
    """Build the {(tags, relative result path): plot label} table for one horizon.

    `minutes` is the horizon (10 or 60); the second path component used by the
    result folders is always six times that number (60 / 360).

    Keys whose first element is None are the AutoAuto selector runs; keys whose
    first element is a tuple of tags (resampling strategy, optionally "sh")
    are the individual portfolio runs. Insertion order is: selectors first,
    then for each resampling strategy the plain run followed by its SH variant.
    """
    root = "%dMIN/" % minutes
    slot = "%d/" % (minutes * 6)
    names = {}

    # Selector runs.
    names[(None, root + "RQ1_AutoAuto_simulate/dynamic/" + slot)] = "All"
    restricted_selectors = (
        ("no_cv", "Only Holdout"),
        ("only_cv", "Only CV"),
        ("no_sh", "Full budget"),
        ("only_sh", "Only SH"),
    )
    for subset, label in restricted_selectors:
        path = root + "RQ2.1_AutoAuto_simulate/" + subset + "/dynamic/" + slot
        names[(None, path)] = label

    # Individual portfolio runs, with and without successive halving.
    stem = root + "ASKL_run_with_portfolio_w_ensemble/" + slot + "RF/RF_"
    for resampling in ("holdout", "3CV", "5CV", "10CV"):
        tail = resampling + "_iterative_es_if"
        names[((resampling,), stem + "None_" + tail)] = resampling
        names[((resampling, "sh"), stem + "SH-eta4-i_" + tail)] = "SH; " + resampling
    return names


# horizon in minutes -> {(tags, relative result path): plot label}
valid_pretty = {minutes: _pretty_names_for(minutes) for minutes in (10, 60)}

task_ids = openml_automl_benchmark

# res_dc[horizon][task id][mode] -> list with one {trajectory key: value} dict per seed.
# Each trajectory entry contributes (entry[0], entry[1]); NOTE(review): these
# look like (time, loss) pairs - confirm against the result file format.
res_dc = {}
miss = 0      # seeds for which neither result file exists
fallback = 0  # seeds for which only the plain result.json exists
for horizon in valid_pretty:
    res_dc[horizon] = {}
    for tid in task_ids:
        res_dc[horizon][tid] = {}
        for mode in valid_pretty[horizon]:
            if type(mode) is tuple:
                auto, model_name = mode
            else:
                auto, model_name = None, mode
            runs = res_dc[horizon][tid][mode] = []
            for seed in range(10):
                # Selector runs (auto is None) and portfolio runs use different
                # naming schemes for the per-seed run directory.
                if auto is None:
                    run_dir = dir_ + model_name + "autoauto_%d_%d/" % (tid, seed)
                else:
                    run_dir = dir_ + model_name + "_%d_%d_0_0/" % (tid, seed)

                # Ensemble performance is preferred ...
                hits = glob.glob(run_dir + "ensemble_results_0.000000thresh_50size_1.000000best")
                if not hits:
                    # ... otherwise fall back to the regular result file.
                    plain_path = run_dir + "result.json"
                    hits = glob.glob(plain_path)
                    if not hits:
                        miss += 1
                        print('Missing', plain_path)
                        # Placeholder trajectory: value 1 at key 0.
                        runs.append({0: 1})
                        continue
                    fallback += 1
                    print(tid, 'Ensemble missing, falling back to regular', hits[0])

                with open(hits[0], "r") as fh:
                    payload = json.load(fh)
                # Use the "50" entry when present, the "0" entry otherwise.
                entry = payload["50"] if "50" in payload else payload["0"]
                runs.append({step[0]: step[1] for step in entry["trajectory"]})
print("Missing %d entries" % miss)
print("Fallback %d entries" % fallback)

In [None]:
# Artificially add oracle and random
#
# For every selector entry (first key element None) two synthetic modes are
# derived from the individual runs the selector could have chosen from:
#   "<label> random": for each seed, the run of a randomly drawn option,
#   "<label> oracle": for each seed, the run of the option with the lowest
#                     value at its largest trajectory key.
# Both are stored in res_dc and registered in valid_pretty under
# ("artificial", "<label> random|oracle").

# selector label -> predicate on an option's tag tuple
subset_filters = {
    "All": lambda tags: True,
    "Only Holdout": lambda tags: "holdout" in tags,
    "Only CV": lambda tags: "3CV" in tags or "5CV" in tags or "10CV" in tags,
    "Full budget": lambda tags: "sh" not in tags,
    "Only SH": lambda tags: "sh" in tags,
}

for horizon in res_dc:
    if horizon == 36000:
        continue
    # Re-seeded per horizon, so every horizon sees the same sequence of draws.
    rng = np.random.RandomState(1)
    for tid in task_ids:
        per_task = res_dc[horizon][tid]
        # Snapshot of the keys: valid_pretty[horizon] grows inside this loop.
        for model in list(valid_pretty[horizon].keys()):
            sub, model_name = model
            nice_label = valid_pretty[horizon][model]
            # Only selector entries with a known label get synthetic companions.
            if sub is not None or nice_label not in subset_filters:
                continue
            matches = subset_filters[nice_label]
            # Candidates: individual runs only (no selectors, no synthetic modes).
            options = [
                k for k in per_task
                if k[0] is not None and "artificial" not in k[0] and matches(k[0])
            ]
            print(nice_label, [o[0] for o in options])

            assert len(options) > 1, (nice_label, options)

            # random: one independently drawn option per seed
            rand_mode = ("artificial", "%s random" % nice_label)
            draws = rng.choice(len(options), 10)
            per_task[rand_mode] = [per_task[options[c]][s] for s, c in enumerate(draws)]

            # oracle: per seed, the option whose last recorded value is smallest
            orac_mode = ("artificial", "%s oracle" % nice_label)
            oracle_runs = per_task[orac_mode] = []
            for s in range(10):
                final_values = []
                for o in options:
                    run = per_task[o][s]
                    final_values.append(run[sorted(run)[-1]])
                winner = options[np.argmin(final_values)]
                oracle_runs.append(per_task[winner][s])

            valid_pretty[horizon][orac_mode] = "%s oracle" % nice_label
            valid_pretty[horizon][rand_mode] = "%s random" % nice_label


In [None]:
# Sanity check: dump the full {horizon: {mode key: label}} mapping as it stands
# at this point of the notebook.
print(valid_pretty)

In [None]:
# Load some stuff from disc and openml - takes some time
# load_from_openml returns a pair: the task table and a list of task ids
# (presumably ordered by number of features, judging by the name - TODO confirm).
tasks, task_ids_sorted_by_num_features = load_from_openml(task_ids)
# Normalization constants read from the result directory; the exact structure
# is defined by get_normalization_constants and not visible in this notebook.
min_diff_dc = get_normalization_constants(dir_, load=True)
# Bare expression: rendered as the cell output.
tasks

In [None]:
# One figure per task: for every mode the median curve over seeds together
# with the 25%-75% quantile band.
HORIZON = 60
for tid in task_ids_sorted_by_num_features:
    plt.figure(figsize=[16,12])
    # Restart the project palette for every figure so that a mode gets the
    # same color in each plot.
    colors = itertools.cycle(style_dc["colors"])
    for mode in list(valid_pretty[HORIZON].keys()):
        c = next(colors)
        # Rows: one per seed; columns: trajectory keys, sorted ascending.
        # Forward-fill along the columns so every seed has a value at every key.
        tmp = pd.DataFrame(res_dc[HORIZON][tid][mode]).sort_index(axis=1).ffill(axis=1)
        # Extend each curve to x = HORIZON*60 by repeating its last value.
        # NOTE(review): assumes trajectory keys are seconds and HORIZON is minutes - confirm.
        med = tmp.median(axis=0)
        med.loc[HORIZON*60] = med.iloc[-1]
        low = tmp.quantile(0.25, axis=0)
        low.loc[HORIZON*60] = low.iloc[-1]
        up = tmp.quantile(0.75, axis=0)
        up.loc[HORIZON*60] = up.iloc[-1]
        # Use the palette color for both the line and its band; previously `c`
        # was drawn from the cycle but never passed on, so the project palette
        # was ignored.
        plt.plot(med.index, med.to_numpy(), label=valid_pretty[HORIZON][mode], linewidth=3, color=c)
        plt.fill_between(med.index, low, up, alpha=0.3, color=c)
    plt.title('Name: %s (%d), #instances: %d, #attributes: %d' % (
        tasks.loc[tid, 'name'], tid, tasks.loc[tid, 'NumberOfInstances'], tasks.loc[tid, 'NumberOfFeatures']))
    plt.legend(fontsize=20)
    # Zoom in on the lowest 30% of the automatically chosen y-range.
    y_min, y_max = plt.ylim()
    plt.ylim([y_min, y_min + 0.3*(y_max-y_min)])
    #plt.yscale("log")
    plt.show()

In [None]:
# Plot average BER across all datasets
# Only the synthetic ("artificial") oracle/random modes are shown here.
HORIZON = 60
model_list = [key for key in valid_pretty[HORIZON] if key[0] == "artificial"]
average_plot(
    model_list=model_list,
    res_dc=res_dc,
    valid_pretty=valid_pretty,
    horizon=HORIZON,
    task_ids_sorted_by_num_features=task_ids_sorted_by_num_features,
    min_diff_dc=min_diff_dc,
)
plt.yscale("log")
plt.legend(fontsize=5)
plt.show()

In [None]:
# Assume we have either all or no models
# Per horizon, keep the selector runs (first key element None) and the
# synthetic oracle/random modes; the individual portfolio runs are left out.
horizon_list = sorted(valid_pretty)
model_list = {
    h: [key for key in valid_pretty[h] if key[0] is None or key[0] == "artificial"]
    for h in horizon_list
}
print(model_list)
tab_data, stat_test_data = collect_data_for_final_table(
    model_list, res_dc, valid_pretty, horizon_list,
    task_ids_sorted_by_num_features, min_diff_dc,
)

In [None]:
# Do a test per setting
#
# Every test keeps exactly one selector together with its oracle/random
# companions and excludes everything else. The exclude list is generated from
# the tables below instead of being copy-pasted once per setting; contents
# and order of each list are the same as in the hand-written version.
# NOTE(review): how do_wilcoxon_test matches the `exclude` entries (e.g. the
# bare tag tuples, or the fact that only 60MIN paths are listed) is defined in
# plot_utils.common_tables and not visible here.

# selector label -> its key in valid_pretty[60], in exclude-list order
_selector_keys_60min = [
    ("All", (None, '60MIN/RQ1_AutoAuto_simulate/dynamic/360/')),
    ("Only Holdout", (None, '60MIN/RQ2.1_AutoAuto_simulate/no_cv/dynamic/360/')),
    ("Only CV", (None, '60MIN/RQ2.1_AutoAuto_simulate/only_cv/dynamic/360/')),
    ("Full budget", (None, '60MIN/RQ2.1_AutoAuto_simulate/no_sh/dynamic/360/')),
    ("Only SH", (None, '60MIN/RQ2.1_AutoAuto_simulate/only_sh/dynamic/360/')),
]
# Tag tuples of the individual portfolio runs; excluded from every test.
_portfolio_tags = [
    ('holdout',), ('3CV',), ('5CV',), ('10CV',),
    ('holdout', 'sh'), ('3CV', 'sh'), ('5CV', 'sh'), ('10CV', 'sh'),
]


def _exclude_all_but(keep_label):
    """Return the exclude list that leaves only `keep_label` and its oracle/random."""
    others = [(label, key) for label, key in _selector_keys_60min if label != keep_label]
    selectors = [key for _, key in others]
    synthetic = [
        ('artificial', '%s %s' % (label, kind))
        for label, _ in others
        for kind in ('oracle', 'random')
    ]
    return selectors + _portfolio_tags + synthetic


# (printed heading, selector label to keep), in the original order of the tests
_wilcoxon_settings = [
    ("ALL", "All"),
    ("Only holdout", "Only Holdout"),
    ("Only CV", "Only CV"),
    ("Only SH", "Only SH"),
    ("Only Full Budget", "Full budget"),
]
for _heading, _keep_label in _wilcoxon_settings:
    not_different = do_wilcoxon_test(stat_test_data, model_list, horizon_list, valid_pretty,
                                     exclude=_exclude_all_but(_keep_label))
    print(_heading)
    print(not_different)

In [None]:
# Turn the collected table data into a DataFrame and print it as LaTeX source.
df = pd.DataFrame(tab_data)
# Disabled: optional per-horizon rank columns (average rank, lower value = better rank).
#for horizon in horizon_list:
#    df['Rank_%s' % horizon] = df[horizon].rank(method='average', ascending=True)
print(df.to_latex())

In [None]:
# Ranking for the 60-minute horizon.
# Keep keys whose first element is None or "artificial", minus every key whose
# second element mentions "oracle" or "random".
HORIZON = 60
model_list = [
    key for key in valid_pretty[HORIZON]
    if key[0] in (None, "artificial")
    and "oracle" not in key[1]
    and 'random' not in key[1]
]
print(model_list)
rank(model_list, res_dc, valid_pretty, HORIZON, task_ids_sorted_by_num_features, n_iter=200, steplength=5)