In [None]:
import collections
import matplotlib.pyplot as plt
import getpass
import glob
import itertools
import json
import numpy as np
import os
import openml
import pandas as pd
import scipy.stats

from plot_utils.style import style_dc
from plot_utils.common_plots import rank, average_plot
from plot_utils.common_tables import collect_data_for_final_table, do_wilcoxon_test
from plot_utils.common_loading import load_from_openml

import sys
sys.path.append("/home/eggenspk/Work/Project/2020_PoSH_Autosklearn/2020_IEEE_Autosklearn_experiments/experiment_scripts/")
sys.path.append("/home/feurerm/sync_dir/projects/2020_posh/2020_IEEE_Autosklearn_experiments/experiment_scripts")
from utils import openml_automl_benchmark, get_normalization_constants

In [None]:
# Resolve the root directory that holds the experiment results for the user
# running this notebook. The locations are machine-specific, so every user
# needs an entry in the mapping below.
username = getpass.getuser()
_RESULT_DIRS = {
    'eggenspk': "/media/eggenspk/04a9389c-b7e2-474a-a9de-c66d5345f407/2020_posh/",
    'feurerm': "/home/feurerm/projects/2020_posh/",
}
if username not in _RESULT_DIRS:
    # Same exception type a plain dict lookup would raise, but with a message
    # that tells the reader what has to be configured.
    raise KeyError(
        "No result directory configured for user %r; add an entry to "
        "_RESULT_DIRS (known users: %s)" % (username, sorted(_RESULT_DIRS))
    )
dir_ = _RESULT_DIRS[username]

# Runs to load and how to label them, grouped by time horizon (10 and 60; the
# per-task plots extend the curves to HORIZON*60 on a "runtime in seconds"
# axis, i.e. the horizon is given in minutes).
#
# Each key is a 3-tuple ``(kind, variant, result_path)``:
#   * kind         - 'auto' for run directories named ``<path>_<tid>_<seed>``;
#                    None for run directories named ``<path>_<tid>_<seed>_0_0``.
#   * variant      - short tag of the configuration; None for the single
#                    evaluation-strategy runs.
#   * result_path  - path prefix of the run directory, relative to ``dir_``.
# Each value is the human-readable label used in legends and tables.
#
# NOTE(review): the "Selector (PORT+BO, no fallback)" entries point at
# ``AutoAuto_simulate/`` while the other PORT+BO entries point at
# ``AutoAuto_simulate_RQ1_target_dir/`` - confirm this is intentional.
valid_pretty = {
    10: {
    ('auto', "port", "10MIN/RQ2.3_AutoAuto_simulate/dynamic/60/autoauto"): "Selector (PORT)",
    ('auto', "port", "10MIN/RQ2.3_AutoAuto_simulate/dynamic-no-fallback/60/autoauto"): "Selector (PORT, no fallback)",
    ('auto', "port", "10MIN/RQ2.3_AutoAuto_simulate/static/60/autoauto"): "Single best (PORT)",
    ('auto', "port+BO", "10MIN/AutoAuto_simulate_RQ1_target_dir/dynamic/60/autoauto"): "Selector (PORT+BO)",
    ('auto', "port+BO", "10MIN/AutoAuto_simulate/dynamic-no-fallback/60/autoauto"): "Selector (PORT+BO, no fallback)",
    ('auto', "port+BO", "10MIN/AutoAuto_simulate_RQ1_target_dir/static/60/autoauto"): "Single best (PORT+BO)",
    ('auto', "port+BO-ens", "10MIN/RQ1_AutoAuto_simulate/dynamic/60/autoauto"): "Selector (PORT+BO+ENS)",
    ('auto', "port+BO-ens", "10MIN/RQ1_AutoAuto_simulate/static/60/autoauto"): "Single best (PORT+BO+ENS)",
    ('auto', "port+BO-ens", "10MIN/RQ1_AutoAuto_simulate/dynamic-no-fallback/60/autoauto"): "Selector (PORT+BO+ENS, no fallback)",
    
    # Individual evaluation strategies (with and without successive halving, "SH").
    (None, None, "10MIN/ASKL_run_with_portfolio_w_ensemble/60/RF/RF_None_holdout_iterative_es_if"): "holdout",
    (None, None, "10MIN/ASKL_run_with_portfolio_w_ensemble/60/RF/RF_SH-eta4-i_holdout_iterative_es_if"): "SH; holdout",
    (None, None, "10MIN/ASKL_run_with_portfolio_w_ensemble/60/RF/RF_None_3CV_iterative_es_if"): "3CV",
    (None, None, "10MIN/ASKL_run_with_portfolio_w_ensemble/60/RF/RF_SH-eta4-i_3CV_iterative_es_if"): "SH; 3CV",
    (None, None, "10MIN/ASKL_run_with_portfolio_w_ensemble/60/RF/RF_None_5CV_iterative_es_if"): "5CV",
    (None, None, "10MIN/ASKL_run_with_portfolio_w_ensemble/60/RF/RF_SH-eta4-i_5CV_iterative_es_if"): "SH; 5CV",
    (None, None, "10MIN/ASKL_run_with_portfolio_w_ensemble/60/RF/RF_None_10CV_iterative_es_if"): "10CV",
    (None, None, "10MIN/ASKL_run_with_portfolio_w_ensemble/60/RF/RF_SH-eta4-i_10CV_iterative_es_if"): "SH; 10CV",
    },
    60: {
    ('auto', "port", "60MIN/RQ2.3_AutoAuto_simulate/dynamic/360/autoauto"): "Selector (PORT)",
    ('auto', "port", "60MIN/RQ2.3_AutoAuto_simulate/dynamic-no-fallback/360/autoauto"): "Selector (PORT, no fallback)",
    ('auto', "port", "60MIN/RQ2.3_AutoAuto_simulate/static/360/autoauto"): "Single best (PORT)",
    ('auto', "port+BO", "60MIN/AutoAuto_simulate_RQ1_target_dir/dynamic/360/autoauto"): "Selector (PORT+BO)",
    ('auto', "port+BO", "60MIN/AutoAuto_simulate/dynamic-no-fallback/360/autoauto"): "Selector (PORT+BO, no fallback)",
    ('auto', "port+BO", "60MIN/AutoAuto_simulate_RQ1_target_dir/static/360/autoauto"): "Single best (PORT+BO)",
    ('auto', "port+BO-ens", "60MIN/RQ1_AutoAuto_simulate/dynamic/360/autoauto"): "Selector (PORT+BO+ENS)",
    ('auto', "port+BO-ens", "60MIN/RQ1_AutoAuto_simulate/static/360/autoauto"): "Single best (PORT+BO+ENS)",
    ('auto', "port+BO-ens", "60MIN/RQ1_AutoAuto_simulate/dynamic-no-fallback/360/autoauto"): "Selector (PORT+BO+ENS, no fallback)",

    # Individual evaluation strategies (with and without successive halving, "SH").
    (None, None, "60MIN/ASKL_run_with_portfolio_w_ensemble/360/RF/RF_None_holdout_iterative_es_if"): "holdout",
    (None, None, "60MIN/ASKL_run_with_portfolio_w_ensemble/360/RF/RF_SH-eta4-i_holdout_iterative_es_if"): "SH; holdout",
    (None, None, "60MIN/ASKL_run_with_portfolio_w_ensemble/360/RF/RF_None_3CV_iterative_es_if"): "3CV",
    (None, None, "60MIN/ASKL_run_with_portfolio_w_ensemble/360/RF/RF_SH-eta4-i_3CV_iterative_es_if"): "SH; 3CV",
    (None, None, "60MIN/ASKL_run_with_portfolio_w_ensemble/360/RF/RF_None_5CV_iterative_es_if"): "5CV",
    (None, None, "60MIN/ASKL_run_with_portfolio_w_ensemble/360/RF/RF_SH-eta4-i_5CV_iterative_es_if"): "SH; 5CV",
    (None, None, "60MIN/ASKL_run_with_portfolio_w_ensemble/360/RF/RF_None_10CV_iterative_es_if"): "10CV",
    (None, None, "60MIN/ASKL_run_with_portfolio_w_ensemble/360/RF/RF_SH-eta4-i_10CV_iterative_es_if"): "SH; 10CV",
    },
}

# Tasks to evaluate: the task-id collection imported from `utils`.
# `task_ids` is the very same object as `openml_automl_benchmark` (no copy).
task_ids = openml_automl_benchmark
# Disabled: optional exclusion of individual tasks. If re-enabled, note that
# `remove` would modify `openml_automl_benchmark` in place as well, and that
# the bare `except` hides every error, not only "task id not present".
#for tid in [168794, 168796, 168797, 189866, 189873, 189874, 75193]:
#    try:
#        task_ids.remove(tid)
#    except:
#        pass
# Sanity check: number of tasks that will be loaded.
print(len(task_ids))

# Load the loss trajectory of every (horizon, task, mode, seed) run from disk.
#
# Result layout: ``res_dc[horizon][tid][mode]`` is a list with one
# ``{x: loss}`` dict per run that was found. Runs missing on disk are skipped
# (and counted in ``miss``), so a list can be shorter than N_SEEDS and the
# list position then no longer equals the seed.
ENSEMBLE_RESULT_FILE = "ensemble_results_0.000000thresh_50size_1.000000best"
PLAIN_RESULT_FILE = "result.json"
N_SEEDS = 10


def _read_trajectory(path):
    """Read one result file and return its trajectory as a dict.

    The file is JSON with a top-level entry "50" or, if that is absent, "0";
    the entry holds a "trajectory" list. The first two values of every
    trajectory item become key and value of the returned dict.
    """
    with open(path, "r") as fh:
        content = json.load(fh)
    entry = content["50"] if "50" in content else content["0"]
    return {point[0]: point[1] for point in entry["trajectory"]}


res_dc = {}
miss = 0
for horizon, modes in valid_pretty.items():
    res_dc[horizon] = {}
    for tid in task_ids:
        res_dc[horizon][tid] = {}
        for mode in list(modes):
            if isinstance(mode, tuple):
                if len(mode) != 3:
                    # Not an on-disk run: e.g. the synthetic 2-tuple
                    # "oracle"/"random" keys a later cell adds to
                    # valid_pretty. Without this guard, re-running this cell
                    # after that one fails while unpacking the key.
                    continue
                auto, _, model_name = mode
            else:
                model_name = mode
                auto = None
            res_dc[horizon][tid][mode] = []
            # 'auto' runs live in <path>_<tid>_<seed>, all others in
            # <path>_<tid>_<seed>_0_0.
            run_suffix = "_%d_%d" if auto == "auto" else "_%d_%d_0_0"
            for seed in range(N_SEEDS):
                run_dir = dir_ + model_name + run_suffix % (tid, seed)
                # Prefer the ensemble results, fall back to the plain result.
                fl = glob.glob(run_dir + "/" + ENSEMBLE_RESULT_FILE)
                if len(fl) == 0:
                    fl_tmpl = run_dir + "/" + PLAIN_RESULT_FILE
                    fl = glob.glob(fl_tmpl)
                    if len(fl) == 0:
                        miss += 1
                        print(fl_tmpl)
                        continue
                    print(tid, '### Ensemble missing, falling back to regular', fl[0])
                res_dc[horizon][tid][mode].append(_read_trajectory(fl[0]))
print("Missing %d entries" % miss)

In [None]:
# Artificially add oracle and random
#
# Two synthetic modes are derived per task from the runs whose key starts with
# None (the individual evaluation strategies):
#   * random - for run position s, the run of a randomly drawn strategy;
#   * oracle - for run position s, the run of the strategy with the lowest
#              final value (the value stored under the largest trajectory key).
# Both are registered in valid_pretty so that later cells pick them up.
#
# The keys are defined up front so that registering them after the task loop
# also works when `task_ids` is empty.
rand_mode = ("random", "random")
orac_mode = ("oracle", "oracle")
for horizon in res_dc:
    # NOTE(review): no horizon 600 is defined in valid_pretty in this notebook;
    # confirm whether this guard is still needed.
    if horizon == 600:
        continue
    # Re-seeded per horizon so the random draws are reproducible.
    rng = np.random.RandomState(1)
    for tid in task_ids:
        runs = res_dc[horizon][tid]
        # The synthetic keys start with a string, so they are never options
        # themselves, even when this cell is executed a second time.
        options = [k for k in runs if k[0] is None]

        # random
        choices = rng.choice(len(options), 10)
        runs[rand_mode] = [runs[options[c]][s] for s, c in enumerate(choices)]

        # oracle
        runs[orac_mode] = []
        for s in range(10):
            final_values = [runs[o][s][max(runs[o][s])] for o in options]
            best = np.argmin(final_values)
            runs[orac_mode].append(runs[options[best]][s])
    valid_pretty[horizon][orac_mode] = "oracle"
    valid_pretty[horizon][rand_mode] = "random"

In [None]:
# Load task metadata from OpenML and the normalization constants from disk - takes some time.
# `tasks` is looked up by task id further down (e.g. tasks.loc[tid, 'name']);
# `task_ids_sorted_by_num_features` is the task order used by all later cells.
tasks, task_ids_sorted_by_num_features = load_from_openml(task_ids)
# Per-task constants: min_diff_dc[tid][0] is subtracted from a loss and the
# result is divided by min_diff_dc[tid][1] (see table_per_dataset below).
min_diff_dc = get_normalization_constants(dir_, load=True)
# Show the task table as the cell output.
tasks

In [None]:
# One figure per task: median loss trajectory of every mode together with its
# inter-quartile band, for the 60 horizon.
HORIZON = 60
for tid in task_ids_sorted_by_num_features:
    plt.figure(figsize=[16, 12])
    colors = itertools.cycle(style_dc["colors"] + ['yellow', 'grey'])
    for mode in list(valid_pretty[HORIZON].keys()):
        c = next(colors)
        # Rows are runs, columns the sorted trajectory keys; forward-fill so
        # every run has a value at every key once it reported its first one.
        tmp = pd.DataFrame(res_dc[HORIZON][tid][mode]).sort_index(axis=1).ffill(axis=1)
        med = tmp.median(axis=0)
        # Extend each curve to the end of the horizon (HORIZON*60 on the x-axis).
        med.loc[HORIZON*60] = med.iloc[-1]
        low = tmp.quantile(0.25, axis=0)
        low.loc[HORIZON*60] = low.iloc[-1]
        up = tmp.quantile(0.75, axis=0)
        up.loc[HORIZON*60] = up.iloc[-1]
        plt.plot(med.index, med.to_numpy(),
                 label=valid_pretty[HORIZON][mode], linewidth=style_dc["linewidth"], color=c)
        # Pass the line colour explicitly: without it the band is coloured from
        # matplotlib's default cycle and does not match its median line.
        plt.fill_between(med.index, low, up, alpha=0.3, color=c)
    plt.title('Name: %s (%d), #instances: %d, #attributes: %d' % (
        tasks.loc[tid, 'name'], tid, tasks.loc[tid, 'NumberOfInstances'], tasks.loc[tid, 'NumberOfFeatures']))
    plt.legend(fontsize=style_dc["fontsize"])
    plt.xticks(fontsize=style_dc["fontsize"])
    plt.yticks(fontsize=style_dc["fontsize"])
    plt.xlabel("runtime in seconds", fontsize=style_dc["fontsize"])
    plt.ylabel("balanced error rate", fontsize=style_dc["fontsize"])
    # Zoom in on the lowest 30% of the automatically chosen y-range.
    plt.ylim([plt.ylim()[0], plt.ylim()[0] + 0.3*(plt.ylim()[1]-plt.ylim()[0])])
    plt.yscale("log")
    plt.show()

In [None]:
# Plot the BER averaged over all datasets, one curve per registered mode,
# for the 60 horizon.
HORIZON = 60
model_list = list(valid_pretty[HORIZON])
average_plot(
    model_list=model_list,
    res_dc=res_dc,
    valid_pretty=valid_pretty,
    horizon=HORIZON,
    task_ids_sorted_by_num_features=task_ids_sorted_by_num_features,
    min_diff_dc=min_diff_dc,
    figsize=(20, 10),
)
plt.yscale("log")
plt.show()

In [None]:
# Assume we have either all or no models
# Collect, per horizon, every registered mode; then build the data for the
# final table and run the Wilcoxon test on it (the oracle is excluded there).
horizon_list = sorted(valid_pretty)
model_list = {}
for h in horizon_list:
    model_list[h] = list(valid_pretty[h])

tab_data, stat_test_data = collect_data_for_final_table(
    model_list, res_dc, valid_pretty, horizon_list,
    task_ids_sorted_by_num_features, min_diff_dc,
)
not_different = do_wilcoxon_test(
    stat_test_data, model_list, horizon_list, valid_pretty, exclude=["oracle",],
)

In [None]:
# Emit the final table as LaTeX, followed by the outcome of the Wilcoxon test.
df = pd.DataFrame(tab_data)
# Disabled: additional per-horizon rank columns.
#for horizon in horizon_list:
#    df['Rank_%s' % horizon] = df[horizon].rank(method='average', ascending=True)
print(df.to_latex(), not_different, sep="\n")

In [None]:
# Rank plot for the 60 horizon. Only modes whose first key element is set and
# whose second key element does not contain "oracle" take part.
HORIZON = 60
model_list = [
    m for m in valid_pretty[HORIZON]
    if m[0] is not None and "oracle" not in m[1]
]
rank(
    model_list, res_dc, valid_pretty, HORIZON, task_ids_sorted_by_num_features,
    n_iter=100, steplength=10, figsize=(20, 10),
)

In [None]:
def table_per_dataset(model_list, res_dc, valid_pretty, horizon,
                      task_ids_sorted_by_num_features, min_diff_dc, n_seeds=10):
    """Build a table of mean normalized final losses per task and mode.

    Parameters
    ----------
    model_list : iterable
        Mode keys to include; each must be a key of ``valid_pretty[horizon]``.
    res_dc : dict
        ``res_dc[horizon][tid][mode]`` is a list of ``{x: loss}`` dicts, one
        per run.
    valid_pretty : dict
        ``valid_pretty[horizon][mode]`` is the label used as column name.
    horizon : hashable
        Horizon to select in ``res_dc`` and ``valid_pretty``.
    task_ids_sorted_by_num_features : iterable
        Task ids; they become the rows of the table, in this order.
    min_diff_dc : dict
        ``min_diff_dc[tid][0]`` is subtracted from every final loss and the
        result is divided by ``min_diff_dc[tid][1]``.
    n_seeds : int, optional
        Number of runs expected per (task, mode). Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        One row per task id, one column per mode label. If two modes share a
        label, the later one in ``model_list`` wins.

    Raises
    ------
    AssertionError
        If a mode is unknown for ``horizon`` or a (task, mode) pair does not
        have exactly ``n_seeds`` runs.
    """
    tab_data = {}

    for mode in model_list:
        assert mode in valid_pretty[horizon], (mode, valid_pretty[horizon].keys())
        # Use label, not actual key
        label = valid_pretty[horizon][mode]
        per_task = {}
        for tid in task_ids_sorted_by_num_features:
            # Rows are runs, columns the sorted trajectory keys. After the
            # forward fill the last column holds each run's final loss.
            final_loss = (
                pd.DataFrame(res_dc[horizon][tid][mode])
                .sort_index(axis=1)
                .ffill(axis=1)
                .iloc[:, -1]
            )
            assert final_loss.shape == (n_seeds, ), (
                "expected %d runs for task %r and mode %r, found shape %r"
                % (n_seeds, tid, mode, final_loss.shape)
            )
            normalized = (final_loss - min_diff_dc[tid][0]) / min_diff_dc[tid][1]
            per_task[tid] = normalized.mean()
        tab_data[label] = per_task
    return pd.DataFrame(tab_data)

# Per-dataset table for the 10 horizon, restricted to the modes whose second
# key element is set.
h = 10
horizon_list = sorted(valid_pretty)
model_list = [m for m in valid_pretty[h] if m[1]]

tab_data = table_per_dataset(
    model_list=model_list,
    res_dc=res_dc,
    valid_pretty=valid_pretty,
    horizon=h,
    task_ids_sorted_by_num_features=task_ids_sorted_by_num_features,
    min_diff_dc=min_diff_dc,
)

In [None]:
# Per-task values of the full selector next to its no-fallback variant.
print(tab_data.loc[:, ['Selector (PORT+BO+ENS)', 'Selector (PORT+BO+ENS, no fallback)']])

In [None]:
# Count on how many tasks each competitor scores lower ('<') or higher ('>')
# than the full selector; one number is printed per listed comparison.
reference = tab_data['Selector (PORT+BO+ENS)']
comparisons = [
    ('Selector (PORT)', '<'),
    ('Selector (PORT)', '>'),
    ('Single best (PORT+BO+ENS)', '>'),
    ('Single best (PORT+BO+ENS)', '<'),
    ('Selector (PORT+BO+ENS, no fallback)', '>'),
    ('Selector (PORT+BO+ENS, no fallback)', '<'),
]
for column, relation in comparisons:
    candidate = tab_data[column]
    if relation == '<':
        hits = candidate < reference
    else:
        hits = candidate > reference
    print(np.sum(hits))