In [None]:
import collections
import matplotlib.pyplot as plt
import getpass
import glob
import itertools
import json
import numpy as np
import os
import openml
import pandas as pd
import scipy.stats

from plot_utils.style import style_dc
from plot_utils.common_plots import rank, average_plot
from plot_utils.common_tables import collect_data_for_final_table, do_wilcoxon_test
from plot_utils.common_loading import load_from_openml

import sys
sys.path.append("/home/eggenspk/Work/Project/2020_PoSH_Autosklearn/2020_IEEE_Autosklearn_experiments/experiment_scripts/")
sys.path.append("/home/feurerm/sync_dir/projects/2020_posh/2020_IEEE_Autosklearn_experiments/experiment_scripts")
from utils import openml_automl_benchmark, get_normalization_constants

In [None]:
# Resolve the root directory that holds the experiment results for the user
# running this notebook. The paths are machine-specific; to run the notebook
# elsewhere, add an entry for your own login name.
username = getpass.getuser()
known_result_dirs = {
    'eggenspk': "/media/eggenspk/04a9389c-b7e2-474a-a9de-c66d5345f407/2020_posh/",
    'feurerm': "/home/feurerm/projects/2020_posh/",
}
if username not in known_result_dirs:
    # Same exception type as the plain dict lookup would raise, but with a
    # message that tells the reader what to change.
    raise KeyError(
        "No result directory configured for user %r; add an entry to "
        "known_result_dirs (configured users: %s)"
        % (username, ", ".join(sorted(known_result_dirs)))
    )
dir_ = known_result_dirs[username]

def _build_valid_pretty():
    """Build the lookup ``{horizon: {(strategy, run_prefix): label}}``.

    For each horizon (10 and 60) the entries are generated in this order:
    strategy ('portfolio', 'bo', 'knd'), then resampling (holdout, 3CV, 5CV,
    10CV), then budget allocation (``None`` first, ``SH-eta4-i`` second).
    Labels are the resampling name, prefixed with "SH; " for the SH variant.
    Insertion order matters because downstream cells iterate over the keys.
    """
    table = {}
    for horizon in (10, 60):
        # The portfolio runs live one directory level deeper; that extra
        # component is horizon * 6 (60 for the 10MIN runs, 360 for 60MIN).
        experiment_dirs = (
            ('portfolio', "ASKL_run_with_portfolio/%d" % (horizon * 6)),
            ('bo', "ASKL_automldata"),
            ('knd', "ASKL_automldata_w_ensemble_w_knd"),
        )
        entries = {}
        for strategy, experiment_dir in experiment_dirs:
            for resampling in ("holdout", "3CV", "5CV", "10CV"):
                for budget, label_prefix in (("None", ""), ("SH-eta4-i", "SH; ")):
                    run_prefix = "%dMIN/%s/RF/RF_%s_%s_iterative_es_if" % (
                        horizon, experiment_dir, budget, resampling)
                    entries[(strategy, run_prefix)] = label_prefix + resampling
        table[horizon] = entries
    return table


valid_pretty = _build_valid_pretty()

task_ids = openml_automl_benchmark

# Suffix appended to "<model_name>_<tid>_<seed>" to form the run directory,
# keyed by the strategy tag; tags not listed here use "_0_0".
# NOTE(review): no key in valid_pretty carries the tag "auto", so that entry
# is never selected with the current table -- confirm whether it is still needed.
run_dir_suffix = {"auto": "", "knd": "_25_0"}

# res_dc[horizon][tid][mode] -> list with one {trajectory[i][0]: trajectory[i][1]}
# dict per result file that was found. Seeds without a result file are skipped,
# so a list position does not necessarily equal the seed number.
res_dc = {}
# per_cf_dc only receives empty per-task dicts; the code that would fill it
# is parked in the string literal further down.
per_cf_dc = {}
miss = 0
for horizon in list(valid_pretty.keys()):
    res_dc[horizon] = {}
    per_cf_dc[horizon] = {}
    for tid in task_ids:
        res_dc[horizon][tid] = {}
        per_cf_dc[horizon][tid] = {}
        for mode in list(valid_pretty[horizon].keys()):
            # Keys are (strategy tag, run prefix) tuples; a bare string key is
            # treated as a run prefix without a tag.
            if type(mode) == tuple:
                auto, model_name = mode
            else:
                auto, model_name = None, mode
            trajectories = []
            res_dc[horizon][tid][mode] = trajectories
            for seed in range(10):
                fl_tmpl = (dir_ + "/" + model_name + "_%d_%d" % (tid, seed)
                           + run_dir_suffix.get(auto, "_0_0") + "/result.json")
                matches = glob.glob(fl_tmpl)
                if not matches:
                    # Report the missing run and keep going; the total is
                    # printed once all files have been visited.
                    miss += 1
                    print(fl_tmpl)
                    continue
                with open(matches[0], "r") as fh:
                    result = json.load(fh)
                trajectories.append(
                    {step[0]: step[1] for step in result["0"]["trajectory"]})

            """
            # get the same information per config
            per_cf_dc[horizon][tid][mode] = []
            for seed in range(10):
                if auto == "auto":
                    fl_tmpl = dir_ + "/" + model_name + "_%d_%d/auto-sklearn-output/*/*/runhistory.json" % (tid, seed)
                elif 'knd':
                    fl_tmpl = dir_ + "/" + model_name + "_%d_%d_25_0/auto-sklearn-output/*/*/runhistory.json" % (tid, seed)
                else:
                    fl_tmpl = dir_ + "/" + model_name + "_%d_%d_0_0/auto-sklearn-output/*/*/runhistory.json" % (tid, seed)
                fl = glob.glob(fl_tmpl)               
                if len(fl) == 0:
                    if len(res_dc[horizon][tid][mode][seed]) == 1:
                        traj = dict([(i, 1.0) for i in range(32)])
                        per_cf_dc[horizon][tid][mode].append(traj)
                        continue
                    miss += 1
                    print(fl_tmpl)
                    continue
                fl = fl[0]
                with open(fl, "r") as fh:
                    line = json.load(fh)   
                    line = line["data"]
                    val_losses = []
                    test_losses = []
                    for i in range(len(line)):
                        try:
                            val_loss = line[i][1][0]
                            try:
                                # was this a crash?
                                test_loss = line[i][1][3]["test_loss"]
                            except:
                                test_loss = 1
                        except:
                            val_loss = val_losses[-1]
                            test_loss = test_losses[-1]
                        val_losses.append(val_loss)
                        test_losses.append(test_loss)
                    traj = [test_losses[0], ]
                    b = val_losses[0]
                    for v, t in zip(val_losses[1:], test_losses[1:]):
                        if v < b:
                            b = v
                            traj.append(t)
                        else:
                            traj.append(traj[-1])
                    traj = dict([(i+1, traj[i]) for i in range(len(line))])
                    per_cf_dc[horizon][tid][mode].append(traj)
            """

print("Missing %d entries" % miss)

In [None]:
# Load some stuff from disc and openml - takes some time
# `tasks` is indexed further down as tasks.loc[tid, <column>] (name,
# NumberOfInstances, NumberOfFeatures); presumably a pandas DataFrame keyed by
# task id -- confirm against plot_utils.common_loading.
# `task_ids_sorted_by_num_features` is the task order used by all later cells.
tasks, task_ids_sorted_by_num_features = load_from_openml(task_ids)
# min_diff_dc[tid] is used later as (offset, scale): scores are normalised as
# (value - min_diff_dc[tid][0]) / min_diff_dc[tid][1].
min_diff_dc = get_normalization_constants(dir_, load=True)
# Bare expression so the notebook renders the task overview.
tasks

In [None]:
# One figure per task: median trajectory (with 25%/75% quantile band) across
# seeds for the plain 3CV runs of every strategy.
HORIZON = 60
# Legend suffix per strategy tag.
strategy_label_suffix = {'knd': " knd", 'bo': " w/o", 'portfolio': " portf"}
for tid in task_ids_sorted_by_num_features:
    plt.figure(figsize=[8,6])
    colors = itertools.cycle(style_dc["colors"])
    for mode in list(valid_pretty[HORIZON].keys()):
        # Keep only 3CV runs that do not use SH.
        if "3CV" not in mode[1] or "SH" in mode[1]:
            continue
        # Draw the colour before any skip below so that a strategy keeps the
        # same colour in every figure.
        c = next(colors)
        #tmp = pd.DataFrame(per_cf_dc[HORIZON][tid][mode]).sort_index(axis=1).ffill(axis=1)
        # Rows are seeds, columns are the sorted trajectory keys; forward-fill
        # so every seed has a value at every column.
        tmp = pd.DataFrame(res_dc[HORIZON][tid][mode]).sort_index(axis=1).ffill(axis=1)
        if tmp.empty:
            # The loader tolerates missing result files, so a mode may have
            # no runs at all for this task; there is nothing to draw then.
            print("No results for task %d, mode %s - skipping" % (tid, str(mode)))
            continue
        med = tmp.median(axis=0)
        low = tmp.quantile(0.25, axis=0)
        up = tmp.quantile(0.75, axis=0)
        # Extend every curve to x = HORIZON*60 by repeating its last value.
        for curve in (med, low, up):
            curve.loc[HORIZON*60] = curve.iloc[-1]
        if mode[0] not in strategy_label_suffix:
            raise ValueError("Unknown strategy tag %r in mode %r" % (mode[0], mode))
        label = valid_pretty[HORIZON][mode] + strategy_label_suffix[mode[0]]
        # Use the colour drawn from the style palette for both the line and
        # its band; previously `c` was fetched but never passed on.
        plt.plot(med.index, med.to_numpy(), label=label, linewidth=3, color=c)
        plt.fill_between(med.index, low, up, alpha=0.3, color=c)
    plt.title('Name: %s (%d), #instances: %d, #attributes: %d' % (
        tasks.loc[tid, 'name'], tid, tasks.loc[tid, 'NumberOfInstances'], tasks.loc[tid, 'NumberOfFeatures']))
    plt.legend(fontsize=style_dc["fontsize"])
    # Zoom into the lowest 20% of the automatically chosen y-range.
    y_low, y_high = plt.ylim()
    plt.ylim([y_low, y_low + 0.2*(y_high - y_low)])
    #plt.xscale("log")
    plt.show()

In [None]:
# Plot average BER across all datasets
HORIZON = 10
# Restrict the comparison to the 10CV runs without SH, one per strategy.
model_list = [
    m for m in valid_pretty[HORIZON].keys()
    if '10CV' in m[1] and 'SH' not in m[1]
]
# Legend label: pretty resampling name followed by the strategy tag.
tmp_dc = {
    HORIZON: {m: valid_pretty[HORIZON][m] + " " + str(m[0]) for m in model_list}
}

average_plot(
    model_list=model_list,
    res_dc=res_dc,
    valid_pretty=tmp_dc,
    horizon=HORIZON,
    task_ids_sorted_by_num_features=task_ids_sorted_by_num_features,
    min_diff_dc=min_diff_dc,
)
plt.yscale("log")
plt.show()

In [None]:
# Plot average BER across all datasets
HORIZON = 60
# Restrict the comparison to the 10CV runs without SH, one per strategy.
model_list = [
    m for m in valid_pretty[HORIZON].keys()
    if '10CV' in m[1] and 'SH' not in m[1]
]
# Legend label: pretty resampling name followed by the strategy tag.
tmp_dc = {
    HORIZON: {m: valid_pretty[HORIZON][m] + " " + str(m[0]) for m in model_list}
}

average_plot(
    model_list=model_list,
    res_dc=res_dc,
    valid_pretty=tmp_dc,
    horizon=HORIZON,
    task_ids_sorted_by_num_features=task_ids_sorted_by_num_features,
    min_diff_dc=min_diff_dc,
)
plt.yscale("log")
plt.show()

In [None]:
# Generate final table
# tab_data:       {"<horizon> <strategy column>": {label: mean normalised score * 100}}
# stat_test_data: same layout, but holding the list of per-task scores that
#                 the significance test in a later cell consumes.
tab_data = {}

# Assume we have either all or no models
horizon_list = sorted(valid_pretty)
# model_list[h] is a list of (key, pretty label) pairs in table order.
model_list = {h: list(valid_pretty[h].items()) for h in horizon_list}

# Column-name suffix for each strategy tag found in the key's first element.
column_suffix = {'knd': " KND", 'bo': " bo", 'portfolio': " Portfolio"}

stat_test_data = {}
for horizon in horizon_list:
    for suffix in (" bo", " Portfolio", " KND"):
        tab_data[str(horizon) + suffix] = {}
    stat_test_data[horizon] = {}
    for suffix in (" bo", " Portfolio", " KND"):
        stat_test_data[str(horizon) + suffix] = {}

    for mode, _pretty in model_list[horizon]:
        assert mode in valid_pretty[horizon], (mode, valid_pretty[horizon].keys())

        # Per task: normalise the forward-filled trajectories, average over
        # seeds and keep the value in the last column.
        task_scores = []
        for tid in task_ids_sorted_by_num_features:
            offset, scale = min_diff_dc[tid][0], min_diff_dc[tid][1]
            runs = pd.DataFrame(res_dc[horizon][tid][mode]).sort_index(axis=1).ffill(axis=1)
            runs = (runs - offset) / scale
            task_scores.append(runs.mean().iloc[-1])

        # Per run position: mean over tasks of the normalised final value.
        # Only the disabled "STD" row below uses the result.
        seed_means = []
        for s in range(10):
            vals_for_this_seed = []
            for tid in task_ids_sorted_by_num_features:
                try:
                    last_key = sorted(list(res_dc[horizon][tid][mode][s].keys()))[-1]
                except IndexError:
                    # Fewer than s+1 runs for this task, or an empty trajectory.
                    continue
                final_value = res_dc[horizon][tid][mode][s][last_key]
                vals_for_this_seed.append(
                    (final_value - min_diff_dc[tid][0]) / min_diff_dc[tid][1])
            seed_means.append(np.mean(vals_for_this_seed))
        seed_means = np.array(seed_means)

        suffix = column_suffix.get(mode[0])
        if suffix is None:
            raise ValueError()
        column = str(horizon) + suffix
        # Rows are keyed by the pretty label, not by the raw key.
        row = valid_pretty[horizon][mode]
        tab_data[column][row] = np.round(np.mean(task_scores)*100, 2)
        stat_test_data[column][row] = task_scores
        #tab_data["STD %s" % horizon][valid_pretty[horizon][mode]] = np.round(np.std(seed_means*100), 2)

In [None]:
# Show the final table with a fixed column order, first as plain text and
# then as LaTeX source.
tab_data = pd.DataFrame(tab_data)
column_order = ['10 bo', '10 KND', '10 Portfolio',
                '60 bo', '60 KND', '60 Portfolio']
print(tab_data.columns)
ordered_table = tab_data[column_order]
print(ordered_table)
print(ordered_table.to_latex())

In [None]:
# For every resampling label, find the strategy (portfolio / bo / knd) with
# the lowest mean per-task score and run a one-sided Wilcoxon signed-rank test
# against the two others. Pairs for which the best strategy is NOT
# significantly better (p > 0.05) are collected in not_different[h] as
#   (label, best strategy, challenger strategy, p, best mean*100, challenger mean*100).
strategy_columns = (('portfolio', " Portfolio"), ('bo', " bo"), ('knd', " KND"))

not_different = {}
for h in horizon_list:
    not_different[h] = []
    for key, label in model_list[h]:
        # Every label exists once per strategy; anchor on the portfolio entry
        # so each label is handled exactly once.
        if key[0] != 'portfolio':
            continue

        # Per-task scores of the three strategies for this label.
        candidates = []
        for strategy, column in strategy_columns:
            scores = stat_test_data[str(h) + column].get(label)
            assert scores is not None, (h, strategy, label)
            candidates.append((strategy, scores))
        means = [np.mean(scores) for _, scores in candidates]

        # The best strategy must have a strictly smaller mean than both
        # others; ties and NaN means leave no such strategy.
        strict_best = [
            i for i, mean_i in enumerate(means)
            if all(mean_i < mean_j for j, mean_j in enumerate(means) if j != i)
        ]
        if len(strict_best) != 1:
            raise ValueError(
                "No unique best strategy for %r at horizon %s (means: %s)"
                % (label, h, means))
        opt_strategy, opt_scores = candidates[strict_best[0]]

        for idx, (chal_strategy, chal_scores) in enumerate(candidates):
            if idx == strict_best[0]:
                continue
            # H1: the best strategy's scores are smaller than the challenger's.
            _, p = scipy.stats.wilcoxon(x=opt_scores, y=chal_scores, alternative="less")
            if p > 0.05:
                not_different[h].append((label, opt_strategy, chal_strategy,
                                         p, np.mean(opt_scores)*100, np.mean(chal_scores)*100))

for h in horizon_list:
    print("Not different with %d mins:\n\t" % h, "\n\t".join(["%s: %s vs %s: %g (%g;%g)" % n for n in not_different[h]]))

In [None]:
# Do ranking plot
HORIZON = 60
# Only the holdout runs without SH take part in the ranking.
model_list = [
    m for m in valid_pretty[HORIZON].keys()
    if "holdout" in m[1] and "SH" not in m[1]
]
# Labels are built for every mode: pretty name plus the strategy tag, where
# tags shorter than four characters are upper-cased.
tmp_dc = {HORIZON: {}}
for m, pretty in valid_pretty[HORIZON].items():
    tag = m[0].upper() if len(m[0]) < 4 else m[0]
    tmp_dc[HORIZON][m] = pretty + " " + str(tag)

rank(model_list, res_dc, tmp_dc, HORIZON, task_ids_sorted_by_num_features, n_iter=200, steplength=5)