In [1]:
import glob
import os
import sys

automlbenchmark_path = ".."
amlb_dir = os.path.realpath(os.path.expanduser(automlbenchmark_path))
amlb_reports_dir = os.path.join(amlb_dir, 'reports')
for lib in [amlb_dir, amlb_reports_dir]:
    sys.path.insert(0, lib)

In [2]:
import numpy as np
import pandas as pd

In [3]:
%run ./reports_config.py

In [4]:
from report import draw_score_heatmap, draw_score_parallel_coord, draw_score_pointplot, draw_score_stripplot, draw_score_barplot\
                  ,prepare_results, render_leaderboard, render_metadata, render_summary
from report.config import *
from report.util import create_file, display
from report.visualizations.util import register_colormap, render_colormap, savefig

# Comparisons

## Compare custom runs

Here comparing results for one fold

In [5]:
constraint = "1h8c"
results_dir = "."
output_dir = "."

included_frameworks = []
excluded_frameworks = []
frameworks_sort_key = None
# frameworks_sort_key = lambda f: definitions[f]['key'] if 'key' in definitions[f] else f.lower()
frameworks_labels = None
# frameworks_labels = lambda l: definitions[l]['framework'].lower()
duplicates_handling = 'fail' # accepted values: 'fail', 'keep_first', 'keep_last', 'keep_none'
imputation = None
normalization = None
# normalization = (0, 'h2o', 'mean')
row_filter = None
# row_filter = lamdba r: r.fold == 0     #! r is a pd.Series
title_extra = ""
# register_colormap(config.colormap, ('colorblind', [1, 0, 2, 3, 4, 5]))

In [6]:
# this cell  is an example showing how to use/customize this notebook depending on your results

results_dir = "../results"
output_dir = "./tmp"
duplicates_handling = 'keep_last'
normalization = (0, 'constantpredictor', 'mean')
row_filter = lambda r: ~r.task.isin(['kddcup09_appetency', 'colleges'])

definitions = dict(
    constantpredictor=dict(
        ref = True,
        framework='constantpredictor_enc',
        results_files=glob.glob(f"{results_dir}/constantpredictor*/scores/results.csv")
    ),
    autogluon=dict(
        framework='AutoGluon',
        results_files=glob.glob(f"{results_dir}/autogluon*/scores/results.csv")
    ),
    autosklearn=dict(
        framework='autosklearn',
        results_files=glob.glob(f"{results_dir}/autosklearn*/scores/results.csv")
    ),
    h2oautoml=dict(
        framework='H2OAutoML',
        results_files=glob.glob(f"{results_dir}/h2oautoml*/scores/results.csv")
    ),
    tpot=dict(
        framework='TPOT',
        results_files=glob.glob(f"{results_dir}/tpot*/scores/results.csv")
    )
)

definitions

{'constantpredictor': {'ref': True,
  'framework': 'constantpredictor_enc',
  'results_files': ['../results/constantpredictor_enc.small.test.local.20210125T224943/scores/results.csv',
   '../results/constantpredictor_enc.small.test.local.20210125T233618/scores/results.csv',
   '../results/constantpredictor_enc.openml_t_34537.test.local.20210125T235231/scores/results.csv',
   '../results/constantpredictor_enc.small.test.local.20210125T232708/scores/results.csv',
   '../results/constantpredictor_enc.small.test.local.20210125T233030/scores/results.csv',
   '../results/constantpredictor_enc.validation.test.local.20210119T220812/scores/results.csv',
   '../results/constantpredictor_enc.test.test.local.20210119T213552/scores/results.csv',
   '../results/constantpredictor_enc.small.test.local.20210125T232506/scores/results.csv',
   '../results/constantpredictor_enc.openml_t_34537.1h8c.local.20210125T235309/scores/results.csv',
   '../results/constantpredictor_enc.small.test.local.20210125T225

In [7]:
runs = {k:v for k, v in definitions.items() 
        if (k in included_frameworks if included_frameworks else True) 
        and k not in excluded_frameworks}
runs

{'constantpredictor': {'ref': True,
  'framework': 'constantpredictor_enc',
  'results_files': ['../results/constantpredictor_enc.small.test.local.20210125T224943/scores/results.csv',
   '../results/constantpredictor_enc.small.test.local.20210125T233618/scores/results.csv',
   '../results/constantpredictor_enc.openml_t_34537.test.local.20210125T235231/scores/results.csv',
   '../results/constantpredictor_enc.small.test.local.20210125T232708/scores/results.csv',
   '../results/constantpredictor_enc.small.test.local.20210125T233030/scores/results.csv',
   '../results/constantpredictor_enc.validation.test.local.20210119T220812/scores/results.csv',
   '../results/constantpredictor_enc.test.test.local.20210119T213552/scores/results.csv',
   '../results/constantpredictor_enc.small.test.local.20210125T232506/scores/results.csv',
   '../results/constantpredictor_enc.openml_t_34537.1h8c.local.20210125T235309/scores/results.csv',
   '../results/constantpredictor_enc.small.test.local.20210125T225

In [8]:
def results_as_df(results_dict, row_filter=None):
    def apply_filter(res, filtr):
        r = res.results
        return r.loc[filtr(r)]

    if row_filter is None:
        row_filter = lambda r: True

    return pd.concat([apply_filter(res, lambda r: (r.framework==name) & row_filter(r)) 
                      for name, res in results_dict.items() 
                      if res is not None])  

In [9]:
ref_results = {name: prepare_results(run['results_files'], 
                                     renamings={run['framework']: name},
                                     exclusions=excluded_frameworks,
                                     normalization=normalization,
                                     duplicates_handling=duplicates_handling,
                                     ) 
               for name, run in runs.items() if runs[name].get('ref', False)}

Duplicate entries:
                                                          id constraint  \
task             fold framework                                           
kc1              0    constantpredictor    openml.org/t/3917       test   
                      constantpredictor    openml.org/t/3917       test   
phishingwebsites 0    constantpredictor   openml.org/t/34537       test   
                      constantpredictor   openml.org/t/34537       1h8c   
                 1    constantpredictor   openml.org/t/34537       test   
                      constantpredictor   openml.org/t/34537       1h8c   
segment          0    constantpredictor  openml.org/t/146822       test   
                      constantpredictor  openml.org/t/146822       test   
                      constantpredictor  openml.org/t/146822       test   
                      constantpredictor  openml.org/t/146822       test   
                      constantpredictor  openml.org/t/146822       test   
      

In [10]:
metadata = next(res for res in ref_results.values()).metadata

In [11]:
problem_types = pd.DataFrame(m.__dict__ for m in metadata.values())['type'].unique().tolist()

In [12]:
render_metadata(metadata, 
                filename=create_file(output_dir, "datasets", results_group, "metadata.csv"))

Unnamed: 0,task,name,type,dataset,nrows,nfeatures,nclasses,class_imbalance
8,openml.org/t/168868,APSFailure,binary,openml.org/d/41138,76000,171,2,54.272727
12,openml.org/t/146818,Australian,binary,openml.org/d/40981,690,15,2,1.247557
2,openml.org/t/9910,Bioresponse,binary,openml.org/d/4134,3751,1777,2,1.184624
5,openml.org/t/167125,Internet-Advertisements,binary,openml.org/d/40978,3279,1559,2,6.143791
1,openml.org/t/34537,PhishingWebsites,binary,openml.org/d/4534,11055,31,2,1.257044
13,openml.org/t/10101,blood-transfusion-service-center,binary,openml.org/d/1464,748,5,2,3.202247
14,openml.org/t/146821,car,multiclass,openml.org/d/40975,1728,7,4,18.615385
11,openml.org/t/2295,cholesterol,regression,openml.org/d/204,303,14,0,
15,openml.org/t/168908,christine,binary,openml.org/d/41142,5418,1637,2,1.0
16,openml.org/t/9981,cnae-9,multiclass,openml.org/d/1468,1080,857,9,1.0


In [13]:
all_ref_res = results_as_df(ref_results, row_filter)

In [14]:
runs_results = {name: prepare_results(run['results_files'], 
                                      renamings={run['framework']: name},
                                      exclusions=excluded_frameworks,
                                      imputation=imputation,
                                      normalization=normalization,
                                      ref_results=all_ref_res,
                                      duplicates_handling=duplicates_handling
                                      ) 
                for name, run in runs.items() if name not in ref_results}

Duplicate entries:
                                          id constraint    result   metric  \
task     fold framework                                                      
colleges 1    h2oautoml  openml.org/t/359942    testall       NaN     rmse   
              h2oautoml  openml.org/t/359942    testall       NaN     rmse   
              h2oautoml  openml.org/t/359942    testall       NaN     rmse   
              h2oautoml  openml.org/t/359942    testall       NaN     rmse   
              h2oautoml  openml.org/t/359942    testall       NaN     rmse   
              h2oautoml  openml.org/t/359942    testall       NaN     rmse   
              h2oautoml  openml.org/t/359942    testall       NaN     rmse   
              h2oautoml  openml.org/t/359942    testall       NaN     rmse   
iris     0    h2oautoml      openml.org/t/59       test  0.111617  logloss   
              h2oautoml      openml.org/t/59       test  0.216605  logloss   
              h2oautoml      openml.org/t/59 

In [15]:
all_res = pd.concat([
    all_ref_res, 
    results_as_df(runs_results, row_filter)
])

In [16]:
res_summary = render_summary('result', 
                             results=all_res)
res_summary.to_csv(create_file(output_dir, "tables", "results_summary.csv"))

Unnamed: 0_level_0,framework,constantpredictor,h2oautoml
type,task,Unnamed: 2_level_1,Unnamed: 3_level_1
binary,apsfailure,0.5,
binary,australian,0.5,
binary,bioresponse,0.5,
binary,blood-transfusion,0.5,
binary,christine,0.5,
binary,credit-g,0.5,
binary,dresses-sales,0.5,
binary,internet-advertisements,0.5,
binary,jasmine,0.5,
binary,kc1,0.5,


In [17]:
score_summary = render_summary('score', 
                               results=all_res)
score_summary.to_csv(create_file(output_dir, "tables", "score_summary.csv"))

Unnamed: 0_level_0,framework,constantpredictor,h2oautoml
type,task,Unnamed: 2_level_1,Unnamed: 3_level_1
binary,apsfailure,0.5,
binary,australian,0.5,
binary,bioresponse,0.5,
binary,blood-transfusion,0.5,
binary,christine,0.5,
binary,credit-g,0.5,
binary,dresses-sales,0.5,
binary,internet-advertisements,0.5,
binary,jasmine,0.5,
binary,kc1,0.5,


In [18]:
models_summary = render_summary('models_count', 
                                results=all_res)
models_summary.to_csv(create_file(output_dir, "tables", "models_summary.csv"))

Unnamed: 0_level_0,framework,constantpredictor,h2oautoml
type,task,Unnamed: 2_level_1,Unnamed: 3_level_1
binary,apsfailure,1,
binary,australian,1,
binary,bioresponse,1,
binary,blood-transfusion,1,
binary,christine,1,
binary,credit-g,1,
binary,dresses-sales,1,
binary,internet-advertisements,1,
binary,jasmine,1,
binary,kc1,1,


In [19]:
if normalization:
    norm_score_summary = render_summary('norm_score', 
                                        results=all_res)
    norm_score_summary.to_csv(create_file(output_dir, "tables", "normalized_score_summary.csv"))

Unnamed: 0_level_0,framework,constantpredictor,h2oautoml
type,task,Unnamed: 2_level_1,Unnamed: 3_level_1
binary,apsfailure,1,
binary,australian,1,
binary,bioresponse,1,
binary,blood-transfusion,1,
binary,christine,1,
binary,credit-g,1,
binary,dresses-sales,1,
binary,internet-advertisements,1,
binary,jasmine,1,
binary,kc1,1,


In [20]:
benchmark_leaderboard = render_leaderboard('score', 
                                           results=all_res,
                                           aggregate=True)
benchmark_leaderboard.to_csv(create_file(output_dir, "tables", "benchmark_leaderboard.csv"))

Unnamed: 0_level_0,framework,constantpredictor,h2oautoml
type,task,Unnamed: 2_level_1,Unnamed: 3_level_1
binary,apsfailure,1,
binary,australian,1,
binary,bioresponse,1,
binary,blood-transfusion,1,
binary,christine,1,
binary,credit-g,1,
binary,dresses-sales,1,
binary,internet-advertisements,1,
binary,jasmine,1,
binary,kc1,1,


In [21]:
if 'binary' in problem_types:
    fig = draw_score_heatmap('score',
                             results=all_res,
                             type_filter='binary', 
                             metadata=metadata,
                             x_labels=frameworks_labels or True,
                             x_sort_by=frameworks_sort_key,
                             y_sort_by='nrows',
                             title=f"Scores ({binary_score_label}) on {results_group} binary classification problems{title_extra}",
                             center=0.5
                            );
    savefig(fig, create_file(output_dir, "visualizations", "binary_score_heat.png"))

TypeError: object of type 'NoneType' has no len()

In [None]:
if 'multiclass' in problem_types:
    fig = draw_score_heatmap('score', 
                             results=all_res,
                             type_filter='multiclass', 
                             metadata=metadata,
                             x_labels=frameworks_labels,
                             x_sort_by=frameworks_sort_key,
                             y_sort_by='nrows',
                             title=f"Scores ({multiclass_score_label}) on {results_group} multi-class classification problems{title_extra}",
                             center=0
                            );
    savefig(fig, create_file(output_dir, "visualizations", "multiclass_score_heat.png"))

In [None]:
if 'regression' in problem_types:
    fig = draw_score_heatmap('score', 
                             results=all_res,
                             type_filter='regression', 
                             metadata=metadata,
                             x_labels=frameworks_labels,
                             x_sort_by=frameworks_sort_key,
                             y_sort_by='nrows',
                             title=f"Scores ({regression_score_label}) on {results_group} regression problems{title_extra}",
                             center=0
                            );
    savefig(fig, create_file(output_dir, "visualizations", "regression_score_heat.png"))

In [None]:
render_colormap(config.colormap)

In [None]:
if 'binary' in problem_types:
    fig = draw_score_barplot('score',
                             results=all_res,
                             type_filter='binary', 
                             metadata=metadata,
                             x_sort_by=tasks_sort_by,
                             ylabel=binary_score_label,
                             ylim=dict(bottom=.5),
                             hue_sort_by=frameworks_sort_key, 
                             ci=95,
                             title=f"Scores ({binary_score_label}) on {results_group} binary classification problems{title_extra}",
                             legend_loc='lower center',
                             legend_labels=frameworks_labels,
                            );
    savefig(fig, create_file(output_dir, "visualizations", "binary_score_barplot.png"))

In [None]:
if 'multiclass' in problem_types:
    fig = draw_score_barplot('score',
                             results=all_res,
                             type_filter='multiclass', 
                             metadata=metadata,
                             x_sort_by=tasks_sort_by,
                             ylabel=multiclass_score_label,
                             ylim=dict(top=0.1),
                             hue_sort_by=frameworks_sort_key,
                             ci=95,
                             title=f"Scores ({multiclass_score_label}) on {results_group} multiclass classification problems{title_extra}",
                             legend_loc='lower center',
                             legend_labels=frameworks_labels,
                            );
    savefig(fig, create_file(output_dir, "visualizations", "multiclass_score_barplot.png"))

In [None]:
if 'regression' in problem_types:
    fig = draw_score_barplot('score',
                             results=all_res,
                             type_filter='regression', 
                             metadata=metadata,
                             x_sort_by=tasks_sort_by,
                             yscale='symlog',
                             ylabel=regression_score_label,
                             ylim=dict(top=0.1),
                             hue_sort_by=frameworks_sort_key, 
                             ci=95,
                             title=f"Scores ({regression_score_label}) on {results_group} regression classification problems{title_extra}",
                             legend_loc='lower center',
                             legend_labels=frameworks_labels,
                             size=(8, 6),
                            );
    savefig(fig, create_file(output_dir, "visualizations", "regression_score_barplot.png"))

In [None]:
if 'binary' in problem_types:
    fig = draw_score_pointplot('score',
                               results=all_res,
                               type_filter='binary', 
                               metadata=metadata,
                               x_sort_by=tasks_sort_by,
                               ylabel=binary_score_label,
                               ylim=dict(bottom=.5),
                               hue_sort_by=frameworks_sort_key,
                               join='none', marker='hline_xspaced', ci=95, 
                               title=f"Scores ({binary_score_label}) on {results_group} binary classification problems{title_extra}",
                               legend_loc='lower center',
                               legend_labels=frameworks_labels,
                              );
    savefig(fig, create_file(output_dir, "visualizations", "binary_score_pointplot.png"))

In [None]:
if 'multiclass' in problem_types:
    fig = draw_score_pointplot('score',
                               results=all_res,
                               type_filter='multiclass', 
                               metadata=metadata,
                               x_sort_by=tasks_sort_by,
                               ylabel=multiclass_score_label,
                               hue_sort_by=frameworks_sort_key,
                               join='none', marker='hline_xspaced', ci=95, 
                               title=f"Scores ({multiclass_score_label}) on {results_group} multiclass classification problems{title_extra}",
                               legend_loc='lower center',
                               legend_labels=frameworks_labels),
                              );
    savefig(fig, create_file(output_dir, "visualizations", "multiclass_score_pointplot.png"))

In [None]:
if 'regression' in problem_types:
    fig = draw_score_pointplot('score',
                               results=all_res,
                               type_filter='regression', 
                               metadata=metadata,
                               x_sort_by=tasks_sort_by,
                               ylabel=regression_score_label,
                               yscale='symlog',
                               ylim=dict(top=0.1),
                               hue_sort_by=frameworks_sort_key,
                               join='none', marker='hline_xspaced', ci=95, 
                               title=f"Scores ({regression_score_label}) on {results_group} regression classification problems{title_extra}",
                               legend_loc='lower center',
                               legend_labels=frameworks_labels,
                               size=(8, 6),
                              );
    savefig(fig, create_file(output_dir, "visualizations", "regression_score_pointplot.png"))

In [None]:
if 'binary' in problem_types:
    fig = draw_score_stripplot('score', 
                               results=all_res.sort_values(by=['framework']),
                               type_filter='binary', 
                               metadata=metadata,
                               xlabel=binary_score_label,
                               y_sort_by=tasks_sort_by,
                               hue_sort_by=frameworks_sort_key,
                               title=f"Scores ({binary_score_label}) on {results_group} binary classification problems{title_extra}",
                               legend_labels=frameworks_labels,
                              );
    savefig(fig, create_file(output_dir, "visualizations", "binary_score_stripplot.png"))

In [None]:
if 'multiclass' in problem_types:
    fig = draw_score_stripplot('score', 
                               results=all_res.sort_values(by=['framework']),
                               type_filter='multiclass', 
                               metadata=metadata,
                               xlabel=multiclass_score_label,
                               xscale='symlog',
                               y_sort_by=tasks_sort_by,
                               hue_sort_by=frameworks_sort_key,
                               title=f"Scores ({multiclass_score_label}) on {results_group} multi-class classification problems{title_extra}",
                               legend_labels=frameworks_labels,
                              );
    savefig(fig, create_file(output_dir, "visualizations", "multiclass_score_stripplot.png"))

In [None]:
if 'regression' in problem_types:
    fig = draw_score_stripplot('score', 
                               results=all_res,
                               type_filter='regression', 
                               metadata=metadata,
                               xlabel=regression_score_label,
                               xscale='symlog',
                               y_sort_by=tasks_sort_by,
                               hue_sort_by=frameworks_sort_key,
                               title=f"Scores ({regression_score_label}) on {results_group} regression problems{title_extra}",
                               legend_labels=frameworks_labels,
                              );
    savefig(fig, create_file(output_dir, "visualizations", "regression_score_stripplot.png"))

In [None]:
if 'binary' in problem_types and normalization:
    fig = draw_score_stripplot('norm_score', 
                               results=all_res,
                               type_filter='binary', 
                               metadata=metadata,
                               xlabel=f"rel. {binary_score_label}",
                               y_sort_by='nrows',
                               hue_sort_by=frameworks_sort_key,
                               title=f"Relative scores ({binary_score_label}) on {results_group} binary classification problems{title_extra}",
                               legend_labels=frameworks_labels,
                              );
    savefig(fig, create_file(output_dir, "visualizations", "binary_rel_score_stripplot.png"))

In [None]:
if 'multiclass' in problem_types and normalization:
    fig = draw_score_stripplot('norm_score', 
                               results=all_res,
                               type_filter='multiclass', 
                               metadata=metadata,
                               xlabel=f"rel. {multiclass_score_label}",
                               xscale='symlog',
                               y_sort_by='nrows',
                               hue_sort_by=frameworks_sort_key,
                               title=f"Relative scores ({multiclass_score_label}) on {results_group} multi-class classification problems{title_extra}",
                               legend_labels=frameworks_labels,
                              );
    savefig(fig, create_file(output_dir, "visualizations", "multiclass_rel_score_stripplot.png"))

In [None]:
if 'regression' in problem_types and normalization:
    fig = draw_score_stripplot('norm_score', 
                               results=all_res,
                               type_filter='regression', 
                               metadata=metadata,
                               xlabel=f"rel. {regression_score_label}",
                               y_sort_by='nrows',
                               hue_sort_by=frameworks_sort_key,
                               title=f"Relative scores ({regression_score_label}) on {results_group} regression problems{title_extra}",
                               legend_labels=frameworks_labels,
                              );
    savefig(fig, create_file(output_dir, "visualizations", "regression_rel_score_stripplot.png"))