# Benchmark results reporting

## Setup

### Prerequisites
This notebook requires a kernel running Python 3.6+ (the code cells use f-strings).
You can skip this section if the kernel is already configured.

In [None]:
!pip install -r ./requirements.txt
#!pip install jupyter_contrib_nbextensions
#!jupyter contrib nbextension install --user
#!jupyter nbextension enable python-markdown/main
#!pip install jupyter_nbextensions_configurator
#!jupyter nbextensions_configurator enable --user

### Imports and selection of the results directory

In [None]:
import glob
import os
import sys

# Path to the automlbenchmark root directory (resolved against the current working directory).
automlbenchmark_path = ".."
# Expand `~` and resolve symlinks to get an absolute, canonical path.
amlb_dir = os.path.realpath(os.path.expanduser(automlbenchmark_path))
# Prepend the directory to the import path so that `amlb_report` can be imported below.
for lib in (amlb_dir,):
    sys.path[:0] = [lib]

In [None]:
import numpy as np
import pandas as pd

In [None]:
from amlb_report import draw_score_heatmap, draw_score_parallel_coord, draw_score_pointplot, draw_score_stripplot, draw_score_barplot\
                 , prepare_results, render_leaderboard, render_metadata, render_summary
from amlb_report.util import create_file, display
from amlb_report.visualizations.util import register_colormap, render_colormap, savefig
import amlb_report.config as config

## Results

#### Loading results, formatting and adding columns
- `result` is the raw result metric computed from predictions at the end of the benchmark.
    For classification problems, it is usually `auc` for binomial classification and `neg_logloss` for multinomial classification (higher is always better).
- `norm_result` is a normalization of `result` on a `[0, 1]` scale, with `{{normalization[0]}}` result as `0` and `{{normalization[1]}}` result as `1`.
- `imp_result` for imputed results. Given a task and a framework:
    - if **all folds results are missing**, then no imputation occurs, and the result is `nan` for each fold.
    - if **only some folds results are missing**, then the missing result can be imputed by setting `imputation='framework'` and using that framework to impute the result for this fold.

### Default config

In [None]:
#! avoid editing this cell: custom config should be applied in the next cell.

constraint = "1h8c"
# where the results files are looked up, and where the tables/visualizations are written
results_dir = "."
output_dir = "."

# value passed as `x_sort_by`/`y_sort_by` to order the tasks on bar, point and strip plots
tasks_sort_by = 'nrows'
# label inserted in the plot titles and in the path of the metadata file
results_group = ''

# keys of `definitions` to keep (an empty list keeps them all) and to drop when building `runs`;
# `excluded_frameworks` is also passed as `exclusions` to `prepare_results`
included_frameworks = []
excluded_frameworks = []
# passed as `hue_sort_by` (or `x_sort_by` for heatmaps) to order the frameworks on plots
frameworks_sort_key = None
# frameworks_sort_key = lambda f: definitions[f]['key'] if 'key' in definitions[f] else f.lower()
# passed as `legend_labels` (or `x_labels` for heatmaps) to label the frameworks on plots
frameworks_labels = None
# frameworks_labels = lambda l: definitions[l]['framework'].lower()
duplicates_handling = 'fail' # accepted values: 'fail', 'keep_first', 'keep_last', 'keep_none'
# both are forwarded to `prepare_results`; normalized summaries and plots are only produced when `normalization` is set
imputation = None
normalization = None
# normalization = (0, 'h2o', 'mean')
# optional filter applied by `results_as_df` on top of the per-framework selection
row_filter = None
# row_filter = lambda r: r.fold == 0     #! r is the whole results table of a run: the filter must return a row mask
# text appended at the end of each plot title
title_extra = ""
# metric labels used in plot titles and axis labels, per problem type
binary_result_label = 'AUC'
multiclass_result_label = 'neg. Log loss'
regression_result_label = 'neg. RMSE'

# register_colormap(config.colormap, ('colorblind', [1, 0, 2, 3, 4, 5]))

### Config and results definitions for current run

In [None]:
# Example configuration: adapt this cell to your own results.
config.nfolds = 1

results_dir = "../results"
output_dir = "./tmp"
duplicates_handling = 'keep_last'
normalization = (0, 'constantpredictor', 'mean')  # normalizes results between 0 and constantpredictor
# row_filter = lambda r: ~r.task.isin(['kddcup09_appetency', 'colleges'])

# One entry per reported framework, keyed by the name used in this report:
#   - `framework`: the framework name found in the results, renamed to the entry key when loading,
#   - `results`: the results files to load,
#   - `ref` (optional): flags the entry as reference results, prepared before the other ones.
definitions = {
    'constantpredictor': {
        'ref': True,
        'framework': 'constantpredictor_enc',
        'results': glob.glob(f"{results_dir}/constantpredictor*/scores/results.csv"),
    },
    'autogluon': {
        'framework': 'AutoGluon',
        'results': glob.glob(f"{results_dir}/autogluon*/scores/results.csv"),
    },
    'autosklearn': {
        'framework': 'autosklearn',
        'results': glob.glob(f"{results_dir}/autosklearn*/scores/results.csv"),
    },
    'h2oautoml': {
        'framework': 'H2OAutoML',
        'results': glob.glob(f"{results_dir}/h2oautoml*/scores/results.csv"),
    },
    'tpot': {
        'framework': 'TPOT',
        'results': glob.glob(f"{results_dir}/tpot*/scores/results.csv"),
    },
    # 'rf': {
    #     'framework': 'RandomForest',
    #     'results': my_results_df[my_results_df['framework']=='RandomForest'],  # example showing that we can also use a dataframe (or its subset)
    # },
}

#definitions

## Load and prepare results

In [None]:
# Keep the definitions that are explicitly included (all of them when
# `included_frameworks` is empty) and that are not excluded.
runs = {
    key: definition
    for key, definition in definitions.items()
    if (not included_frameworks or key in included_frameworks)
    and key not in excluded_frameworks
}
#runs

In [None]:
def results_as_df(results_dict, row_filter=None):
    """Concatenate the results of several runs into a single dataframe.

    :param results_dict: mapping of framework name -> object exposing a `results` table
        with a `framework` column; `None` values are skipped.
    :param row_filter: optional callable receiving the whole `results` table of a run
        and returning a row mask; when `None`, no additional filtering is applied.
    :return: for each entry, the rows whose `framework` equals the entry name (and that
        pass `row_filter`), concatenated in the iteration order of `results_dict`.
        An empty dataframe is returned when there is nothing to concatenate.
    """
    def select_rows(name, res):
        df = res.results
        mask = df.framework == name
        if row_filter is not None:
            mask = mask & row_filter(df)
        return df.loc[mask]

    frames = [select_rows(name, res)
              for name, res in results_dict.items()
              if res is not None]
    # pd.concat raises a ValueError on an empty list (e.g. when no definition is flagged
    # as `ref`): return an empty dataframe instead of crashing the notebook.
    return pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Prepare the reference results first (definitions flagged with `ref`), metadata included:
# they are passed as `ref_results` when preparing the other runs.
ref_results = {
    name: prepare_results(
        run['results'],
        renamings={run['framework']: name},
        exclusions=excluded_frameworks,
        normalization=normalization,
        duplicates_handling=duplicates_handling,
        include_metadata=True,
    )
    for name, run in runs.items()
    if run.get('ref', False)
}

In [None]:
# Single table of the reference results (after `row_filter`); passed as `ref_results` when preparing the other runs.
all_ref_res = results_as_df(ref_results, row_filter)

In [None]:
# Prepare the results of every non-reference run, handing over the reference
# results through `ref_results`.
runs_results = {
    fw_name: prepare_results(
        fw_run['results'],
        renamings={fw_run['framework']: fw_name},
        exclusions=excluded_frameworks,
        imputation=imputation,
        normalization=normalization,
        ref_results=all_ref_res,
        duplicates_handling=duplicates_handling,
    )
    for fw_name, fw_run in runs.items()
    if fw_name not in ref_results
}

In [None]:
# All result rows in one table: reference results first, then the other runs.
all_res = pd.concat([all_ref_res, results_as_df(runs_results, row_filter)])

# Prepared results by framework name; on duplicate names, entries from `runs_results` win.
all_results = dict(ref_results)
all_results.update(runs_results)

In [None]:
from functools import reduce


def _merge_keep_first(merged, other):
    """Merge two metadata mappings; on duplicate keys, the entry already in `merged` wins."""
    combined = dict(other)
    combined.update(merged)
    return combined


# Merge the metadata of all prepared results (reference results first).
metadata = reduce(_merge_keep_first,
                  (res.metadata
                   for res in [*ref_results.values(), *runs_results.values()]
                   if res is not None),
                  {})
# metadata = next(res for res in ref_results.values()).metadata

In [None]:
# Distinct values of the `type` attribute across all metadata entries; used below to skip the plots of absent problem types.
problem_types = pd.DataFrame(m.__dict__ for m in metadata.values())['type'].unique().tolist()

## Tasks lists

In [None]:
# Stack the `done` rows of every prepared results object into one table.
merged_res = pd.concat([r.done.reset_index() for r in all_results.values() if r is not None])
# Keep only the rows having an `id`.
merged_res = merged_res[merged_res['id'].notna()]
# Prepare the merged table again to get a single results object: its `done`, `missing`, `failed`
# and `results` attributes are used by the following cells.
merged_results = prepare_results(merged_res)

def render_tasks_by_state(state='done'):
    """Display, for each (task, framework) pair, the unique folds found in the given state.

    :param state: name of the `merged_results` attribute to read
        ('done', 'missing' and 'failed' are the ones used in this notebook).
    """
    state_rows = getattr(merged_results, state).reset_index()
    folds_by_task = state_rows.groupby(['task', 'framework'])['fold'].unique()
    display(folds_by_task, pretty=True)

In [None]:
# Render the tasks metadata; the target file path is built by `create_file` from `output_dir`, "datasets" and `results_group`.
render_metadata(metadata, 
                filename=create_file(output_dir, "datasets", results_group, "metadata.csv"))

### Completed tasks/folds

In [None]:
# Folds listed under `merged_results.done`, per task and framework.
render_tasks_by_state('done')

### Missing or crashed/aborted tasks/folds

In [None]:
# Folds listed under `merged_results.missing`, per task and framework.
render_tasks_by_state('missing')

### Failing tasks/folds

In [None]:
# Folds listed under `merged_results.failed`, per task and framework.
render_tasks_by_state('failed')

In [None]:
# Distinct `info` values of the failed runs, per task, fold and framework
# (presumably the failure messages — confirm against prepare_results).
failures = (merged_results.failed.groupby(['task', 'fold', 'framework'])['info']
                          .unique())
display(failures)

### Results anomalies

In [None]:
from amlb_report.analysis import list_outliers

# Display what `list_outliers` reports for the `result` column of the merged results;
# `z_threshold` is presumably the z-score beyond which a result is listed — confirm in amlb_report.analysis.
display(list_outliers('result', 
                      results=merged_results.results,
#                       results=merged_results.loc[merged_results.framework=='h2oautoml']
                      z_threshold=2.5,
                     ))

## Data Reports

### Results summary

Averaging using arithmetic mean over fold `result`.
In the following summaries, unless mentioned otherwise, and if results imputation was enabled, the means are computed over imputed results.
Given a task and a framework:
- if **all folds results are missing**, then no imputation occurs, and the mean result is `nan`.
- if **only some folds results are missing**, then the number of imputed results that contributed to the mean is displayed in parentheses.

#### Number of models trained

When available, displays the average number of models trained by the framework for each dataset.

This number should be interpreted differently for each framework.
For example, with *RandomForest*, this number corresponds to the number of trees.

In [None]:
# Summary of the `models_count` column over all results, also exported as CSV under `output_dir`.
models_summary = render_summary('models_count', 
                                results=all_res)
models_summary.to_csv(create_file(output_dir, "tables", "models_summary.csv"))

### Results mean

In [None]:
# Summary of the raw `result` column over all results, also exported as CSV under `output_dir`.
res_summary = render_summary('result', 
                             results=all_res)
res_summary.to_csv(create_file(output_dir, "tables", "results_summary.csv"))

In [None]:
# The `norm_result` column is only summarized (and exported) when a normalization has been configured.
if normalization:
    norm_result_summary = render_summary('norm_result', 
                                         results=all_res)
    norm_result_summary.to_csv(create_file(output_dir, "tables", "normalized_result_summary.csv"))

### Tasks leaderboard

In [None]:
# Leaderboard computed on the `result` column with `aggregate=True`, also exported as CSV under `output_dir`.
benchmark_leaderboard = render_leaderboard('result', 
                                           results=all_res,
                                           aggregate=True)
benchmark_leaderboard.to_csv(create_file(output_dir, "tables", "benchmark_leaderboard.csv"))

## Visualizations

In [None]:
# Preview the colormap configured in `amlb_report.config` (presumably the one used by the plots below).
render_colormap(config.colormap)

### Heatmaps

In [None]:
# Heatmap of `result` for the binary classification tasks; skipped when the results contain none.
if 'binary' in problem_types:
    fig = draw_score_heatmap('result',
                             results=all_res,
                             type_filter='binary',
                             metadata=metadata,
                             x_labels=frameworks_labels or True,
                             x_sort_by=frameworks_sort_key,
                             # honour the configured task ordering, like the bar/point/strip plots
                             y_sort_by=tasks_sort_by,
                             title=f"Results ({binary_result_label}) on {results_group} binary classification problems{title_extra}",
                             center=0.5
                            );
    savefig(fig, create_file(output_dir, "visualizations", "binary_result_heat.png"))

In [None]:
# Heatmap of `result` for the multiclass classification tasks; skipped when the results contain none.
if 'multiclass' in problem_types:
    fig = draw_score_heatmap('result',
                             results=all_res,
                             type_filter='multiclass',
                             metadata=metadata,
                             x_labels=frameworks_labels or True,
                             x_sort_by=frameworks_sort_key,
                             # honour the configured task ordering, like the bar/point/strip plots
                             y_sort_by=tasks_sort_by,
                             title=f"Results ({multiclass_result_label}) on {results_group} multi-class classification problems{title_extra}",
                             center=0
                            );
    savefig(fig, create_file(output_dir, "visualizations", "multiclass_result_heat.png"))

In [None]:
# Heatmap of `result` for the regression tasks; skipped when the results contain none.
if 'regression' in problem_types:
    fig = draw_score_heatmap('result',
                             results=all_res,
                             type_filter='regression',
                             metadata=metadata,
                             x_labels=frameworks_labels or True,
                             x_sort_by=frameworks_sort_key,
                             # honour the configured task ordering, like the bar/point/strip plots
                             y_sort_by=tasks_sort_by,
                             title=f"Results ({regression_result_label}) on {results_group} regression problems{title_extra}",
                             center=0
                            );
    savefig(fig, create_file(output_dir, "visualizations", "regression_result_heat.png"))

### Bar plots

In [None]:
# Bar plot of `result` per task for the binary classification tasks, one hue per framework;
# skipped when the results contain no binary task. The figure is saved under `output_dir`.
if 'binary' in problem_types:
    fig = draw_score_barplot('result',
                             results=all_res,
                             type_filter='binary', 
                             metadata=metadata,
                             x_sort_by=tasks_sort_by,
                             ylabel=binary_result_label,
                             ylim=dict(bottom=.5),
                             hue_sort_by=frameworks_sort_key, 
                             ci=95,
                             title=f"Results ({binary_result_label}) on {results_group} binary classification problems{title_extra}",
                             legend_loc='lower center',
                             legend_labels=frameworks_labels,
                            );
    savefig(fig, create_file(output_dir, "visualizations", "binary_result_barplot.png"))

In [None]:
# Bar plot of `result` per task for the multiclass classification tasks, one hue per framework;
# skipped when the results contain no multiclass task. The figure is saved under `output_dir`.
if 'multiclass' in problem_types:
    fig = draw_score_barplot('result',
                             results=all_res,
                             type_filter='multiclass', 
                             metadata=metadata,
                             x_sort_by=tasks_sort_by,
                             ylabel=multiclass_result_label,
                             ylim=dict(top=0.1),
                             hue_sort_by=frameworks_sort_key,
                             ci=95,
                             title=f"Results ({multiclass_result_label}) on {results_group} multiclass classification problems{title_extra}",
                             legend_loc='lower center',
                             legend_labels=frameworks_labels,
                            );
    savefig(fig, create_file(output_dir, "visualizations", "multiclass_result_barplot.png"))

In [None]:
# Bar plot of `result` per task for the regression tasks (symlog y scale), one hue per framework;
# skipped when the results contain no regression task. The figure is saved under `output_dir`.
if 'regression' in problem_types:
    fig = draw_score_barplot('result',
                             results=all_res,
                             type_filter='regression',
                             metadata=metadata,
                             x_sort_by=tasks_sort_by,
                             yscale='symlog',
                             ylabel=regression_result_label,
                             ylim=dict(top=0.1),
                             hue_sort_by=frameworks_sort_key,
                             ci=95,
                             # title fixed: these are regression problems, not "regression classification" ones
                             title=f"Results ({regression_result_label}) on {results_group} regression problems{title_extra}",
                             legend_loc='lower center',
                             legend_labels=frameworks_labels,
                             size=(8, 6),
                            );
    savefig(fig, create_file(output_dir, "visualizations", "regression_result_barplot.png"))

### Point plots

In [None]:
# Point plot of `result` per task for the binary classification tasks, one hue per framework;
# skipped when the results contain no binary task. The figure is saved under `output_dir`.
if 'binary' in problem_types:
    fig = draw_score_pointplot('result',
                               results=all_res,
                               type_filter='binary', 
                               metadata=metadata,
                               x_sort_by=tasks_sort_by,
                               ylabel=binary_result_label,
                               ylim=dict(bottom=.5),
                               hue_sort_by=frameworks_sort_key,
                               join='none', marker='hline_xspaced', ci=95, 
                               title=f"Results ({binary_result_label}) on {results_group} binary classification problems{title_extra}",
                               legend_loc='lower center',
                               legend_labels=frameworks_labels,
                              );
    savefig(fig, create_file(output_dir, "visualizations", "binary_result_pointplot.png"))

In [None]:
# Point plot of `result` per task for the multiclass classification tasks, one hue per framework;
# skipped when the results contain no multiclass task. The figure is saved under `output_dir`.
# NOTE(review): unlike the multiclass bar plot, no `ylim` is passed here — confirm this is intended.
if 'multiclass' in problem_types:
    fig = draw_score_pointplot('result',
                               results=all_res,
                               type_filter='multiclass', 
                               metadata=metadata,
                               x_sort_by=tasks_sort_by,
                               ylabel=multiclass_result_label,
                               hue_sort_by=frameworks_sort_key,
                               join='none', marker='hline_xspaced', ci=95, 
                               title=f"Results ({multiclass_result_label}) on {results_group} multiclass classification problems{title_extra}",
                               legend_loc='lower center',
                               legend_labels=frameworks_labels,
                              );
    savefig(fig, create_file(output_dir, "visualizations", "multiclass_result_pointplot.png"))

In [None]:
# Point plot of `result` per task for the regression tasks (symlog y scale), one hue per framework;
# skipped when the results contain no regression task. The figure is saved under `output_dir`.
if 'regression' in problem_types:
    fig = draw_score_pointplot('result',
                               results=all_res,
                               type_filter='regression',
                               metadata=metadata,
                               x_sort_by=tasks_sort_by,
                               ylabel=regression_result_label,
                               yscale='symlog',
                               ylim=dict(top=0.1),
                               hue_sort_by=frameworks_sort_key,
                               join='none', marker='hline_xspaced', ci=95,
                               # title fixed: these are regression problems, not "regression classification" ones
                               title=f"Results ({regression_result_label}) on {results_group} regression problems{title_extra}",
                               legend_loc='lower center',
                               legend_labels=frameworks_labels,
                               size=(8, 6),
                              );
    savefig(fig, create_file(output_dir, "visualizations", "regression_result_pointplot.png"))

### Strip plots

In [None]:
# Strip plot of `result` per task for the binary classification tasks, one hue per framework;
# skipped when the results contain no binary task. The results are sorted by framework before plotting.
if 'binary' in problem_types:
    fig = draw_score_stripplot('result', 
                               results=all_res.sort_values(by=['framework']),
                               type_filter='binary', 
                               metadata=metadata,
                               xlabel=binary_result_label,
                               y_sort_by=tasks_sort_by,
                               hue_sort_by=frameworks_sort_key,
                               title=f"Results ({binary_result_label}) on {results_group} binary classification problems{title_extra}",
                               legend_labels=frameworks_labels,
                              );
    savefig(fig, create_file(output_dir, "visualizations", "binary_result_stripplot.png"))

In [None]:
# Strip plot of `result` per task for the multiclass classification tasks (symlog x scale), one hue per framework;
# skipped when the results contain no multiclass task. The results are sorted by framework before plotting.
if 'multiclass' in problem_types:
    fig = draw_score_stripplot('result', 
                               results=all_res.sort_values(by=['framework']),
                               type_filter='multiclass', 
                               metadata=metadata,
                               xlabel=multiclass_result_label,
                               xscale='symlog',
                               y_sort_by=tasks_sort_by,
                               hue_sort_by=frameworks_sort_key,
                               title=f"Results ({multiclass_result_label}) on {results_group} multi-class classification problems{title_extra}",
                               legend_labels=frameworks_labels,
                              );
    savefig(fig, create_file(output_dir, "visualizations", "multiclass_result_stripplot.png"))

In [None]:
# Strip plot of `result` per task for the regression tasks (symlog x scale), one hue per framework;
# skipped when the results contain no regression task.
# NOTE(review): the binary and multiclass strip plots pass `all_res.sort_values(by=['framework'])`
# whereas this one passes `all_res` unsorted — confirm whether the difference is intended.
if 'regression' in problem_types:
    fig = draw_score_stripplot('result', 
                               results=all_res,
                               type_filter='regression', 
                               metadata=metadata,
                               xlabel=regression_result_label,
                               xscale='symlog',
                               y_sort_by=tasks_sort_by,
                               hue_sort_by=frameworks_sort_key,
                               title=f"Results ({regression_result_label}) on {results_group} regression problems{title_extra}",
                               legend_labels=frameworks_labels,
                              );
    savefig(fig, create_file(output_dir, "visualizations", "regression_result_stripplot.png"))

### Normalized strip plots

In [None]:
# Strip plot of the normalized results (`norm_result`) for the binary classification tasks;
# only drawn when a normalization is configured and the results contain binary tasks.
if 'binary' in problem_types and normalization:
    fig = draw_score_stripplot('norm_result',
                               results=all_res,
                               type_filter='binary',
                               metadata=metadata,
                               xlabel=f"rel. {binary_result_label}",
                               # honour the configured task ordering, like the raw strip plots
                               y_sort_by=tasks_sort_by,
                               hue_sort_by=frameworks_sort_key,
                               title=f"Relative results ({binary_result_label}) on {results_group} binary classification problems{title_extra}",
                               legend_labels=frameworks_labels,
                              );
    savefig(fig, create_file(output_dir, "visualizations", "binary_rel_result_stripplot.png"))

In [None]:
# Strip plot of the normalized results (`norm_result`, symlog x scale) for the multiclass classification tasks;
# only drawn when a normalization is configured and the results contain multiclass tasks.
if 'multiclass' in problem_types and normalization:
    fig = draw_score_stripplot('norm_result',
                               results=all_res,
                               type_filter='multiclass',
                               metadata=metadata,
                               xlabel=f"rel. {multiclass_result_label}",
                               xscale='symlog',
                               # honour the configured task ordering, like the raw strip plots
                               y_sort_by=tasks_sort_by,
                               hue_sort_by=frameworks_sort_key,
                               title=f"Relative results ({multiclass_result_label}) on {results_group} multi-class classification problems{title_extra}",
                               legend_labels=frameworks_labels,
                              );
    savefig(fig, create_file(output_dir, "visualizations", "multiclass_rel_result_stripplot.png"))

In [None]:
# Strip plot of the normalized results (`norm_result`) for the regression tasks;
# only drawn when a normalization is configured and the results contain regression tasks.
if 'regression' in problem_types and normalization:
    fig = draw_score_stripplot('norm_result',
                               results=all_res,
                               type_filter='regression',
                               metadata=metadata,
                               xlabel=f"rel. {regression_result_label}",
                               # honour the configured task ordering, like the raw strip plots
                               y_sort_by=tasks_sort_by,
                               hue_sort_by=frameworks_sort_key,
                               title=f"Relative results ({regression_result_label}) on {results_group} regression problems{title_extra}",
                               legend_labels=frameworks_labels,
                              );
    savefig(fig, create_file(output_dir, "visualizations", "regression_rel_result_stripplot.png"))

## Playground