In [2]:
# Author: noe.sturm@novartis.com

In [3]:
import os, sys
import numpy as np
import pandas as pd
import importlib
from scipy.io import mmread
sys.path.append('/path/to/repo/performance_evaluation/development_code/')
import modeval


In [None]:
# Development convenience: re-import modeval so that edits made to the module
# on disk are picked up without restarting the kernel.
importlib.reload(modeval) # dev stuff

In [6]:
# Specify the sparsechem results and models folders (relative paths).
# result_dir is passed to modeval.perf_from_metrics() (results/*-metrics.csv files),
# model_dir is passed to modeval.perf_from_conf() (models/*-conf.npy files).
result_dir = 'results/'
model_dir = 'models/'

This demonstration is based on a toy hyperparameter (HP) grid search run on the ChEMBL dataset with a trivial LSH key.<br>
For the purpose of this demonstration, the HP search was restricted to 10 epochs and explored only a small number of hyperparameter settings.

### Get the best hyperparameter settings from metrics.csv files

In [9]:
# Collect the performance metrics from the metrics files.
# This assumes the metrics files are named after the hyperparameters (see the details of perf_from_metrics() further below).
metrics_df = modeval.perf_from_metrics(result_dir, verbose=True)

# Disclaimer: this does not consider the --min_samples criterion, all tasks are considered!

# Melt: keep the hyperparameter columns and stack the listed validation metrics
# into a (score_type, value) pair per row, as shown in the output.
metrics_dfm = modeval.melt_perf(metrics_df, perf_metrics=['auc_pr_va', 'auc_va', 'max_f1_va', 'kappa_va', 'avg_prec_va'])

# Find the best hyperparameters: one row per score type gives the best HP setting for that score type
modeval.best_hyperparam(metrics_dfm)

Loaded 30 metrics files


Unnamed: 0,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,hp_epochs,score_type,value
17,2000,0.6,1e-05,0.001,5,10,auc_pr_va,0.715337
0,2000,0.6,1e-05,0.001,5,10,auc_va,0.770708
12,2000,0.6,1e-05,0.001,5,10,avg_prec_va,0.735341
24,2000,0.6,1e-05,0.001,5,10,kappa_va,0.285665
4,2000,0.6,1e-05,0.001,5,10,max_f1_va,0.75846


### Get the best hyperparameter settings from conf.npy file

In [12]:
# Collect the performance metrics from the conf files.
conf_df = modeval.perf_from_conf(model_dir)

# Disclaimer: this does not consider the --min_samples criterion, all tasks are considered!

# Melt: the conf files only contain auc_pr and auc_roc (no kappa, f1, ...),
# so only these two validation metrics can be requested here.
conf_dfm = modeval.melt_perf(conf_df, perf_metrics=['auc_pr_va', 'auc_va']) 

# Find the best hyperparameters: one row per score type
modeval.best_hyperparam(conf_dfm)

Unnamed: 0,hp_epochs,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,score_type,value
6,10,2000,0.6,1e-05,0.001,5,auc_pr_va,0.715337
0,10,2000,0.6,1e-05,0.001,5,auc_va,0.770708


### Let's dive into the functions used here

There are two entry points to collect performance results from sparsechem:
- results/*-metrics.csv files 
- models/*-conf.npy files

Here we will fetch performance reports from both entry points and look at the specifics of each: 

### 1/ Metrics files

In [13]:
# 1/ Load the performance metrics from the *metrics.csv files in the "results" folder
#
# modeval.perf_from_metrics() --> loads the performance reports from the metrics files found in result_dir.
# It assumes the metrics filenames contain information about the hyperparameters.
# It extracts the hyperparameter settings present in the filename and adds them as columns to the perf metrics report.
# Column names of hyperparameters are prefixed with "hp_" (see the columns in the output).
#
# Ultimately: fetching hyperparameters from filenames is not very good practice and should be mitigated in the future.
# Ideally: sparsechem should provide one performance report containing all numbers including HPs.
#


perf_metrics = modeval.perf_from_metrics(result_dir, verbose=True)
perf_metrics
# => a dataframe containing one row per task for each metrics file loaded, one column per perf metric
#    + columns containing metadata of models/tasks (e.g. HPs, num_pos, validation fold, ...)
# NB: the dataframe contains a row for every task. There are no exceptions such as a minimum number of samples of each class.
#     ==> Filtering out tasks with fewer than X positives and X negatives has to be done manually.


Loaded 30 metrics files


Unnamed: 0,task,num_pos,num_neg,num_pos_va,num_neg_va,auc_tr,auc_va,auc_pr_tr,auc_pr_va,avg_prec_tr,...,kappa_va,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,hp_epochs,fold_va,fold_test,model
0,0,26,28,5,1,0.991182,0.600000,0.989486,0.918333,0.989719,...,0.076923,1600,0.6,1e-05,0.001,5,10,2,,Y
1,1,25,28,2,3,0.965217,0.333333,0.961833,0.291667,0.962680,...,-0.363636,1600,0.6,1e-05,0.001,5,10,2,,Y
2,2,25,27,3,4,0.911067,0.333333,0.923706,0.549206,0.925274,...,0.000000,1600,0.6,1e-05,0.001,5,10,2,,Y
3,3,28,29,5,2,0.913043,0.900000,0.862808,0.963333,0.867996,...,0.461538,1600,0.6,1e-05,0.001,5,10,2,,Y
4,4,29,27,22,25,1.000000,0.285455,1.000000,0.385190,1.000000,...,-0.042662,1600,0.6,1e-05,0.001,5,10,2,,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106405,3542,6353,5695,1389,1248,0.995582,0.930653,0.996392,0.938970,0.996392,...,0.696039,1200,0.6,1e-05,0.001,5,10,4,,Y
106406,3543,4038,8010,862,1775,0.995366,0.937612,0.991862,0.901494,0.991863,...,0.678433,1200,0.6,1e-05,0.001,5,10,4,,Y
106407,3544,2308,9740,495,2142,0.995813,0.941757,0.984444,0.830207,0.984448,...,0.595842,1200,0.6,1e-05,0.001,5,10,4,,Y
106408,3545,8736,3312,1934,703,0.993217,0.928366,0.997462,0.970711,0.997462,...,0.676348,1200,0.6,1e-05,0.001,5,10,4,,Y


In [14]:
# nrows = n_tasks * n_folds (5-fold CV) * n_hyperparameter_settings
# (in the output shown: 106410 rows from the 30 metrics files loaded, i.e. 3547 rows per file)
perf_metrics.shape

(106410, 24)

In [15]:
# List the columns of the metrics report: task/sample counts, the performance metrics
# (suffixes _tr and _va, presumably training and validation), the hp_* hyperparameter
# columns and the fold/model metadata.
perf_metrics.columns

Index(['task', 'num_pos', 'num_neg', 'num_pos_va', 'num_neg_va', 'auc_tr',
       'auc_va', 'auc_pr_tr', 'auc_pr_va', 'avg_prec_tr', 'avg_prec_va',
       'max_f1_tr', 'max_f1_va', 'kappa_tr', 'kappa_va', 'hp_hidden_sizes',
       'hp_last_dropout', 'hp_weight_decay', 'hp_learning_rate',
       'hp_learning_steps', 'hp_epochs', 'fold_va', 'fold_test', 'model'],
      dtype='object')

In [21]:
# By default the number of folds is 5 (n_cv=5) and the argument verify is False.
# Setting verify to True checks whether fewer than n_cv metrics reports are available for any of the tasks.
# This can be very useful in case of an extensive hyperparameter grid search, to identify failed jobs.
# NB: this re-assigns perf_metrics, so the cells further down work on the data loaded here
#     (29 metrics files in the output shown).

perf_metrics = modeval.perf_from_metrics(result_dir, verbose=True, verify=True)

# => by default, n_cv=5: the function expects folds 0,1,2,3,4 to have been run.
# For the purpose of this demonstration, one result/model was removed from the folder. See the report printed below:
# the column on the far right states which folds were found for each hyperparameter setting;
# one can see that fold 0 is missing for hp_hidden_sizes=800.

Loaded 29 metrics files
Fold runs found :
 hp_hidden_sizes  hp_last_dropout  hp_weight_decay  hp_learning_rate  hp_learning_steps  hp_epochs
1200             0.6              1e-05            0.001             5                  10           0,1,2,3,4
1200.1200        0.6              1e-05            0.001             5                  10           0,1,2,3,4
1600             0.6              1e-05            0.001             5                  10           0,1,2,3,4
1600.1600        0.6              1e-05            0.001             5                  10           0,1,2,3,4
2000             0.6              1e-05            0.001             5                  10           0,1,2,3,4
800              0.6              1e-05            0.001             5                  10             1,2,3,4
Name: fold_va, dtype: object


In [22]:
# If desired, one can also fetch performance results for a subset of the tasks using the argument "tasks_for_eval":
# using task indices (columns in the mtx files), it is possible to mask out the other tasks.
# E.g. let's fetch the metrics for tasks 1, 23 and 124:


modeval.perf_from_metrics(result_dir, verbose=True, tasks_for_eval=[1,23,124])



Loaded 29 metrics files


Unnamed: 0,task,num_pos,num_neg,num_pos_va,num_neg_va,auc_tr,auc_va,auc_pr_tr,auc_pr_va,avg_prec_tr,...,kappa_va,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,hp_epochs,fold_va,fold_test,model
0,1,25,28,2,3,0.965217,0.333333,0.961833,0.291667,0.962680,...,-0.363636,1600,0.6,1e-05,0.001,5,10,2,,Y
1,23,2857,3225,569,646,0.983075,0.784332,0.983510,0.798325,0.983513,...,0.401895,1600,0.6,1e-05,0.001,5,10,2,,Y
2,124,18158,23678,3821,5051,0.918262,0.606837,0.897007,0.539238,0.897012,...,0.127662,1600,0.6,1e-05,0.001,5,10,2,,Y
3,1,25,28,1,2,0.943910,0.500000,0.949771,0.250000,0.950684,...,0.000000,800,0.6,1e-05,0.001,5,10,4,,Y
4,23,2857,3225,575,675,0.973960,0.799170,0.974674,0.816002,0.974680,...,0.471448,800,0.6,1e-05,0.001,5,10,4,,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,23,2857,3225,575,675,0.987840,0.800600,0.987947,0.819158,0.987950,...,0.472464,2000,0.6,1e-05,0.001,5,10,4,,Y
83,124,18158,23678,3520,4596,0.926390,0.622052,0.907531,0.550745,0.907535,...,0.144137,2000,0.6,1e-05,0.001,5,10,4,,Y
84,1,25,28,1,2,0.958333,1.000000,0.951286,1.000000,0.952450,...,0.000000,1200,0.6,1e-05,0.001,5,10,4,,Y
85,23,2857,3225,575,675,0.980500,0.800433,0.980673,0.817476,0.980677,...,0.477558,1200,0.6,1e-05,0.001,5,10,4,,Y


### 2/ Config files

In the models/\*-conf.npy files, there are actually two performance reports: 
- individual tasks performance reports
- aggregate performance reports (average over all tasks)

In addition, conf.npy files contain all settings used for training the model. 

#### individual tasks performance 

In [24]:
# 2/ Collect the performance scores from the *conf.npy files in the "models" folder

# a/ Let's get the performance of each individual task (aggregate=False).
perf_conf = modeval.perf_from_conf(model_dir, aggregate=False)
perf_conf

# This leads to a data frame similar to the one from modeval.perf_from_metrics(): one row per task, one column per metric/metadata.
# Hyperparameter column names are prefixed with "hp_".

# NB: performance is reported for all tasks; however, not all the metrics are present in the conf files (see the column names below)


Unnamed: 0,task,fold_te,fold_va,hp_epochs,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,n_tasks_eval,min_samples,auc_va,auc_pr_va,model
0,0,,3,10,1200,0.6,0.00001,0.001,5,3547,50,0.750000,0.731250,Y
1,1,,3,10,1200,0.6,0.00001,0.001,5,3547,50,1.000000,1.000000,Y
2,2,,3,10,1200,0.6,0.00001,0.001,5,3547,50,0.750000,0.633333,Y
3,3,,3,10,1200,0.6,0.00001,0.001,5,3547,50,0.793651,0.691927,Y
4,4,,3,10,1200,0.6,0.00001,0.001,5,3547,50,,,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102858,3542,,2,10,800,0.6,0.00001,0.001,5,3547,50,0.919514,0.918624,Y
102859,3543,,2,10,800,0.6,0.00001,0.001,5,3547,50,0.925979,0.869812,Y
102860,3544,,2,10,800,0.6,0.00001,0.001,5,3547,50,0.932107,0.816825,Y
102861,3545,,2,10,800,0.6,0.00001,0.001,5,3547,50,0.923566,0.968926,Y


In [25]:
# (n_rows, n_columns) of the per-task report loaded from the conf files
perf_conf.shape

(102863, 14)

In [26]:
# Columns of the report loaded from the metrics files: max_f1, kappa and avg_prec are present
perf_metrics.columns

Index(['task', 'num_pos', 'num_neg', 'num_pos_va', 'num_neg_va', 'auc_tr',
       'auc_va', 'auc_pr_tr', 'auc_pr_va', 'avg_prec_tr', 'avg_prec_va',
       'max_f1_tr', 'max_f1_va', 'kappa_tr', 'kappa_va', 'hp_hidden_sizes',
       'hp_last_dropout', 'hp_weight_decay', 'hp_learning_rate',
       'hp_learning_steps', 'hp_epochs', 'fold_va', 'fold_test', 'model'],
      dtype='object')

In [27]:
# Columns of the report loaded from the conf files: max_f1, kappa and avg_prec are NOT present!
# No performance metrics are reported on the training set because the function does not fetch the training results.
# POSSIBLE ADD-ON: training metrics could be added.
perf_conf.columns

Index(['task', 'fold_te', 'fold_va', 'hp_epochs', 'hp_hidden_sizes',
       'hp_last_dropout', 'hp_weight_decay', 'hp_learning_rate',
       'hp_learning_steps', 'n_tasks_eval', 'min_samples', 'auc_va',
       'auc_pr_va', 'model'],
      dtype='object')

In [28]:
# Similarly to modeval.perf_from_metrics(), one can fetch performance results for a subset of the tasks using the argument "tasks_for_eval":
# using task indices (columns in the mtx files), it is possible to mask out the other tasks.
# E.g. let's fetch the metrics for tasks 1, 23 and 123:

modeval.perf_from_conf(model_dir, aggregate=False, tasks_for_eval=[1,23,123])


Unnamed: 0,task,fold_te,fold_va,hp_epochs,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,n_tasks_eval,min_samples,auc_va,auc_pr_va,model
0,1,,3,10,1200,0.6,0.00001,0.001,5,3,50,1.000000,1.000000,Y
1,23,,3,10,1200,0.6,0.00001,0.001,5,3,50,0.841350,0.874569,Y
2,123,,3,10,1200,0.6,0.00001,0.001,5,3,50,0.641566,0.059053,Y
3,1,,4,10,16001600,0.6,0.00001,0.001,5,3,50,0.500000,0.250000,Y
4,23,,4,10,16001600,0.6,0.00001,0.001,5,3,50,0.800549,0.811362,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,23,,4,10,1200,0.6,0.00001,0.001,5,3,50,0.800433,0.817476,Y
83,123,,4,10,1200,0.6,0.00001,0.001,5,3,50,0.671419,0.098121,Y
84,1,,2,10,800,0.6,0.00001,0.001,5,3,50,0.500000,0.333333,Y
85,23,,2,10,800,0.6,0.00001,0.001,5,3,50,0.788032,0.803267,Y


#### Aggregate performance report

In [29]:
# b/ We can also directly load the performance aggregates (averaged over the tasks) from the conf files.
# This is done by setting the argument "aggregate" to True.

perf_conf_agg = modeval.perf_from_conf(model_dir, aggregate=True)
perf_conf_agg

# The function fetches the aggregate performance report stored in the conf file; it does not do the aggregation itself.
# The aggregation is done by sparsechem.

# ! NB: here the aggregation considers only the tasks satisfying the --min_samples option,
# that is, if --min_samples is in use with N_MIN=50, this reports a different result than if the aggregation were done over all tasks.

Unnamed: 0,fold_te,fold_va,hp_epochs,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,min_samples,auc_va_mean,auc_pr_va_mean,train_time_1epochs,model
0,,3,10,1200,0.6,1e-05,0.001,5,50,0.768198,0.684172,11.497573,Y
1,,4,10,16001600,0.6,1e-05,0.001,5,50,0.76279,0.668862,13.704524,Y
2,,3,10,800,0.6,1e-05,0.001,5,50,0.762279,0.678674,11.026484,Y
3,,1,10,1200,0.6,1e-05,0.001,5,50,0.771758,0.688737,12.120908,Y
4,,0,10,12001200,0.6,1e-05,0.001,5,50,0.761812,0.675108,11.026914,Y
5,,1,10,800,0.6,1e-05,0.001,5,50,0.76676,0.683282,11.434556,Y
6,,4,10,2000,0.6,1e-05,0.001,5,50,0.775669,0.683442,19.052484,Y
7,,0,10,16001600,0.6,1e-05,0.001,5,50,0.765586,0.679157,18.269143,Y
8,,3,10,16001600,0.6,1e-05,0.001,5,50,0.755477,0.672579,13.587526,Y
9,,2,10,2000,0.6,1e-05,0.001,5,50,0.776622,0.692818,11.254517,Y


In [30]:
# Melting can also be done with perf_metrics, but the metric columns to keep need to be specified
modeval.melt_perf(perf_metrics, perf_metrics=['auc_pr_va'])

Unnamed: 0,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,hp_epochs,score_type,value
822904,1600,0.6,1e-05,0.001,5,10,auc_pr_va,0.918333
822905,1600,0.6,1e-05,0.001,5,10,auc_pr_va,0.291667
822906,1600,0.6,1e-05,0.001,5,10,auc_pr_va,0.549206
822907,1600,0.6,1e-05,0.001,5,10,auc_pr_va,0.963333
822908,1600,0.6,1e-05,0.001,5,10,auc_pr_va,0.385190
...,...,...,...,...,...,...,...,...
925762,1200,0.6,1e-05,0.001,5,10,auc_pr_va,0.938970
925763,1200,0.6,1e-05,0.001,5,10,auc_pr_va,0.901494
925764,1200,0.6,1e-05,0.001,5,10,auc_pr_va,0.830207
925765,1200,0.6,1e-05,0.001,5,10,auc_pr_va,0.970711


### Functions to manipulate performance metrics data frames: 

In [50]:
# Aggregate performance over folds.
# The trailing "?" is IPython syntax that displays the signature and docstring of the function.
modeval.aggregate_fold_perf?

Signature: modeval.aggregate_fold_perf(metrics_df, min_samples, n_cv=5, verify=True)<br>
Docstring:<br>
HP performance aggregation over folds. <br>
    From the metrics dataframe yielded by perf_from_metrics(), does the aggregation over the fold (mean, std) results in one perf per fold.<br>
\#     :param pandas df metrics_df: metrics dataframe yielded by perf_from_metrics() <br>
\#     :param int min_sample: minimum number of each class (overal) to be considered in mean<br>
\#     :param int n_cv: number of folds to look for<br>
\#     :param bool verify: checks for missing folds runs in CV and prints a report if missing jobs<br>
\#     :return dtype: pandas df containing performance per task aggregated over each fold<br>
    
File:      ~/Projects/Codes/repos/ml_tools/modeval.py<br>
Type:      function<br>

In [33]:
# verify defaults to True (see the signature above), hence the "Fold runs found" report printed first.
modeval.aggregate_fold_perf(perf_metrics, min_samples=50, n_cv=5)
# Results in one row per hyperparameter setting and validation fold (see the hp_* and fold_va columns),
# with the mean and standard deviation of each metric.
# NOTE(review): in the output the two-layer setting reported as 1200.1200 in the fold report shows up
# as 1200.12 in hp_hidden_sizes -- the column looks like it is handled as a float here; confirm in modeval.

Fold runs found :
 hp_hidden_sizes  hp_last_dropout  hp_weight_decay  hp_learning_rate  hp_learning_steps  hp_epochs
1200             0.6              1e-05            0.001             5                  10           0,1,2,3,4
1200.1200        0.6              1e-05            0.001             5                  10           0,1,2,3,4
1600             0.6              1e-05            0.001             5                  10           0,1,2,3,4
1600.1600        0.6              1e-05            0.001             5                  10           0,1,2,3,4
2000             0.6              1e-05            0.001             5                  10           0,1,2,3,4
800              0.6              1e-05            0.001             5                  10             1,2,3,4
Name: fold_va, dtype: object


Unnamed: 0,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,hp_epochs,fold_va,auc_tr_mean,auc_va_mean,auc_pr_tr_mean,...,auc_tr_stdev,auc_va_stdev,auc_pr_tr_stdev,auc_pr_va_stdev,avg_prec_tr_stdev,avg_prec_va_stdev,max_f1_tr_stdev,max_f1_va_stdev,kappa_tr_stdev,kappa_va_stdev
0,1200.0,0.6,1e-05,0.001,5,10,0,0.961219,0.774678,0.927044,...,0.028279,0.137524,0.097744,0.236991,0.097287,0.22911,0.093311,0.194231,0.218171,0.247868
1,1200.0,0.6,1e-05,0.001,5,10,1,0.961401,0.771758,0.926295,...,0.028291,0.134309,0.099782,0.233034,0.099346,0.225772,0.095108,0.188785,0.218743,0.235323
2,1200.0,0.6,1e-05,0.001,5,10,2,0.961116,0.773722,0.926144,...,0.028517,0.13504,0.099932,0.233986,0.099433,0.227611,0.094862,0.192266,0.219516,0.242795
3,1200.0,0.6,1e-05,0.001,5,10,3,0.960906,0.768198,0.925778,...,0.028198,0.136873,0.101457,0.237359,0.101034,0.230043,0.095919,0.191981,0.220306,0.247355
4,1200.0,0.6,1e-05,0.001,5,10,4,0.960689,0.770734,0.925817,...,0.028646,0.138737,0.10065,0.23875,0.09927,0.231398,0.093941,0.195755,0.21868,0.245312
5,1200.12,0.6,1e-05,0.001,5,10,0,0.934098,0.761812,0.884393,...,0.039655,0.140135,0.121464,0.239865,0.120751,0.232156,0.108528,0.197589,0.222658,0.248967
6,1200.12,0.6,1e-05,0.001,5,10,1,0.933499,0.756952,0.882561,...,0.040598,0.137541,0.122614,0.237927,0.121883,0.230684,0.109937,0.193407,0.223759,0.238436
7,1200.12,0.6,1e-05,0.001,5,10,2,0.933775,0.758718,0.883106,...,0.039724,0.142257,0.120533,0.23845,0.119799,0.232321,0.107504,0.196114,0.221988,0.242792
8,1200.12,0.6,1e-05,0.001,5,10,3,0.934217,0.751511,0.883159,...,0.03944,0.140745,0.12236,0.241006,0.12162,0.233173,0.109269,0.19428,0.22167,0.250163
9,1200.12,0.6,1e-05,0.001,5,10,4,0.932434,0.758403,0.88154,...,0.040545,0.137141,0.124202,0.240192,0.12269,0.233611,0.109231,0.20014,0.22527,0.247525


In [35]:
# Aggregate performance over the whole data frame (all folds and tasks).
# The trailing "?" is IPython syntax that displays the signature and docstring of the function.
modeval.aggregate_overall?

Signature: modeval.aggregate_overall(metrics_df, min_samples)<br>
Docstring:<br>
HP performance aggregation overall . <br>
    From the metrics dataframe yielded by perf_from_metrics(), does the aggregation over the CV (mean, std) results in one perf hyperparameter.<br>
\#     :param pandas df metrics_df: metrics dataframe yielded by perf_from_metrics() <br>
\#     :param int min_sample: minimum number of each class (overal) to be considered in mean<br>
\#     :return dtype: pandas df containing performance per hyperparameter setting<br>
    
File:      ~/Projects/Codes/repos/ml_tools/modeval.py
Type:      function

In [34]:
modeval.aggregate_overall(perf_metrics, min_samples=50)

# Produces one row per hyperparameter setting.
# The performance report contains the mean of each metric + its standard deviation.

Unnamed: 0,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,hp_epochs,auc_tr_mean,auc_va_mean,auc_pr_tr_mean,auc_pr_va_mean,...,auc_tr_stdev,auc_va_stdev,auc_pr_tr_stdev,auc_pr_va_stdev,avg_prec_tr_stdev,avg_prec_va_stdev,max_f1_tr_stdev,max_f1_va_stdev,kappa_tr_stdev,kappa_va_stdev
0,1200.0,0.6,1e-05,0.001,5,10,0.961066,0.771824,0.926216,0.685607,...,0.02838,0.136492,0.099895,0.23601,0.099256,0.228769,0.094609,0.19259,0.219033,0.243737
1,1200.12,0.6,1e-05,0.001,5,10,0.933605,0.757488,0.882952,0.671821,...,0.03999,0.139585,0.122212,0.239462,0.121323,0.23236,0.108871,0.196302,0.223026,0.245616
2,1600.0,0.6,1e-05,0.001,5,10,0.970499,0.77449,0.943357,0.689215,...,0.024052,0.137004,0.08339,0.235837,0.082987,0.228695,0.082661,0.192344,0.211113,0.245386
3,1600.16,0.6,1e-05,0.001,5,10,0.945301,0.761371,0.902014,0.675687,...,0.035536,0.1387,0.108026,0.238484,0.107297,0.231473,0.099157,0.195568,0.210451,0.245656
4,2000.0,0.6,1e-05,0.001,5,10,0.976634,0.77579,0.954825,0.690184,...,0.020982,0.136788,0.071859,0.235691,0.071493,0.228602,0.074068,0.192143,0.203972,0.245132
5,800.0,0.6,1e-05,0.001,5,10,0.944885,0.766297,0.898708,0.680168,...,0.034308,0.135993,0.122739,0.235726,0.122009,0.228974,0.111045,0.192898,0.2301,0.242203


In [36]:
# Columns of the overall aggregate: the hp_* settings followed by the *_mean and *_stdev of each metric
modeval.aggregate_overall(perf_metrics, min_samples=50).columns

Index(['hp_hidden_sizes', 'hp_last_dropout', 'hp_weight_decay',
       'hp_learning_rate', 'hp_learning_steps', 'hp_epochs', 'auc_tr_mean',
       'auc_va_mean', 'auc_pr_tr_mean', 'auc_pr_va_mean', 'avg_prec_tr_mean',
       'avg_prec_va_mean', 'max_f1_tr_mean', 'max_f1_va_mean', 'kappa_tr_mean',
       'kappa_va_mean', 'auc_tr_stdev', 'auc_va_stdev', 'auc_pr_tr_stdev',
       'auc_pr_va_stdev', 'avg_prec_tr_stdev', 'avg_prec_va_stdev',
       'max_f1_tr_stdev', 'max_f1_va_stdev', 'kappa_tr_stdev',
       'kappa_va_stdev'],
      dtype='object')

In [37]:
# Aggregate performance over folds --> get the average per task
modeval.aggregate_task_perf(perf_metrics, min_samples=50)
# Produces one row per hyperparameter setting and task, averaged over the folds.
# The performance report contains the mean of each metric + its standard deviation.
# NOTE(review): the output starts at task 8, so tasks not meeting min_samples appear to be
# filtered out here -- confirm against the modeval implementation.

Fold runs found :
 hp_hidden_sizes  hp_last_dropout  hp_weight_decay  hp_learning_rate  hp_learning_steps  hp_epochs
1200             0.6              1e-05            0.001             5                  10           0,1,2,3,4
1200.1200        0.6              1e-05            0.001             5                  10           0,1,2,3,4
1600             0.6              1e-05            0.001             5                  10           0,1,2,3,4
1600.1600        0.6              1e-05            0.001             5                  10           0,1,2,3,4
2000             0.6              1e-05            0.001             5                  10           0,1,2,3,4
800              0.6              1e-05            0.001             5                  10             1,2,3,4
Name: fold_va, dtype: object


Unnamed: 0,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,hp_epochs,task,num_pos,num_neg,num_pos_va_mean,...,auc_tr_stdev,auc_va_stdev,auc_pr_tr_stdev,auc_pr_va_stdev,avg_prec_tr_stdev,avg_prec_va_stdev,max_f1_tr_stdev,max_f1_va_stdev,kappa_tr_stdev,kappa_va_stdev
0,1200,0.6,1e-05,0.001,5,10,8,1444,1369,288.80,...,0.001335,0.011792,0.000959,0.010646,0.000959,0.010612,0.000795,0.022645,0.015853,0.019543
1,1200,0.6,1e-05,0.001,5,10,9,398,2415,79.60,...,0.003289,0.021780,0.011920,0.033566,0.011887,0.033207,0.005143,0.036967,0.008726,0.040361
2,1200,0.6,1e-05,0.001,5,10,10,79,2734,15.80,...,0.004501,0.049662,0.013282,0.011389,0.012772,0.011252,0.017464,0.040766,0.000000,0.000000
3,1200,0.6,1e-05,0.001,5,10,12,2857,3225,571.40,...,0.002007,0.022152,0.002288,0.031429,0.002287,0.031393,0.005530,0.033225,0.004874,0.045109
4,1200,0.6,1e-05,0.001,5,10,13,393,5689,78.60,...,0.001956,0.021340,0.007604,0.049101,0.007596,0.048672,0.004357,0.054551,0.030787,0.037813
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9043,800,0.6,1e-05,0.001,5,10,3542,6353,5695,1257.75,...,0.000617,0.009524,0.000532,0.013329,0.000532,0.013317,0.002076,0.012470,0.002473,0.021699
9044,800,0.6,1e-05,0.001,5,10,3543,4038,8010,798.25,...,0.000247,0.006158,0.000410,0.015494,0.000410,0.015472,0.001830,0.004564,0.006805,0.011296
9045,800,0.6,1e-05,0.001,5,10,3544,2308,9740,459.00,...,0.000170,0.003425,0.000554,0.012833,0.000554,0.012814,0.001887,0.011327,0.007667,0.022360
9046,800,0.6,1e-05,0.001,5,10,3545,8736,3312,1745.00,...,0.000188,0.004764,0.000083,0.001573,0.000083,0.001573,0.000907,0.008341,0.002413,0.014117
