In [14]:
# Author: noe.sturm@novartis.com

In [3]:
import os, sys
import numpy as np
import pandas as pd
import importlib
from scipy.io import mmread
sys.path.append('/path/to/repo/performance_evaluation/development_code/')
import modeval


In [None]:
importlib.reload(modeval)  # development only: re-import modeval so that edits to the module are picked up without restarting the kernel

In [94]:
# folder passed to modeval.perf_from_metrics() in the cells below (metrics files)
result_dir = 'results_sparsechem/'
# folder passed to modeval.perf_from_conf() in the cells below (conf files)
model_dir = 'models/'

### Get the best hyperparameter settings from metrics.csv files

In [103]:
# Collect performance metrics from the metrics files found in result_dir.
# This assumes the metrics files are named after the hyperparameters
# (see the details of perf_from_metrics() further down).
metrics_df = modeval.perf_from_metrics(result_dir, verbose=True)

# Disclaimer: this does not apply the --min_samples criterion; all tasks are considered!

# Melt the frame loaded just above into long format (one row per task and score type).
# NB: use metrics_df here, not perf_metrics: perf_metrics is only defined in a later cell,
# so referring to it would raise a NameError when running the notebook on a fresh kernel.
metrics_dfm = modeval.melt_perf(metrics_df, perf_metrics=['auc_pr_va', 'auc_va', 'max_f1_va', 'kappa_va', 'avg_prec_va'])

# Find out the best hyperparameters: one row per score type gives the best HP per score type
modeval.best_hyperparam(metrics_dfm)

Loaded 1 metrics files


Unnamed: 0,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,hp_epochs,score_type,value
3,400,0.2,1e-05,0.001,10,20,auc_pr_va,0.718885
0,400,0.2,1e-05,0.001,10,20,auc_va,0.77282
2,400,0.2,1e-05,0.001,10,20,avg_prec_va,0.737953
4,400,0.2,1e-05,0.001,10,20,kappa_va,0.295114
1,400,0.2,1e-05,0.001,10,20,max_f1_va,0.758979


### Get the best hyperparameter settings from conf.npy file

In [106]:
# Collect performance metrics from the conf file(s) found in model_dir.
conf_df = modeval.perf_from_conf(model_dir)

# Disclaimer: this does not apply the --min_samples criterion; all tasks are considered!

# melt: in conf files, there are only auc_pr and auc_roc (no kappa, f1, ...)
conf_dfm = modeval.melt_perf(conf_df, perf_metrics=['auc_pr_va', 'auc_va']) 

# Find out the best hyperparameters (one row per score type)
modeval.best_hyperparam(conf_dfm)

Unnamed: 0,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,hp_epochs,score_type,value
1,400,0.2,1e-05,0.001,10,20,auc_pr_va,0.718885
0,400,0.2,1e-05,0.001,10,20,auc_va,0.77282


### Let's dive into the functions used here

There are two entry points to collect performance results from sparsechem:
- results/*-metrics.csv files 
- models/*-conf.npy files

Here we will fetch performance reports from both entry points and look at the specificities of each:

### 1/ Metrics files

In [20]:
# 1/ load the performance metrics from the *metrics.csv file in the "results" folder
# 
# modeval.perf_from_metrics() --> loads the performance reports from the metrics files in result_dir.
# It assumes the metrics filenames contain information about the hyperparameters.
# It extracts the hyperparameter settings present in the filename and adds them as columns to the perf metrics report.
# Column names of hyperparameters are prefixed with "hp_".
# 
# Ultimately: fetching hyperparameters from filenames is not very good practice and should be mitigated in the future
# Ideally: sparsechem should provide one performance report containing all numbers including HPs
# 


perf_metrics = modeval.perf_from_metrics(result_dir, verbose=True)
perf_metrics
# => is a dataframe containing one row per task, one column per perf metric + columns containing metadata of models/tasks (e.g. HPs, num_pos, validation fold, ...)
# NB: the dataframe contains a row for every task. No exceptions are made, e.g. for a minimum number of samples of each class.
#     ==> Filtering out tasks with fewer than X positives and X negatives has to be done manually

# NB2: task 4 is missing validation metrics because it is not represented in the validation fold

Loaded 1 metrics files


Unnamed: 0,task,num_pos,num_neg,num_pos_va,num_neg_va,auc_tr,auc_va,auc_pr_tr,auc_pr_va,avg_prec_tr,...,kappa_va,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,hp_epochs,fold_va,fold_test,model
0,0,26,28,10,7,0.994048,0.728571,0.992851,0.823777,0.993056,...,0.328947,400,0.2,1e-05,0.001,10,20,0,,Y
1,1,25,28,5,11,0.967647,0.781818,0.969932,0.458889,0.970786,...,0.522388,400,0.2,1e-05,0.001,10,20,0,,Y
2,2,25,27,4,2,0.908571,0.500000,0.901868,0.579167,0.904375,...,0.000000,400,0.2,1e-05,0.001,10,20,0,,Y
3,3,28,29,10,13,0.868056,0.807692,0.799224,0.813080,0.814172,...,0.530612,400,0.2,1e-05,0.001,10,20,0,,Y
4,4,29,27,0,0,0.950192,,0.961724,,0.962284,...,,400,0.2,1e-05,0.001,10,20,0,,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3542,3542,6353,5695,1322,1099,0.998651,0.923334,0.998866,0.939423,0.998866,...,0.685518,400,0.2,1e-05,0.001,10,20,0,,Y
3543,3543,4038,8010,845,1576,0.998554,0.929600,0.997475,0.898452,0.997475,...,0.682354,400,0.2,1e-05,0.001,10,20,0,,Y
3544,3544,2308,9740,472,1949,0.998740,0.936893,0.995308,0.834398,0.995309,...,0.646414,400,0.2,1e-05,0.001,10,20,0,,Y
3545,3545,8736,3312,1756,665,0.997385,0.939993,0.999029,0.974911,0.999029,...,0.728763,400,0.2,1e-05,0.001,10,20,0,,Y


In [39]:
# dimensions of the metrics report: (number of rows, number of columns)
perf_metrics.shape

(3547, 24)

In [15]:
# column names of the metrics report
perf_metrics.columns

Index(['task', 'num_pos', 'num_neg', 'num_pos_va', 'num_neg_va', 'auc_tr',
       'auc_va', 'auc_pr_tr', 'auc_pr_va', 'avg_prec_tr', 'avg_prec_va',
       'max_f1_tr', 'max_f1_va', 'kappa_tr', 'kappa_va', 'hp_hidden_sizes',
       'hp_last_dropout', 'hp_weight_decay', 'hp_learning_rate',
       'hp_learning_steps', 'hp_epochs', 'fold_va', 'fold_test', 'model'],
      dtype='object')

In [24]:
# by default: the number of folds is 5 (n_cv=5) and the argument verify is False
# however, turning verify to True allows checking whether, for any of the tasks, there are fewer than n_cv metrics reports
# this can be very useful in case of an extensive hyperparameter grid search, to identify failed jobs.

perf_metrics = modeval.perf_from_metrics(result_dir, verbose=True, verify=True)

# => by default, n_cv=5, so the function expects folds 0,1,2,3,4 to have been run.
# since this toy metrics file is the result of one run in the CV, missing fold runs are identified for the listed HPs.
# NB: only "0" is listed on the far right (e.g. in case folds 0, 1 and 2 were run this would look like: 0,1,2). 

Loaded 1 metrics files
Fold runs found :
 hp_hidden_sizes  hp_last_dropout  hp_weight_decay  hp_learning_rate  hp_learning_steps  hp_epochs
400              0.2              1e-05            0.001             10                 20           0
Name: fold_va, dtype: object


In [33]:
# If desired, one can also fetch performance results for a subset of the tasks using the argument "tasks_for_eval":
# using task indices (columns in mtx files), it is possible to mask out some tasks
# e.g. let's fetch metrics for tasks 1, 23, 124


modeval.perf_from_metrics(result_dir, verbose=True, tasks_for_eval=[1,23,124])



Loaded 1 metrics files


Unnamed: 0,task,num_pos,num_neg,num_pos_va,num_neg_va,auc_tr,auc_va,auc_pr_tr,auc_pr_va,avg_prec_tr,...,kappa_va,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,hp_epochs,fold_va,fold_test,model
0,1,25,28,5,11,0.967647,0.781818,0.969932,0.458889,0.970786,...,0.522388,400,0.2,1e-05,0.001,10,20,0,,Y
1,23,2857,3225,612,732,0.991375,0.801616,0.991299,0.811011,0.991301,...,0.490311,400,0.2,1e-05,0.001,10,20,0,,Y
2,124,18158,23678,3735,4798,0.940132,0.604163,0.924708,0.532283,0.924712,...,0.121772,400,0.2,1e-05,0.001,10,20,0,,Y


### 2/ Config files

In the models/\*-conf.npy files, there are actually two performance reports: 
- individual tasks performance reports
- aggregate performance reports (average over all tasks)

In addition, conf.npy files contain all settings used for training the model. 

#### individual tasks performance 

In [25]:
# 2/ collect the performance scores from the *conf.npy in the "models" folder

# a/ Let's get the performance of each individual task.
perf_conf = modeval.perf_from_conf(model_dir, aggregate=False)
perf_conf

# this leads to a data frame similar to the one from modeval.perf_from_metrics(): one row per task, one column per metric/metadata
# Hyperparameter column names are prefixed by "hp_" 

# NB: performance is reported for all tasks, however not all the metrics are present in conf files (see column names below)
# NB2: Note task 4 has NaN auc_pr_va and auc_va --> task 4 is not present in validation fold 0

Unnamed: 0,task,fold_te,fold_va,hp_epochs,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,n_tasks_eval,min_samples,auc_va,auc_pr_va,model
0,0,,0,20,400,0.2,0.0,0.001,10,3547,25,0.785714,0.838308,Y
1,1,,0,20,400,0.2,0.0,0.001,10,3547,25,0.672727,0.361825,Y
2,2,,0,20,400,0.2,0.0,0.001,10,3547,25,0.500000,0.579167,Y
3,3,,0,20,400,0.2,0.0,0.001,10,3547,25,0.838462,0.829287,Y
4,4,,0,20,400,0.2,0.0,0.001,10,3547,25,,,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3542,3542,,0,20,400,0.2,0.0,0.001,10,3547,25,0.922612,0.939367,Y
3543,3543,,0,20,400,0.2,0.0,0.001,10,3547,25,0.926225,0.896534,Y
3544,3544,,0,20,400,0.2,0.0,0.001,10,3547,25,0.935405,0.830107,Y
3545,3545,,0,20,400,0.2,0.0,0.001,10,3547,25,0.935401,0.973494,Y


In [38]:
# dimensions of the report loaded from the conf file: (number of rows, number of columns)
perf_conf.shape

(3547, 14)

In [41]:
# columns in the report loaded from the metrics file: max_f1, kappa and avg_prec are present
perf_metrics.columns

Index(['task', 'num_pos', 'num_neg', 'num_pos_va', 'num_neg_va', 'auc_tr',
       'auc_va', 'auc_pr_tr', 'auc_pr_va', 'avg_prec_tr', 'avg_prec_va',
       'max_f1_tr', 'max_f1_va', 'kappa_tr', 'kappa_va', 'hp_hidden_sizes',
       'hp_last_dropout', 'hp_weight_decay', 'hp_learning_rate',
       'hp_learning_steps', 'hp_epochs', 'fold_va', 'fold_test', 'model'],
      dtype='object')

In [40]:
# columns in the report loaded from the conf file: max_f1, kappa and avg_prec are NOT here!
# no perf metrics are reported on the training set because the function does not fetch results for training
# POSSIBLE ADD ON: training metrics could be added
perf_conf.columns

Index(['task', 'fold_te', 'fold_va', 'hp_epochs', 'hp_hidden_sizes',
       'hp_last_dropout', 'hp_weight_decay', 'hp_learning_rate',
       'hp_learning_steps', 'n_tasks_eval', 'min_samples', 'auc_va',
       'auc_pr_va', 'model'],
      dtype='object')

In [36]:
# Similarly to modeval.perf_from_metrics(), if desired, one can also fetch performance results for a subset of the tasks using the argument "tasks_for_eval":
# using task indices (columns in mtx files), it is possible to mask out some tasks
# e.g. let's fetch metrics for tasks 1, 23, 123

modeval.perf_from_conf(model_dir, aggregate=False, tasks_for_eval=[1,23,123])


Unnamed: 0,task,fold_te,fold_va,hp_epochs,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,n_tasks_eval,min_samples,auc_va,auc_pr_va,model
0,1,,0,20,400,0.2,0.0,0.001,10,3,25,0.672727,0.361825,Y
1,23,,0,20,400,0.2,0.0,0.001,10,3,25,0.805453,0.8127,Y
2,123,,0,20,400,0.2,0.0,0.001,10,3,25,0.6903,0.134965,Y


#### Aggregate performance report

In [43]:
# b/ we can also directly load performance aggregates (averaged over the tasks) from the conf file
# this is done by turning the argument "aggregate" to True

perf_conf_agg = modeval.perf_from_conf(model_dir, aggregate=True)
perf_conf_agg

# the function actually fetches the aggregate performance report from the conf file and does not do the aggregation itself.
# Aggregation is done by sparsechem.

# ! NB: here the aggregation considers only tasks satisfying the --min_samples option
# that is, if --min_samples is in use with N_MIN=50, this will report a different result than if the aggregation is done over all tasks

Unnamed: 0,fold_te,fold_va,hp_epochs,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,min_samples,auc_va_mean,auc_pr_va_mean,max_f1_va_mean,kappa_va_mean,train_time_1epochs,model
0,,0,20,400,0.2,0.0,0.001,10,25,0.770633,0.71674,0.757492,0.303348,12.306314,Y


Unnamed: 0,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,hp_epochs,variable,value
2,400,0.2,1e-05,0.001,10,20,auc_pr_va,0.718885
1,400,0.2,1e-05,0.001,10,20,avg_prec_va,0.737953
3,400,0.2,1e-05,0.001,10,20,kappa_va,0.295114
0,400,0.2,1e-05,0.001,10,20,max_f1_va,0.758979


In [62]:
# melting can also be done with perf_metrics, but the metrics columns need to be specified
modeval.melt_perf(perf_metrics, perf_metrics=['auc_pr_va'])

Unnamed: 0,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,hp_epochs,fold_va,variable,value
28376,400,0.2,1e-05,0.001,10,20,0,auc_pr_va,0.823777
28377,400,0.2,1e-05,0.001,10,20,0,auc_pr_va,0.458889
28378,400,0.2,1e-05,0.001,10,20,0,auc_pr_va,0.579167
28379,400,0.2,1e-05,0.001,10,20,0,auc_pr_va,0.813080
28380,400,0.2,1e-05,0.001,10,20,0,auc_pr_va,
...,...,...,...,...,...,...,...,...,...
31918,400,0.2,1e-05,0.001,10,20,0,auc_pr_va,0.939423
31919,400,0.2,1e-05,0.001,10,20,0,auc_pr_va,0.898452
31920,400,0.2,1e-05,0.001,10,20,0,auc_pr_va,0.834398
31921,400,0.2,1e-05,0.001,10,20,0,auc_pr_va,0.974911


### Functions to manipulate performance metrics data frames: 

In [50]:
# aggregate performance over folds
# (the trailing "?" is IPython help: it displays the signature and docstring of the function)
modeval.aggregate_fold_perf?

Signature: modeval.aggregate_fold_perf(metrics_df, min_samples, n_cv=5, verify=True)<br>
Docstring:<br>
HP performance aggregation over folds. <br>
    From the metrics dataframe yielded by perf_from_metrics(), does the aggregation over the fold (mean, std) results in one perf per fold.<br>
\#     :param pandas df metrics_df: metrics dataframe yielded by perf_from_metrics() <br>
\#     :param int min_sample: minimum number of each class (overal) to be considered in mean<br>
\#     :param int n_cv: number of folds to look for<br>
\#     :param bool verify: checks for missing folds runs in CV and prints a report if missing jobs<br>
\#     :return dtype: pandas df containing performance per task aggregated over each fold<br>
    
File:      ~/Projects/Codes/repos/ml_tools/modeval.py<br>
Type:      function<br>

In [53]:
modeval.aggregate_fold_perf(perf_metrics, min_samples=50, n_cv=1)
# results in one row per fold. Obviously, given the toy data was only run over one fold, we obtain only one row.

Unnamed: 0,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,hp_epochs,fold_va,auc_tr_mean,auc_va_mean,auc_pr_tr_mean,...,auc_tr_stdev,auc_va_stdev,auc_pr_tr_stdev,auc_pr_va_stdev,avg_prec_tr_stdev,avg_prec_va_stdev,max_f1_tr_stdev,max_f1_va_stdev,kappa_tr_stdev,kappa_va_stdev
0,400,0.2,1e-05,0.001,10,20,0,0.975565,0.777175,0.956839,...,0.023477,0.138973,0.069231,0.237505,0.06889,0.23073,0.071605,0.19585,0.1801,0.250562


In [55]:
# aggregate performance over the whole dataframe
# (the trailing "?" is IPython help: it displays the signature and docstring of the function)
modeval.aggregate_overall?

[0;31mSignature:[0m [0mmodeval[0m[0;34m.[0m[0maggregate_overall[0m[0;34m([0m[0mmetrics_df[0m[0;34m,[0m [0mmin_samples[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
HP performance aggregation overall . 
    From the metrics dataframe yielded by perf_from_metrics(), does the aggregation over the CV (mean, std) results in one perf hyperparameter.
#     :param pandas df metrics_df: metrics dataframe yielded by perf_from_metrics() 
#     :param int min_sample: minimum number of each class (overal) to be considered in mean
#     :return dtype: pandas df containing performance per hyperparameter setting
    
[0;31mFile:[0m      ~/Projects/Codes/repos/performance_evaluation/development/modeval.py
[0;31mType:[0m      function


Signature: modeval.aggregate_overall(metrics_df, min_samples)<br>
Docstring:<br>
HP performance aggregation overall . <br>
    From the metrics dataframe yielded by perf_from_metrics(), does the aggregation over the CV (mean, std) results in one perf hyperparameter.<br>
\#     :param pandas df metrics_df: metrics dataframe yielded by perf_from_metrics() <br>
\#     :param int min_sample: minimum number of each class (overal) to be considered in mean<br>
\#     :return dtype: pandas df containing performance per hyperparameter setting<br>
    
File:      ~/Projects/Codes/repos/ml_tools/modeval.py
Type:      function

In [56]:
modeval.aggregate_overall(perf_metrics, min_samples=50)

# produces one row per hyperparameter setting (only one setting in this toy data, hence one row)
# performance contains mean metrics + standard deviations

Unnamed: 0,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,hp_epochs,auc_tr_mean,auc_va_mean,auc_pr_tr_mean,auc_pr_va_mean,...,auc_tr_stdev,auc_va_stdev,auc_pr_tr_stdev,auc_pr_va_stdev,avg_prec_tr_stdev,avg_prec_va_stdev,max_f1_tr_stdev,max_f1_va_stdev,kappa_tr_stdev,kappa_va_stdev
0,400,0.2,1e-05,0.001,10,20,0.975565,0.777175,0.956839,0.691453,...,0.023477,0.138973,0.069231,0.237505,0.06889,0.23073,0.071605,0.19585,0.1801,0.250562


In [57]:
# column names of the overall aggregate: the hp_* settings, then the *_mean and *_stdev of each metric
modeval.aggregate_overall(perf_metrics, min_samples=50).columns

Index(['hp_hidden_sizes', 'hp_last_dropout', 'hp_weight_decay',
       'hp_learning_rate', 'hp_learning_steps', 'hp_epochs', 'auc_tr_mean',
       'auc_va_mean', 'auc_pr_tr_mean', 'auc_pr_va_mean', 'avg_prec_tr_mean',
       'avg_prec_va_mean', 'max_f1_tr_mean', 'max_f1_va_mean', 'kappa_tr_mean',
       'kappa_va_mean', 'auc_tr_stdev', 'auc_va_stdev', 'auc_pr_tr_stdev',
       'auc_pr_va_stdev', 'avg_prec_tr_stdev', 'avg_prec_va_stdev',
       'max_f1_tr_stdev', 'max_f1_va_stdev', 'kappa_tr_stdev',
       'kappa_va_stdev'],
      dtype='object')

In [59]:
# aggregate performance over folds --> get the average per task
modeval.aggregate_task_perf(perf_metrics, min_samples=50)
# produces one row for each task, averaged over the folds.
# performance contains mean metrics + standard deviations
# here, obviously, since the toy data is only one fold run, it does not do much (the standard deviations are NaN).

Fold runs found :
 hp_hidden_sizes  hp_last_dropout  hp_weight_decay  hp_learning_rate  hp_learning_steps  hp_epochs
400              0.2              1e-05            0.001             10                 20           0
Name: fold_va, dtype: object


Unnamed: 0,hp_hidden_sizes,hp_last_dropout,hp_weight_decay,hp_learning_rate,hp_learning_steps,hp_epochs,task,num_pos,num_neg,num_pos_va_mean,...,auc_tr_stdev,auc_va_stdev,auc_pr_tr_stdev,auc_pr_va_stdev,avg_prec_tr_stdev,avg_prec_va_stdev,max_f1_tr_stdev,max_f1_va_stdev,kappa_tr_stdev,kappa_va_stdev
0,400,0.2,1e-05,0.001,10,20,8,1444,1369,324,...,,,,,,,,,,
1,400,0.2,1e-05,0.001,10,20,9,398,2415,98,...,,,,,,,,,,
2,400,0.2,1e-05,0.001,10,20,10,79,2734,22,...,,,,,,,,,,
3,400,0.2,1e-05,0.001,10,20,12,2857,3225,612,...,,,,,,,,,,
4,400,0.2,1e-05,0.001,10,20,13,393,5689,74,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1503,400,0.2,1e-05,0.001,10,20,3542,6353,5695,1322,...,,,,,,,,,,
1504,400,0.2,1e-05,0.001,10,20,3543,4038,8010,845,...,,,,,,,,,,
1505,400,0.2,1e-05,0.001,10,20,3544,2308,9740,472,...,,,,,,,,,,
1506,400,0.2,1e-05,0.001,10,20,3545,8736,3312,1756,...,,,,,,,,,,
