<a href="https://colab.research.google.com/github/lukekolbe/AL-in-CreditScoring/blob/main/result_processor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prep Runtime

In [2]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [3]:
#!pip freeze

In [4]:
#!pip uninstall matplotlib -y

In [5]:
!pip install pingouin
!pip install scikit_posthocs
#!pip install matplotlib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pingouin
  Downloading pingouin-0.5.1.tar.gz (183 kB)
[K     |████████████████████████████████| 183 kB 7.4 MB/s 
Collecting scipy>=1.7
  Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
[K     |████████████████████████████████| 38.1 MB 1.1 MB/s 
Collecting statsmodels>=0.13
  Downloading statsmodels-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
[K     |████████████████████████████████| 9.8 MB 36.9 MB/s 
Collecting pandas_flavor>=0.2.0
  Downloading pandas_flavor-0.3.0-py3-none-any.whl (6.3 kB)
Collecting outdated
  Downloading outdated-0.2.1-py3-none-any.whl (7.5 kB)
Collecting pandas_flavor>=0.2.0
  Downloading pandas_flavor-0.2.0-py2.py3-none-any.whl (6.6 kB)
Collecting littleutils
  Downloading littleutils-0.2.2.tar.gz (6.6 kB)
Building wheels for collected packages: pingouin, littleutils
  Building wh

In [6]:
############ LIBRARIES

import os
import time
import datetime
import random
import pickle
import re
import copy
import gc
import sys
import json

gc.enable()

import warnings
warnings.filterwarnings('ignore')

import pingouin as pg
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
plt.style.use('default')
%matplotlib inline
import seaborn as sns

import scipy.stats
from collections import namedtuple
import scikit_posthocs as sp

In [7]:
############ RANDOMNESS
# seed function
def seed_everything(seed):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    
# set seed
seed = 30
seed_everything(seed)

In [8]:
os.chdir('/gdrive/My Drive/ACTIVE LEARNING THESIS/')

# Helper Functions

In [9]:

'''
  This function is an enhanced version of the Friedman Test found in the popular scipy package.
  The added functionality computes z-values (pivotal quantities) that are needed to run the Holm-Test, i.e.
  pairwise comparisons of each model to the control model, with p-value adjustment.
  The formula for the z-statistic can be found in DEMSAR (2006).
'''

#FriedmanchisquareResult = namedtuple('FriedmanchisquareResult',
#                                     ('statistic', 'pvalue'))

def friedmanchisquare2(*args):
    """Compute the Friedman test for repeated measurements.
    The Friedman test tests the null hypothesis that repeated measurements of
    the same individuals have the same distribution.  It is often used
    to test for consistency among measurements obtained in different ways.
    For example, if two measurement techniques are used on the same set of
    individuals, the Friedman test can be used to determine if the two
    measurement techniques are consistent.
    Parameters
    ----------
    measurements1, measurements2, measurements3... : array_like
        Arrays of measurements.  All of the arrays must have the same number
        of elements.  At least 3 sets of measurements must be given.
    Returns
    -------
    statistic : float
        The test statistic, correcting for ties.
    pvalue : float
        The associated p-value assuming that the test statistic has a chi
        squared distribution.
    
    Notes
    -----
    Due to the assumption that the test statistic has a chi squared
    distribution, the p-value is only reliable for n > 10 and more than
    6 repeated measurements.
    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Friedman_test
    """

    k = len(args)
    if k < 3:
        raise ValueError('At least 3 sets of measurements must be given '
                         'for Friedman test, got {}.'.format(k))

    n = len(args[0])
    for i in range(1, k):
        if len(args[i]) != n:
            raise ValueError('Unequal N in friedmanchisquare.  Aborting.')

    # Rank data
    data = np.vstack(args).T
    data = data.astype(float)
    for i in range(len(data)):
        data[i] = scipy.stats.rankdata(data[i])

    # Handle ties
    ties = 0
    for d in data:
        replist, repnum = scipy.stats.find_repeats(np.array(d))
        for t in repnum:
            ties += t * (t*t - 1)
    c = 1 - ties / (k*(k*k - 1)*n)
    

    ssbn = np.sum(data.sum(axis=0)**2)
    chisq = (12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)) / c #test statistic
    pvalue = scipy.stats.distributions.chi2.sf(chisq, k - 1) #p-value

    ########## ADDED CUSTOM FUNCTIONALITY TO RUN HOLM TEST ##########
    #print('data from function', data)
    rankings_avg = [np.mean([case[j] for case in data]) for j in range(k)]
    rankings_cmp = [r/np.sqrt(k*(k+1)/(6.*n)) for r in rankings_avg] #test statistics (z-value) for each method (DEMSAR 2006)
    ##########

    return chisq, pvalue, data, rankings_avg, rankings_cmp

In [10]:
# ALTERNATIVE VERSION OF FRIEDMAN TEST
# APPLIES IMAN-DAVENPORT CORRECTION TO P-VALUES

def friedman_test(*args):

    """
        Performs a Friedman ranking test.
        Tests the hypothesis that in a set of k dependent samples groups (where k >= 2) at least two of the groups represent populations with different median values.
        
        Parameters
        ----------
        sample1, sample2, ... : array_like
            The sample measurements for each group.
            
        Returns
        -------
        F-value : float
            The computed F-value of the test.
        p-value : float
            The associated p-value from the F-distribution.
        rankings : array_like
            The ranking for each group.
        pivots : array_like
            The pivotal quantities for each group.
            
        References
        ----------
        M. Friedman, The use of ranks to avoid the assumption of normality implicit in the analysis of variance, Journal of the American Statistical Association 32 (1937) 674–701.
        D.J. Sheskin, Handbook of parametric and nonparametric statistical procedures. crc Press, 2003, Test 25: The Friedman Two-Way Analysis of Variance by Ranks
    """

    k = len(args)
    if k < 2: raise ValueError('Less than 2 levels')
    n = len(args[0])
    if len(set([len(v) for v in args])) != 1: raise ValueError('Unequal number of samples')

    rankings = []
    for i in range(n):
        row = [col[i] for col in args]
        row_sort = sorted(row)
        rankings.append([row_sort.index(v) + 1 + (row_sort.count(v)-1)/2. for v in row])

    rankings_avg = [np.mean([case[j] for case in rankings]) for j in range(k)]
    rankings_cmp = [r/np.sqrt(k*(k+1)/(6.*n)) for r in rankings_avg] #see demsar (2006)

    chi2 = ((12*n)/float((k*(k+1))))*((sum(r**2 for r in rankings_avg))-((k*(k+1)**2)/float(4))) #(uncorrected) chi-square test statistic
    iman_davenport = ((n-1)*chi2)/float((n*(k-1)-chi2)) #corrected test-statistic

    p_value = 1 - scipy.stats.f.cdf(iman_davenport, k-1, (k-1)*(n-1)) #iman-davenport corrected p-value
    
    ###########
    p_value_uncorrected = scipy.stats.distributions.chi2.sf(chi2, k - 1)
    ###########

    return iman_davenport,p_value, chi2, p_value_uncorrected, rankings, rankings_avg, rankings_cmp

In [11]:
def finner_test(pivots, control=None):
    """
        Performs a Finner post-hoc test using the pivot quantities obtained by a ranking test.
        Tests the hypothesis that the ranking of the control method is different to each of the other methods.
        
        Parameters
        ----------
        pivots : dictionary_like
            A dictionary with format 'groupname':'pivotal quantity' 
        control : string optional
            The name of the control method,  default the group with minimum ranking
            
        Returns
        ----------
        comparions : array-like
            Strings identifier of each comparison with format 'group_i vs group_j'
        Z-values : array-like
            The computed Z-value statistic for each comparison.
        p-values : array-like
            The associated p-value from the Z-distribution wich depends on the index of the comparison
        adj_p-values : array-like
            The associated adjusted p-values wich can be compared with a significance level
            
        References
        ----------
        H. Finner, On a monotonicity problem in step-down multiple test procedures, Journal of the American Statistical Association 88 (1993) 920–923.
    """
    
    k = len(pivots)
    values = list(pivots.values())
    keys = list(pivots.keys())
    if not control :
        control_i = values.index(min(values))
    else:
        control_i = keys.index(control)

    comparisons = [keys[control_i] + " vs " + keys[i] for i in range(k) if i != control_i]
    z_values = [abs(values[control_i] - values[i]) for i in range(k) if i != control_i]
    p_values = [2*(1-scipy.stats.norm.cdf(abs(z))) for z in z_values]
    # Sort values by p_value so that p_0 < p_1
    p_values, z_values, comparisons= map(list, zip(*sorted(zip(p_values, z_values, comparisons), key=lambda t: t[0])))
    adj_p_values = [min(max(1-(1-p_values[j])**((k-1)/float(j+1)) for j in range(i+1)), 1) for i in range(k-1)]
    
    return comparisons, z_values, p_values, adj_p_values

In [12]:
import itertools as it

def holm_test(pivots, control=None):
    """
        Performs a Holm post-hoc test using the pivot quantities obtained by a ranking test.
        Tests the hypothesis that the ranking of the control method is different to each of the other methods.
        
        Parameters
        ----------
        pivots : dictionary_like
            A dictionary with format 'groupname':'pivotal quantity' 
        control : string optional
            The name of the control method (one vs all), default None (all vs all) 
            
        Returns
        ----------
        comparions : array-like
            Strings identifier of each comparison with format 'group_i vs group_j'
        Z-values : array-like
            The computed Z-value statistic for each comparison.
        p-values : array-like
            The associated p-value from the Z-distribution wich depends on the index of the comparison
        adj_p-values : array-like
            The associated adjusted p-values wich can be compared with a significance level
            
        References
        ----------
        O.J. S. Holm, A simple sequentially rejective multiple test procedure, Scandinavian Journal of Statistics 6 (1979) 65–70.
    """
    k = len(pivots.keys())
    values = list(pivots.values())
    keys = list(pivots.keys())
    if not control :
        control_i = values.index(min(values))
    else:
        control_i = keys.index(control)

    comparisons = [keys[control_i] + " vs " + keys[i] for i in range(k) if i != control_i]
    z_values = [abs(values[control_i] - values[i]) for i in range(k) if i != control_i]
    p_values = [2*(1-scipy.stats.norm.cdf(abs(z))) for z in z_values]
    
    # Sort values by p_value so that p_0 < p_1
    p_values, z_values, comparisons = map(list, zip(*sorted(zip(p_values, z_values, comparisons), key=lambda t: t[0])))
    adj_p_values = [min(max((k-(j+1))*p_values[j] for j in range(i+1)), 1) for i in range(k-1)]
    
    return comparisons, z_values, p_values, adj_p_values

In [13]:
def bonferroni_dunn_test(pivots, control=None):
    """
        Performs a Bonferroni-Dunn post-hoc test using the pivot quantities obtained by a ranking test.
        Tests the hypothesis that the ranking of the control method is different to each of the other methods.
        
        Parameters
        ----------
        pivots : dictionary_like
            A dictionary with format 'groupname':'pivotal quantity' 
        control : string optional
            The name of the control method (one vs all), default None (all vs all) 
            
        Returns
        ----------
        comparions : array-like
            Strings identifier of each comparison with format 'group_i vs group_j'
        Z-values : array-like
            The computed Z-value statistic for each comparison.
        p-values : array-like
            The associated p-value from the Z-distribution wich depends on the index of the comparison
        adj_p-values : array-like
            The associated adjusted p-values wich can be compared with a significance level
            
        References
        ----------
        O.J. Dunn, Multiple comparisons among means, Journal of the American Statistical Association 56 (1961) 52–64.
    """
    k = len(pivots)
    values = list(pivots.values())
    keys = list(pivots.keys())
    if not control :
        control_i = values.index(min(values))
    else:
        control_i = keys.index(control)

    comparisons = [keys[control_i] + " vs " + keys[i] for i in range(k) if i != control_i]
    z_values = [abs(values[control_i] - values[i]) for i in range(k) if i != control_i]
    p_values = [2*(1-scipy.stats.norm.cdf(abs(z))) for z in z_values]
    # Sort values by p_value so that p_0 < p_1
    p_values, z_values, comparisons = map(list, zip(*sorted(zip(p_values, z_values, comparisons), key=lambda t: t[0])))
    adj_p_values = [min((k-1)*p_value,1) for p_value in p_values]
    
    return comparisons, z_values, p_values, adj_p_values

In [14]:
# Helper functions for performing the statistical tests
def generate_scores(method, method_args, data, labels):
    pairwise_scores = method(data, **method_args) # Matrix for all pairwise comaprisons
    pairwise_scores.set_axis(labels, axis='columns', inplace=True) # Label the cols
    pairwise_scores.set_axis(labels, axis='rows', inplace=True) # Label the rows, note: same label as pairwise combinations
    return pairwise_scores

def pairwise_plotter(scores):
    # Pretty plot of significance
    heatmap_args = {'linewidths': 0.25, 'linecolor': '0.5', 'square': True,
                    'cbar_ax_bbox': [0.80, 0.35, 0.04, 0.3]}

    sp.sign_plot(scores, **heatmap_args)

# Load results as saved during computation

## SCORE METRICS

In [15]:
#load result files as specified, store in one large dictionary
dataset_list = ["bene2", "bene1_nobins","gmsc", "uk", "lendingclub", "hmeq", "australian", "german", "thomas", "pakdd"]
ratios_list = [0.1,0.2,0.3,0.4,0.5] #,0.2,0.3,0.4,0.5
cost_mat_list = [None] #only one cost setup was used, others are discarded due to unreliable results and computation issues
weights_list = ['wF', 'wT']
results_dict = {'wT': {}, 'wF': {}} #load results for parameter weights True/False separately

for w in weights_list:
  for d in dataset_list:
    results_dict[w][d] = {}
    for r in ratios_list:
      for c in cost_mat_list:
        if w == 'wT':
          filename = f"{d}__rounds-5_max_gens-10_AL_acc_rate-{r}_weights-True_cost_matrix-{c}_do_thres-tuned"
        else:
          filename = f"{d}__rounds-5_max_gens-10_AL_acc_rate-{r}_weights-False_cost_matrix-{c}_do_thres-tuned"
          
        filename = filename.replace('.', '-')
        filename = filename.replace('[', '')
        filename = filename.replace(']', '')
        filename = filename.replace(',', '-')
        filename = filename.replace(' ', '')

        infile = open(f'model_results/{d}/{filename}','rb')

        rname = f'r{r}'.replace('.', '')

        results_dict[w][d][rname] = pickle.load(infile)

        infile.close()
        #if fname not in configs_list:
        #  configs_list.append(fname)

#print(configs_list)

In [30]:
results_dict['wT']['uk']['r04']['score']['average']

Unnamed: 0,generation,n_accepts,n_rejects,bad_ratio_accepts,bad_ratio_rejects,n_score_accepts,n_AL_selects,bad_ratio_score_accepts,bad_ratio_AL_selects,roc_auc,brier,h-measure,average_precision,balanced_accuracy,pcc,precision,recall,f1,fpr,fnr,tn,fp,fn,tp
0,0.0,,,0.0412,0.03976,,,,,0.665934,0.038477,0.098402,0.080238,0.616615,0.489425,0.065288,0.59,0.117221,0.356771,0.41,3705.0,2055.0,98.4,141.6
1,1.0,827.0,173.0,0.03516,0.0685,196.4,130.6,0.01576,0.04026,0.688893,0.038319,0.118638,0.088162,0.634705,0.456539,0.066319,0.658333,0.120363,0.388924,0.341667,3519.8,2240.2,82.0,158.0
2,2.0,1127.4,372.6,0.03182,0.06744,376.8,250.6,0.01516,0.0375,0.693977,0.038338,0.124531,0.089314,0.645104,0.453389,0.06806,0.683333,0.123653,0.393125,0.316667,3495.6,2264.4,76.0,164.0
3,3.0,1427.0,573.0,0.03014,0.0661,556.6,370.4,0.01412,0.03874,0.704343,0.038293,0.134472,0.093751,0.64816,0.447494,0.068082,0.695,0.123939,0.398681,0.305,3463.6,2296.4,73.2,166.8
4,4.0,1722.4,777.6,0.0275,0.06902,734.0,488.4,0.01284,0.03506,0.707413,0.038233,0.139494,0.095411,0.65276,0.460118,0.069799,0.691667,0.12674,0.386146,0.308333,3535.8,2224.2,74.0,166.0
5,5.0,2021.0,979.0,0.02592,0.0703,913.2,607.8,0.01182,0.0342,0.714296,0.038214,0.145548,0.097769,0.656736,0.463674,0.070781,0.696667,0.128417,0.383194,0.303333,3552.8,2207.2,72.8,167.2
6,6.0,2314.2,1185.8,0.02594,0.06846,1089.2,725.0,0.01194,0.03618,0.715932,0.038256,0.149107,0.098536,0.661076,0.455899,0.070896,0.713333,0.1289,0.391181,0.286667,3506.8,2253.2,68.8,171.2
7,7.0,2607.2,1392.8,0.02498,0.06922,1265.2,842.0,0.0112,0.03582,0.719342,0.038188,0.151049,0.099975,0.66474,0.457553,0.071841,0.72,0.130504,0.390521,0.28,3510.6,2249.4,67.2,172.8
8,8.0,2900.6,1599.4,0.02442,0.06916,1441.4,959.2,0.01092,0.03574,0.720463,0.038262,0.152377,0.099732,0.66217,0.453565,0.071047,0.718333,0.129201,0.393993,0.281667,3490.6,2269.4,67.6,172.4
9,9.0,3194.0,1806.0,0.02398,0.06918,1617.6,1076.4,0.01084,0.03554,0.724194,0.038173,0.156084,0.100627,0.666962,0.456082,0.072116,0.725833,0.131079,0.39191,0.274167,3502.6,2257.4,65.8,174.2


## COST METRICS

In [16]:
#load result files as specified, store in one large dictionary
dataset_list = ["bene2", "bene1_nobins","gmsc", "uk", "lendingclub", "hmeq", "australian", "german", "thomas", "pakdd"]
cost_dict = {'wT': {}, 'wF': {}} #load results for parameter weights True/False separately
#configs_list = []

for w in weights_list:
  for d in dataset_list:
    cost_dict[w][d] = {}
    for r in ratios_list:
      for c in cost_mat_list:
        if w == 'wT':
          filename = f"{d}__rounds-5_max_gens-10_AL_acc_rate-{r}_weights-True_cost_matrix-{c}_do_thres-tuned"
        else:
          filename = f"{d}__rounds-5_max_gens-10_AL_acc_rate-{r}_weights-False_cost_matrix-{c}_do_thres-tuned"
          
        filename = filename.replace('.', '-')
        filename = filename.replace('[', '')
        filename = filename.replace(']', '')
        filename = filename.replace(',', '-')
        filename = filename.replace(' ', '')

        infile = open(f'model_cost/{d}/{filename}','rb')

        rname = f'r{r}'.replace('.', '')

        cost_dict[w][d][rname] = pickle.load(infile)

        infile.close()
        #if fname not in configs_list:
        #  configs_list.append(fname)

#print(configs_list)

In [None]:
cost_dict['wT']['bene2']['r05']['bmdr']['average']

Unnamed: 0,generation,gen_internal_cost,gen_internal_cpl,total_internal_cost,total_internal_cpl,model_internal_cost,model_internal_cpl,external_cost,external_cpl
0,0.0,340.2,0.6804,340.2,0.6804,0.0,,635.2,0.441725
1,1.0,285.533333,0.571067,625.733333,0.625733,285.533333,0.571067,611.133333,0.424988
2,2.0,271.466667,0.542933,897.2,0.598133,557.0,0.557,599.4,0.416829
3,3.0,285.333333,0.570667,1182.533333,0.591267,842.333333,0.561556,587.266667,0.408391
4,4.0,276.2,0.5524,1458.733333,0.583493,1118.533333,0.559267,587.533333,0.408577
5,5.0,275.0,0.55,1733.733333,0.577911,1393.533333,0.557413,578.133333,0.40204
6,6.0,285.266667,0.570533,2019.0,0.576857,1678.8,0.5596,582.8,0.405285
7,7.0,282.333333,0.564667,2301.333333,0.575333,1961.133333,0.560324,578.6,0.402364
8,8.0,298.333333,0.596667,2599.666667,0.577704,2259.466667,0.564867,582.6,0.405146
9,9.0,292.866667,0.585733,2892.533333,0.578507,2552.333333,0.567185,582.333333,0.404961


## Build table template to fill with aggregated results

In [17]:
# ALT: EVERY PARAMETER COMBINATION AS OWN CLASSIFER
#model_list = ['oracle','score', 'random', 'unc', 'qbc', 'dw', 'cors', 'density', 'eer', 'lal', 'quire', 'bmdr', 'spal']
model_list = ['score', 'random', 'unc', 'qbc', 'dw', 'cors', 'density', 'eer', 'lal', 'quire', 'bmdr', 'spal']
performance_metrics = ['roc_auc', 'brier', 'h-measure', 'pcc', 'balanced_accuracy', 'f1','average_precision', 'precision', 'recall', 
                       'fpr', 'fnr', 'fn', 'fp', 'tp','tn', 'bad_ratio_accepts', 'bad_ratio_rejects', 'bad_ratio_AL_selects', 'bad_ratio_score_accepts'
]
cost_metrics = ['total_internal_cost','total_internal_cpl','external_cost','external_cpl']

# built list of column names
c_names = model_list

#column_names = ['score']
column_names = []

index_names = []

#################################
for m in model_list:
  for r in ["r01","r02","r03","r04","r05"]:
    for w in weights_list:
      column_names.append(f'{m}_{w}_{r}')
  #if m != 'score':
    #column_names.append(f'{m}_r05')
#################################

for d in dataset_list:
  for s in performance_metrics:
    for g in range(10):
      index_names.append(f'{d}_{s}_g{g+1}')
  for c in cost_metrics:
    for g in range(10):
      index_names.append(f'{d}_{c}_g{g+1}')

results_template = pd.DataFrame(columns=column_names, index=index_names)
#results_template

## Fill in results

In [18]:
# fill in values from results data
performance = copy.deepcopy(results_template)

for w in ['wT', 'wF']:
  for r in ["r01","r02","r03","r04","r05"]:
    for s in performance_metrics:
      for d in dataset_list:
        for g in range(10):
          for m in model_list:
            #print('RESULTS',w,r,s,d,g,m)
            performance.at[f'{d}_{s}_g{g+1}', f'{m}_{w}_{r}'] = results_dict[w][d][r][m]['average'][s][g+1]

    for c in cost_metrics:
      for d in dataset_list:
        for g in range(10):
          for m in model_list:
            #print('COST',w,r,c,d,g,m)

            performance.at[f'{d}_{c}_g{g+1}', f'{m}_{w}_{r}'] = cost_dict[w][d][r][m]['average'][c][g+1]


model_list = ['score', 'random', 'unc', 'qbc', 'dw', 'cors', 'density', 'eer', 'lal', 'quire', 'bmdr', 'spal']

namelist=[]
for d in dataset_list:
  for g in range(10):
    namelist.append(f'{d}_total_cost_g{g+1}')
  
data = performance.loc[performance.index.str.contains('total_internal_cost')].values + performance.loc[performance.index.str.contains('external_cost')].values
total_cost = pd.DataFrame(data,index=namelist, columns = performance.columns)

performance = pd.concat((performance, total_cost))



##################################

#for m in model_list:
  #if m == 'score':
    #performance.loc[:,'score'] = performance.filter(regex=f'{m}_').mean(axis=1)
  #else:
    #performance.loc[:,f'{m}_r05'] = performance.filter(regex=f'{m}_.*_r05').mean(axis=1) 
    #no need to differentiate between wT and wF when r = 0.5, since the dynamic (inversely proportional) sample weights will all be 1
    #(no difference between score and AL-selections in terms of sample weights)

'''#remove redundant columns (score model and any wT_r05 or wF_r05) before ranking models in order to not dilute ranks
for m in model_list:
  for w in ['wT', 'wF']:
    if m == 'score':
      for r in ["r01","r02","r03","r04","r05"]:
        performance = performance.drop(f'{m}_{w}_{r}', axis=1)
    else:    
      performance = performance.drop(f'{m}_{w}_r05', axis=1)

#build ranks
performance_ranks = performance.rank(ascending=False, axis="columns")
##invert ranks for metrics where less is better
performance_ranks.loc[performance_ranks.index.str.contains('brier|cost|cpl')] = performance.loc[performance.index.str.contains('brier|cost|cpl')].rank(ascending=True, axis="columns")

# make sure each score column has same values (re-adds columns, all with the same values and ranks)
for m in model_list:
  for w in ['wT', 'wF']:
    if m == 'score':
      for r in ["r01","r02","r03","r04","r05"]:
        performance[f'{m}_{w}_{r}'] = performance.loc[:,f'{m}']
        performance_ranks[f'{m}_{w}_{r}'] = performance_ranks.loc[:,f'{m}']
    else:
      performance[f'{m}_{w}_r05'] = performance.loc[:,f'{m}_r05']
      performance_ranks[f'{m}_{w}_r05'] = performance_ranks.loc[:,f'{m}_r05']'''


#performance = performance.reindex(sorted(performance.columns), axis=1)
#performance_ranks = performance_ranks.reindex(sorted(performance_ranks.columns), axis=1)

#performance_ranks.loc['mean_ranks']=(performance_ranks.loc[~performance_ranks.index.str.contains('internal')].mean())
#performance_ranks.loc['cost_ranks']=(performance_ranks.loc[performance_ranks.index.str.contains('external_cpl')].mean())

'#remove redundant columns (score model and any wT_r05 or wF_r05) before ranking models in order to not dilute ranks\nfor m in model_list:\n  for w in [\'wT\', \'wF\']:\n    if m == \'score\':\n      for r in ["r01","r02","r03","r04","r05"]:\n        performance = performance.drop(f\'{m}_{w}_{r}\', axis=1)\n    else:    \n      performance = performance.drop(f\'{m}_{w}_r05\', axis=1)\n\n#build ranks\nperformance_ranks = performance.rank(ascending=False, axis="columns")\n##invert ranks for metrics where less is better\nperformance_ranks.loc[performance_ranks.index.str.contains(\'brier|cost|cpl\')] = performance.loc[performance.index.str.contains(\'brier|cost|cpl\')].rank(ascending=True, axis="columns")\n\n# make sure each score column has same values (re-adds columns, all with the same values and ranks)\nfor m in model_list:\n  for w in [\'wT\', \'wF\']:\n    if m == \'score\':\n      for r in ["r01","r02","r03","r04","r05"]:\n        performance[f\'{m}_{w}_{r}\'] = performance.loc[:,f\

In [31]:
performance

Unnamed: 0,score_wF_r01,score_wT_r01,score_wF_r02,score_wT_r02,score_wF_r03,score_wT_r03,score_wF_r04,score_wT_r04,score_wF_r05,score_wT_r05,random_wF_r01,random_wT_r01,random_wF_r02,random_wT_r02,random_wF_r03,random_wT_r03,random_wF_r04,random_wT_r04,random_wF_r05,random_wT_r05,unc_wF_r01,unc_wT_r01,unc_wF_r02,unc_wT_r02,unc_wF_r03,unc_wT_r03,unc_wF_r04,unc_wT_r04,unc_wF_r05,unc_wT_r05,qbc_wF_r01,qbc_wT_r01,qbc_wF_r02,qbc_wT_r02,qbc_wF_r03,qbc_wT_r03,qbc_wF_r04,qbc_wT_r04,qbc_wF_r05,qbc_wT_r05,dw_wF_r01,dw_wT_r01,dw_wF_r02,dw_wT_r02,dw_wF_r03,dw_wT_r03,dw_wF_r04,dw_wT_r04,dw_wF_r05,dw_wT_r05,cors_wF_r01,cors_wT_r01,cors_wF_r02,cors_wT_r02,cors_wF_r03,cors_wT_r03,cors_wF_r04,cors_wT_r04,cors_wF_r05,cors_wT_r05,density_wF_r01,density_wT_r01,density_wF_r02,density_wT_r02,density_wF_r03,density_wT_r03,density_wF_r04,density_wT_r04,density_wF_r05,density_wT_r05,eer_wF_r01,eer_wT_r01,eer_wF_r02,eer_wT_r02,eer_wF_r03,eer_wT_r03,eer_wF_r04,eer_wT_r04,eer_wF_r05,eer_wT_r05,lal_wF_r01,lal_wT_r01,lal_wF_r02,lal_wT_r02,lal_wF_r03,lal_wT_r03,lal_wF_r04,lal_wT_r04,lal_wF_r05,lal_wT_r05,quire_wF_r01,quire_wT_r01,quire_wF_r02,quire_wT_r02,quire_wF_r03,quire_wT_r03,quire_wF_r04,quire_wT_r04,quire_wF_r05,quire_wT_r05,bmdr_wF_r01,bmdr_wT_r01,bmdr_wF_r02,bmdr_wT_r02,bmdr_wF_r03,bmdr_wT_r03,bmdr_wF_r04,bmdr_wT_r04,bmdr_wF_r05,bmdr_wT_r05,spal_wF_r01,spal_wT_r01,spal_wF_r02,spal_wT_r02,spal_wF_r03,spal_wT_r03,spal_wF_r04,spal_wT_r04,spal_wF_r05,spal_wT_r05
bene2_roc_auc_g1,0.770134,0.770134,0.770134,0.770134,0.770134,0.770134,0.770134,0.770134,0.770082,0.770082,0.767747,0.75766,0.765351,0.762809,0.765912,0.765291,0.768434,0.770053,0.766735,0.766735,0.767798,0.750652,0.766122,0.758728,0.76472,0.761167,0.7629,0.761408,0.763335,0.763335,0.767436,0.747718,0.768637,0.766165,0.766063,0.765882,0.766648,0.767638,0.76609,0.76609,0.768474,0.755805,0.767421,0.761677,0.768591,0.767251,0.765573,0.765123,0.765673,0.765673,0.768265,0.762062,0.76813,0.768763,0.767404,0.769437,0.766213,0.767252,0.767408,0.767408,0.769048,0.759986,0.768379,0.76644,0.766077,0.766159,0.764996,0.765659,0.766311,0.766311,0.767729,0.759596,0.7666,0.75995,0.765622,0.763754,0.764384,0.764172,0.76599,0.76599,0.767508,0.756126,0.766537,0.75872,0.764037,0.761091,0.763362,0.762768,0.766262,0.766262,0.769937,0.761934,0.770131,0.770659,0.770609,0.772793,0.768863,0.770582,0.769513,0.769513,0.768116,0.757634,0.767051,0.766555,0.766267,0.765886,0.767027,0.767981,0.766488,0.766488,0.769222,0.756252,0.76928,0.767288,0.76818,0.769077,0.767599,0.768919,0.768918,0.768918
bene2_roc_auc_g2,0.773295,0.773295,0.773295,0.773295,0.773295,0.773295,0.773295,0.773295,0.773194,0.773194,0.771066,0.759129,0.771131,0.769283,0.768145,0.767009,0.772394,0.774078,0.769033,0.769033,0.770211,0.750833,0.767061,0.760285,0.766643,0.761857,0.763866,0.762821,0.765062,0.765062,0.772309,0.75276,0.772539,0.772594,0.770982,0.77057,0.771964,0.772542,0.772049,0.772049,0.772434,0.763504,0.772012,0.771408,0.772985,0.773864,0.769669,0.768707,0.769614,0.769614,0.772245,0.767165,0.771718,0.769895,0.771121,0.773035,0.770406,0.770904,0.770023,0.770023,0.772771,0.76309,0.771735,0.769427,0.771198,0.770368,0.769735,0.770878,0.771917,0.771917,0.772385,0.753582,0.76818,0.761215,0.767141,0.763205,0.767551,0.765734,0.766596,0.766596,0.770868,0.753799,0.770842,0.764438,0.767622,0.763372,0.766561,0.762803,0.766023,0.766023,0.774691,0.768026,0.775642,0.778091,0.775274,0.777268,0.77416,0.775654,0.775512,0.775512,0.772747,0.763813,0.773589,0.771881,0.771162,0.770602,0.770561,0.771204,0.771724,0.771724,0.773091,0.760901,0.773504,0.773841,0.773904,0.775625,0.773597,0.775472,0.774827,0.774827
bene2_roc_auc_g3,0.774968,0.774968,0.774997,0.774997,0.774968,0.774968,0.774997,0.774997,0.774919,0.774919,0.775039,0.759609,0.775088,0.770805,0.772686,0.7733,0.775547,0.776794,0.772843,0.772843,0.773135,0.755188,0.77124,0.76341,0.771912,0.768676,0.767823,0.767843,0.770226,0.770226,0.777311,0.762228,0.777784,0.776728,0.775326,0.776882,0.776207,0.777724,0.775286,0.775286,0.775666,0.768734,0.775362,0.774964,0.776215,0.776508,0.771985,0.771022,0.77281,0.77281,0.774135,0.769751,0.774587,0.774568,0.773756,0.775027,0.773762,0.774993,0.774419,0.774419,0.774936,0.7643,0.774752,0.771647,0.775571,0.774335,0.775191,0.775972,0.776629,0.776629,0.776856,0.756038,0.77145,0.769819,0.771012,0.769093,0.771687,0.769793,0.771692,0.771692,0.773511,0.757719,0.773899,0.771775,0.772305,0.768377,0.771516,0.767873,0.771192,0.771192,0.778368,0.768222,0.779612,0.780107,0.779302,0.781024,0.777917,0.779375,0.778633,0.778633,0.77666,0.766637,0.776334,0.772368,0.775011,0.774499,0.774692,0.774489,0.774873,0.774873,0.777249,0.772406,0.778373,0.779047,0.778338,0.779853,0.777401,0.779112,0.777921,0.777921
bene2_roc_auc_g4,0.776541,0.776541,0.776572,0.776572,0.776541,0.776541,0.776572,0.776572,0.776527,0.776527,0.777255,0.762208,0.77585,0.774743,0.776291,0.777898,0.776988,0.776804,0.775395,0.775395,0.774573,0.757333,0.773001,0.767688,0.773502,0.770496,0.77096,0.770662,0.773021,0.773021,0.778015,0.766959,0.77771,0.778038,0.775788,0.778197,0.777231,0.779742,0.777718,0.777718,0.777304,0.772593,0.777198,0.777038,0.777221,0.776999,0.774787,0.773833,0.775022,0.775022,0.776146,0.772871,0.776981,0.777921,0.776025,0.777588,0.776744,0.778676,0.777582,0.777582,0.77573,0.766053,0.774975,0.773407,0.77596,0.776715,0.776654,0.777604,0.77751,0.77751,0.777848,0.757188,0.772713,0.773283,0.772191,0.77005,0.772507,0.770707,0.77277,0.77277,0.77541,0.766974,0.774997,0.77377,0.77461,0.774579,0.774673,0.772608,0.773299,0.773299,0.778291,0.765996,0.778869,0.778219,0.779095,0.779923,0.778266,0.779612,0.779869,0.779869,0.777317,0.765155,0.777693,0.772637,0.775922,0.776318,0.776201,0.775966,0.776164,0.776164,0.77742,0.77164,0.778835,0.778399,0.779206,0.780459,0.778877,0.780487,0.77984,0.77984
bene2_roc_auc_g5,0.779402,0.779402,0.779517,0.779517,0.779402,0.779402,0.779517,0.779517,0.779528,0.779528,0.780345,0.771234,0.778946,0.777052,0.779984,0.78197,0.780194,0.778562,0.777919,0.777919,0.777172,0.760331,0.775314,0.769375,0.77433,0.772312,0.772567,0.772572,0.774509,0.774509,0.781894,0.768694,0.781103,0.780991,0.778844,0.781535,0.780567,0.78283,0.780545,0.780545,0.779583,0.773427,0.779412,0.779922,0.778637,0.778918,0.77602,0.775468,0.776781,0.776781,0.779139,0.773103,0.780066,0.779628,0.778441,0.7792,0.779513,0.780818,0.779564,0.779564,0.778156,0.769277,0.777624,0.773825,0.778365,0.778345,0.779523,0.780037,0.779608,0.779608,0.779353,0.760051,0.774769,0.77477,0.77405,0.773717,0.774208,0.772567,0.774089,0.774089,0.778254,0.768548,0.77878,0.775732,0.77677,0.777587,0.7787,0.776344,0.775686,0.775686,0.780821,0.767944,0.780936,0.780045,0.782227,0.783053,0.781615,0.783002,0.782529,0.782529,0.780599,0.769543,0.781019,0.778707,0.779402,0.779733,0.778587,0.77882,0.779108,0.779108,0.779826,0.771819,0.780396,0.778879,0.782311,0.783386,0.782203,0.784158,0.783343,0.783343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pakdd_total_cost_g6,8670.512537,8670.512537,8668.447036,8668.447036,8673.078476,8673.078476,8668.847036,8668.847036,8685.452281,8685.452281,8995.493275,9054.394893,8960.15528,8967.212284,8720.751906,8744.381689,8725.587777,8801.392179,9014.607729,9014.607729,8747.867203,9151.650993,8763.486496,9000.078874,8776.713626,8910.043034,8843.548562,8890.812376,8870.134637,8870.134637,8703.641822,9079.167012,8658.379879,8922.47435,8774.75281,8828.882156,9032.7658,8786.714439,8751.644598,8751.644598,8700.195583,9070.125205,8737.807883,8985.211625,8766.128456,8897.253493,8842.885684,8860.953869,8896.529361,8896.529361,8659.502109,8943.484871,8696.820029,8857.529361,8690.916126,8749.849843,8746.544099,8785.825335,8745.247159,8745.247159,8683.699456,8977.291772,8747.609508,8844.689932,8783.109946,8814.206978,8810.389464,8831.917905,8822.000859,8822.000859,8734.687309,9085.614186,8759.199141,9042.210628,8780.69352,8967.801886,9051.200859,9098.192148,8912.564328,8912.564328,8722.095085,9029.763515,8726.309102,8869.452496,8708.256744,8831.435013,9044.124745,8802.233264,8826.821839,8826.821839,8716.137014,8856.034169,8689.562365,8828.103075,8725.246223,8792.089088,8751.303888,8772.17379,8803.859367,8803.859367,8669.309946,9114.798328,8732.051438,8884.499609,8755.56761,8801.04107,8806.765018,8853.903044,8853.511755,8853.511755,8703.10033,8920.961675,8715.65406,8894.231485,8720.41391,8770.455026,8744.119592,8953.474979,8760.990369,8760.990369
pakdd_total_cost_g7,8947.294211,8947.294211,8964.868016,8964.868016,8951.427835,8951.427835,8963.901204,8963.901204,8980.665394,8980.665394,9077.383314,9299.251714,9038.313473,9294.328456,9063.520497,9292.50217,9269.830488,9113.747657,9365.529147,9365.529147,9059.69168,9448.541001,9110.741538,9318.049214,9146.282218,9220.875378,9213.725021,9234.595829,9242.518089,9242.518089,9241.787624,9352.228173,9109.433111,9252.812407,9075.288214,9148.351185,9392.540511,9102.896986,9280.842757,9280.842757,9012.821808,9411.152335,9080.698735,9348.671912,9102.691803,9208.065762,9200.358209,9191.26133,9248.939008,9248.939008,8977.526585,9280.751277,9014.283813,9144.714531,9014.869857,9120.267326,9071.931892,9110.106572,9063.669013,9063.669013,8991.691619,9294.942443,9065.999049,9151.988651,9089.904731,9146.27251,9158.159804,9189.953746,9145.094333,9145.094333,9043.842788,9369.441285,9102.369481,9413.336416,9116.398359,9331.338164,9188.774818,9215.333793,9233.536416,9233.536416,9018.31082,9331.291051,9032.317874,9155.140756,9056.267265,9169.199233,9167.444536,9380.783376,9136.765954,9136.765954,9007.495115,9142.535511,9032.827897,9131.331048,9037.073323,9084.724055,9081.89168,9078.102201,9109.620965,9109.620965,8988.171107,9307.388866,9276.059305,9230.800951,9103.836638,9138.662925,9116.428801,9129.095207,9168.979035,9168.979035,9011.527429,9207.033326,9047.850594,9169.739821,9063.103014,9111.229737,9095.754559,9082.436638,9119.427958,9119.427958
pakdd_total_cost_g8,9231.717813,9231.717813,9231.222621,9231.222621,9226.116065,9226.116065,9231.422621,9231.422621,9252.722184,9252.722184,9439.73795,9601.571383,9374.991243,9567.905789,9366.780784,9573.713596,9614.37376,9460.484809,9516.153715,9516.153715,9348.329269,9740.076405,9417.394832,9629.783744,9423.894456,9576.614278,9527.555617,9607.221179,9713.191427,9713.191427,9334.836638,9669.704194,9456.055901,9548.051744,9361.30832,9501.159957,9480.35462,9436.280439,9547.414907,9547.414907,9336.793896,9738.164604,9364.08915,9645.988114,9437.713657,9541.099854,9510.380063,9561.377532,9582.035105,9582.035105,9286.953685,9580.047374,9327.66636,9476.242067,9338.157212,9449.702293,9386.958117,9454.276911,9402.001794,9402.001794,9291.958025,9605.172165,9378.026148,9490.74413,9409.893398,9476.763362,9504.12315,9493.769481,9449.583437,9449.583437,9348.536263,9688.499824,9433.031516,9827.970478,9441.955586,9636.536447,9506.146032,9522.431639,9580.945188,9580.945188,9330.181597,9633.450027,9334.738854,9478.718089,9332.119684,9443.063891,9499.474258,9452.942474,9688.801733,9688.801733,9292.794709,9500.488367,9314.530113,9474.814063,9355.147596,9427.873445,9426.227084,9417.572978,9462.366858,9462.366858,9288.414815,9601.723426,9379.85587,9516.850778,9386.482095,9459.347688,9462.653309,9440.380845,9471.558117,9471.558117,9280.086404,9512.576597,9326.917,9453.394862,9355.867234,9439.409693,9405.185155,9416.425366,9459.145909,9459.145909
pakdd_total_cost_g9,9521.408665,9521.408665,9519.779848,9519.779848,9522.841853,9522.841853,9519.779848,9519.779848,9553.743164,9553.743164,9644.97376,9896.448279,9653.682969,9870.551215,9645.482126,9748.502293,9716.765141,9741.146845,9867.091741,9867.091741,9609.352872,10033.629292,9722.930235,9927.180278,9754.968729,9884.41256,9855.989709,9890.793267,9923.575784,9923.575784,9620.997362,9967.09898,9629.550249,9834.126892,9784.166858,9803.256928,9759.152902,9742.834169,9803.285684,9803.285684,9668.182563,10017.869443,9671.883061,9899.452772,9759.151246,9830.736569,9834.186213,9872.378437,10180.186121,10180.186121,9574.283843,9846.213343,9612.067265,9741.505851,9624.756368,9742.959957,9693.435887,9697.27869,9701.34594,9701.34594,9582.951936,9878.965202,9672.289932,9779.737167,9731.468982,9756.031516,9754.925803,9776.006196,9790.164267,9790.164267,9622.9293,10001.219492,9768.03854,10009.042159,9997.77126,9981.350464,9820.442129,9884.639537,9901.520773,9901.520773,9631.204325,9934.415996,9612.435825,9776.51812,9666.010536,9754.500606,9793.708351,9974.686496,9800.634138,9800.634138,9571.892554,9934.642941,9607.892585,9750.679595,9653.420995,9735.122399,9729.400046,9959.085561,9766.065578,9766.065578,9578.008726,9941.009907,9665.081221,9852.49099,9691.848064,9744.471728,9723.943287,9739.547688,9788.428019,9788.428019,9577.416563,9808.177502,9625.807852,9778.079626,9895.371981,9731.581781,9711.221869,9953.263737,9766.749436,9766.749436


In [None]:
'''model_list = ['score', 'random', 'unc', 'qbc', 'dw', 'cors', 'density', 'eer', 'lal', 'quire', 'bmdr', 'spal']

namelist=[]
for d in dataset_list:
  for g in range(10):
    namelist.append(f'{d}_total_cost_g{g+1}')
  
dato = performance.loc[performance.index.str.contains('total_internal_cost')].values + performance.loc[performance.index.str.contains('external_cost')].values
print(np.shape(dato))
total_cost = pd.DataFrame(dato,index=namelist, columns = performance.columns)

performance = pd.concat((performance, total_cost))'''

In [None]:
performance.loc[performance.index.str.contains('bene2_total_internal_cost')]

In [None]:
#performance.filter(regex=f'.*_r05')
performance.filter(regex=f'score')

In [None]:
#performance.loc[performance_ranks.index.str.contains('balance|external_cpl')].T

In [None]:
#testr = performance.filter(regex='r05').filter(regex='spal')
#np.where(testr['spal_wF_r05'] != testr['spal_wT_r05'])

In [None]:
#testr.iloc[213]

In [None]:
performance

# Performance and Ranks (AVERAGE SCORES)

## Prepare Data

In [19]:
"""
  Friedman Test with post-hoc test
"""

#metrics_list = ['roc_auc', 'brier', 'h-measure', 'pcc', 'balanced_accuracy', 'f1','total_internal_cost','total_internal_cpl','external_cost','external_cpl', 'total_cost']
#metrics_list = ['roc_auc', 'brier', 'h-measure', 'pcc', 'balanced_accuracy', 'f1','total_internal_cost','total_internal_cpl','external_cost','external_cpl', 'total_cost']
metrics_list = ['roc_auc', 'brier', 'h-measure', 'pcc', 'f1', 'average_precision', 'precision', 'recall', 'fpr', 'fnr', 'fn','fp','bad_ratio_score_accepts', 'bad_ratio_AL_selects','bad_ratio_accepts', 'bad_ratio_rejects','external_cpl', 'total_internal_cpl']
#metrics_list = ['roc_auc', 'brier', 'h-measure', 'pcc', 'f1', 'external_cpl', 'total_internal_cpl']

dataset_list = ["australian", "bene1_nobins","bene2","german","gmsc", "hmeq","lendingclub", "pakdd", "uk", 'thomas']

#############################################

#BUILD SUBSET OF RESULTS, based on metric and dataset choice
#then create a list of best-performing models, selecting each models' best-performing setup – either overall, or within a specified weight setup

# create list of row names (indices), depending on choice of datasets and metrics
index_names = ''
for i in dataset_list:
  for j in metrics_list:
    index_names += f'{i}_{j}|'
index_names = index_names[:len(index_names)-1] #remove last | symbol
print(index_names)


#store subset of performance and rank dataframes (selected lines only)
performance_sub = performance.loc[performance.index.str.contains(index_names)]
#performance_sub = performance.loc[performance.index.str.contains('g1|g2|g3|g4|g5')]
#performance_sub = performance.loc[performance.index.str.contains('g6|g7|g8|g9|g10')]

###############
# AUGMENT DATA SUBSET for running tests

for m in model_list:
  if m == 'score':
    performance_sub['score'] = performance_sub.filter(regex=f'{m}_').mean(axis=1)
  #if m != 'score':
    #performance_sub.loc[:,f'{m}_r05'] = performance.filter(regex=f'{m}_.*_r05').mean(axis=1) 
    #no need to differentiate between wT and wF when r = 0.5, since the dynamic (inversely proportional) sample weights will all be 1
    #(no difference between score and AL-selections in terms of sample weights)


for m in model_list:
  for w in ['wT', 'wF']:
    if m == 'score':
      for r in ["r01","r02","r03","r04","r05"]:
        performance_sub = performance_sub.drop(f'{m}_{w}_{r}', axis=1)
    #if m != 'score':    
      #performance_sub = performance_sub.drop(f'{m}_{w}_r05', axis=1)

performance_sub = performance_sub.reindex(sorted(performance_sub.columns), axis=1)


###############
# BUILD RANKS after augmentation

performance_sub_ranks = performance_sub.rank(ascending=False, axis="columns")
##invert ranks for metrics where less is better
performance_sub_ranks.loc[~performance_sub_ranks.index.str.contains('roc_auc|h-measure|pcc|f1|average_precision|precision|recall|bad_ratio_rejects')] = performance_sub.loc[performance_sub.index.str.contains('brier|cost|cpl')].rank(ascending=True, axis="columns")
#performance_sub_ranks.loc[performance_sub_ranks.index.str.contains('brier|cost|cpl|bad_ratio_accepts|fnr|fpr|fn|fr|bad_ratio_score_accepts|bad_ratio_AL_selects')] = performance_sub.loc[performance_sub.index.str.contains('brier|cost|cpl')].rank(ascending=True, axis="columns")
#performance_sub_ranks = performance_ranks.loc[performance.index.str.contains(index_names)]

#before picking best-performing models for the overall comparison setup, drop any columns (models) that are redundant


performance_sub_ranks.loc['mean_ranks']=(performance_sub_ranks.loc[~performance_sub_ranks.index.str.contains('internal')].mean())
performance_sub_ranks.loc['cost_ranks']=(performance_sub_ranks.loc[performance_sub_ranks.index.str.contains('external_cpl')].mean())



###############
# get best-performing models, determined by average rank over datasets
# three setups: overall best, best by either weight setup
# option to pick best models either by averaging scores except internal cost, or by just picking external cost

#rank_choice = 'mean_ranks'
rank_choice = 'cost_ranks'

best_performers = {}
namelist = ['score']
#namelist = []
for m in model_list:
  #print(m)
  if m != 'score':
    #print(performance_sub_ranks.filter(regex=f'{m}').loc[rank_choice])
    #print(performance_sub_ranks.filter(regex=f'{m}').loc[rank_choice].idxmin())
    namelist.append(performance_sub_ranks.filter(regex=f'{m}').loc[rank_choice].idxmin())
best_performers['overall'] = namelist

for w in ['wT', 'wF']:
  namelist = ['score']
  #namelist = []
  for m in model_list:
    #print(m)
    if m != 'score':
      #print(performance_sub_ranks.filter(regex=f'{m}_{w}').loc[rank_choice])
      #print(performance_sub_ranks.filter(regex=f'{m}_{w}').loc[rank_choice].idxmin())
      namelist.append(performance_sub_ranks.filter(regex=f'{m}_{w}').loc[rank_choice].idxmin())
  best_performers[w] = namelist

for i in best_performers.keys():
  print(best_performers[i])

performance_sub_ranks

australian_roc_auc|australian_brier|australian_h-measure|australian_pcc|australian_f1|australian_average_precision|australian_precision|australian_recall|australian_fpr|australian_fnr|australian_fn|australian_fp|australian_bad_ratio_score_accepts|australian_bad_ratio_AL_selects|australian_bad_ratio_accepts|australian_bad_ratio_rejects|australian_external_cpl|australian_total_internal_cpl|bene1_nobins_roc_auc|bene1_nobins_brier|bene1_nobins_h-measure|bene1_nobins_pcc|bene1_nobins_f1|bene1_nobins_average_precision|bene1_nobins_precision|bene1_nobins_recall|bene1_nobins_fpr|bene1_nobins_fnr|bene1_nobins_fn|bene1_nobins_fp|bene1_nobins_bad_ratio_score_accepts|bene1_nobins_bad_ratio_AL_selects|bene1_nobins_bad_ratio_accepts|bene1_nobins_bad_ratio_rejects|bene1_nobins_external_cpl|bene1_nobins_total_internal_cpl|bene2_roc_auc|bene2_brier|bene2_h-measure|bene2_pcc|bene2_f1|bene2_average_precision|bene2_precision|bene2_recall|bene2_fpr|bene2_fnr|bene2_fn|bene2_fp|bene2_bad_ratio_score_accepts|

Unnamed: 0,bmdr_wF_r01,bmdr_wF_r02,bmdr_wF_r03,bmdr_wF_r04,bmdr_wF_r05,bmdr_wT_r01,bmdr_wT_r02,bmdr_wT_r03,bmdr_wT_r04,bmdr_wT_r05,cors_wF_r01,cors_wF_r02,cors_wF_r03,cors_wF_r04,cors_wF_r05,cors_wT_r01,cors_wT_r02,cors_wT_r03,cors_wT_r04,cors_wT_r05,density_wF_r01,density_wF_r02,density_wF_r03,density_wF_r04,density_wF_r05,density_wT_r01,density_wT_r02,density_wT_r03,density_wT_r04,density_wT_r05,dw_wF_r01,dw_wF_r02,dw_wF_r03,dw_wF_r04,dw_wF_r05,dw_wT_r01,dw_wT_r02,dw_wT_r03,dw_wT_r04,dw_wT_r05,eer_wF_r01,eer_wF_r02,eer_wF_r03,eer_wF_r04,eer_wF_r05,eer_wT_r01,eer_wT_r02,eer_wT_r03,eer_wT_r04,eer_wT_r05,lal_wF_r01,lal_wF_r02,lal_wF_r03,lal_wF_r04,lal_wF_r05,lal_wT_r01,lal_wT_r02,lal_wT_r03,lal_wT_r04,lal_wT_r05,qbc_wF_r01,qbc_wF_r02,qbc_wF_r03,qbc_wF_r04,qbc_wF_r05,qbc_wT_r01,qbc_wT_r02,qbc_wT_r03,qbc_wT_r04,qbc_wT_r05,quire_wF_r01,quire_wF_r02,quire_wF_r03,quire_wF_r04,quire_wF_r05,quire_wT_r01,quire_wT_r02,quire_wT_r03,quire_wT_r04,quire_wT_r05,random_wF_r01,random_wF_r02,random_wF_r03,random_wF_r04,random_wF_r05,random_wT_r01,random_wT_r02,random_wT_r03,random_wT_r04,random_wT_r05,score,spal_wF_r01,spal_wF_r02,spal_wF_r03,spal_wF_r04,spal_wF_r05,spal_wT_r01,spal_wT_r02,spal_wT_r03,spal_wT_r04,spal_wT_r05,unc_wF_r01,unc_wF_r02,unc_wF_r03,unc_wF_r04,unc_wF_r05,unc_wT_r01,unc_wT_r02,unc_wT_r03,unc_wT_r04,unc_wT_r05
bene2_roc_auc_g1,29.0000,45.000,58.0000,46.000,53.50,106.0000,51.000,72.000,30.000,53.50,26.0000,28.000,41.000,61.000,39.5000,94.000,20.000,11.0000,43.000,39.5000,15.000,25.000,67.0000,82.0000,56.500,100.000,55.0000,63.000,76.0000,56.500,23.000,38.000,22.000,78.0000,74.500,109.000,96.00,44.000,81.000,74.500,33.0000,50.0000,77.000,84.000,69.500,102.0000,101.000,87.0000,85.0000,69.500,36.000,52.0000,86.000,88.000,59.5000,108.000,104.0000,99.0000,93.000,59.5000,37.000,21.0000,68.0000,49.000,65.500,111.000,62.000,73.000,34.000,65.500,8.0000,5.0000,3.000,19.0000,9.500,95.000,2.000,1.0000,4.0000,9.500,32.000,79.0000,71.0000,24.0000,47.5000,105.0000,92.0000,80.000,7.000,47.5000,6.00,13.0000,12.000,27.000,35.000,17.5000,107.000,42.0000,14.0000,16.0000,17.5000,31.0000,64.000,83.0000,91.0000,89.500,110.000,103.0000,98.000,97.000,89.500
bene2_roc_auc_g2,26.0000,19.000,49.0000,60.000,43.50,96.0000,41.000,58.000,47.000,43.50,34.0000,45.000,51.000,61.000,64.5000,81.000,66.000,23.0000,54.000,64.5000,25.000,42.000,48.0000,67.0000,39.500,100.000,71.0000,62.000,55.0000,39.500,30.000,37.000,24.000,68.0000,69.500,97.000,46.00,16.000,75.000,69.500,32.0000,76.0000,82.000,80.000,86.500,109.0000,104.000,99.0000,91.0000,86.500,56.000,57.0000,79.000,88.000,89.5000,108.000,94.0000,98.0000,102.000,89.5000,33.000,29.0000,53.0000,38.000,35.500,110.000,27.000,59.000,28.000,35.500,12.0000,4.0000,9.000,13.0000,6.500,78.000,1.000,2.0000,3.0000,6.500,52.000,50.0000,77.0000,31.0000,73.5000,107.0000,72.0000,84.000,14.000,73.5000,21.00,22.0000,20.000,15.000,18.000,10.5000,105.000,17.0000,5.0000,8.0000,10.5000,63.0000,83.000,85.0000,95.0000,92.500,111.000,106.0000,103.000,101.000,92.500
bene2_roc_auc_g3,26.0000,30.000,45.0000,53.000,50.50,104.0000,74.000,56.000,57.000,50.50,61.0000,54.000,64.000,63.000,58.5000,95.000,55.000,44.0000,46.000,58.5000,49.000,52.000,35.0000,41.0000,27.500,105.000,82.0000,60.000,33.0000,27.500,34.000,37.000,31.000,76.0000,70.500,97.000,48.00,29.000,88.000,70.500,23.0000,84.0000,89.000,81.000,79.500,110.0000,93.000,96.0000,94.0000,79.500,65.000,62.0000,75.000,83.000,86.5000,109.000,78.0000,99.0000,101.000,86.5000,20.000,17.0000,38.0000,32.000,39.500,107.000,25.000,22.000,18.000,39.500,12.0000,4.0000,6.000,16.0000,9.500,100.000,2.000,1.0000,5.0000,9.500,43.000,42.0000,72.0000,36.0000,68.5000,108.0000,90.0000,66.000,24.000,68.5000,47.00,21.0000,11.000,13.000,19.000,14.5000,73.000,8.0000,3.0000,7.0000,14.5000,67.0000,85.000,77.0000,103.0000,91.500,111.000,106.0000,98.000,102.000,91.500
bene2_roc_auc_g4,37.0000,29.000,61.0000,54.000,55.50,108.0000,92.000,52.000,59.000,55.50,57.0000,46.000,58.000,48.000,32.5000,88.000,23.000,31.0000,15.000,32.5000,64.000,71.000,60.0000,50.0000,34.500,106.000,81.0000,49.000,30.0000,34.500,38.000,42.000,41.000,72.0000,68.500,94.000,43.00,44.000,78.000,68.500,25.0000,91.0000,96.000,95.000,89.500,111.0000,84.000,102.0000,99.0000,89.500,65.000,70.0000,75.000,74.000,82.5000,104.000,79.0000,76.0000,93.000,82.5000,22.000,28.0000,63.0000,40.000,26.500,105.000,21.000,20.000,8.000,26.500,17.0000,13.0000,11.000,18.0000,4.500,107.000,19.000,3.0000,9.0000,4.500,39.000,62.0000,53.0000,45.0000,66.5000,109.0000,73.0000,24.000,47.000,66.5000,51.00,36.0000,14.000,10.000,12.000,6.5000,97.000,16.0000,2.0000,1.0000,6.5000,77.0000,87.000,80.0000,98.0000,85.500,110.000,103.0000,101.000,100.000,85.500
bene2_roc_auc_g5,23.0000,18.000,47.0000,62.000,51.50,104.0000,59.000,36.000,57.000,51.50,50.0000,30.000,64.000,44.000,41.5000,97.000,37.000,49.0000,22.000,41.5000,68.000,71.000,65.0000,43.0000,38.500,106.000,94.0000,66.000,32.0000,38.500,40.000,46.000,61.000,79.0000,75.500,96.000,34.00,54.000,83.000,75.500,48.0000,86.0000,93.000,90.000,91.500,111.0000,85.000,95.0000,100.0000,91.500,67.000,58.0000,77.000,60.000,81.5000,108.000,80.0000,72.0000,78.000,81.5000,14.000,17.0000,56.0000,24.000,25.500,107.000,19.000,16.000,7.000,25.500,21.0000,20.0000,11.000,15.0000,8.500,109.000,31.000,5.0000,6.0000,8.500,28.000,53.0000,33.0000,29.0000,69.5000,103.0000,74.0000,13.000,63.000,69.5000,45.00,35.0000,27.000,10.000,12.000,3.5000,102.000,55.0000,2.0000,1.0000,3.5000,73.0000,84.000,89.0000,99.0000,87.500,110.000,105.0000,101.000,98.000,87.500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pakdd_external_cpl_g8,28.0000,57.000,42.0000,73.000,58.50,105.0000,89.000,75.000,60.000,58.50,40.0000,39.000,13.000,36.000,11.5000,102.000,85.000,78.0000,63.000,11.5000,23.000,45.000,15.0000,72.0000,33.500,103.000,74.0000,69.000,71.0000,33.500,27.000,2.000,14.000,32.0000,25.500,110.000,100.00,80.000,70.000,25.500,43.0000,49.0000,21.000,46.000,51.500,107.0000,111.000,87.0000,18.0000,51.500,37.000,22.0000,3.000,53.000,98.5000,106.000,79.0000,66.0000,48.000,98.5000,44.000,81.0000,1.0000,54.000,83.500,108.000,94.000,77.000,20.000,83.500,16.0000,17.0000,24.000,47.0000,55.500,92.000,82.000,61.0000,41.0000,55.500,86.000,50.0000,19.0000,104.0000,64.5000,101.0000,97.0000,95.000,68.000,64.5000,4.00,7.0000,10.000,5.000,6.000,29.5000,93.000,67.0000,62.0000,8.0000,29.5000,35.0000,31.000,9.0000,38.0000,90.500,109.000,96.0000,88.000,76.000,90.500
pakdd_external_cpl_g9,30.0000,53.000,46.0000,50.000,65.50,109.0000,93.000,73.000,59.000,65.50,37.0000,33.000,5.000,44.000,11.5000,94.000,83.000,77.0000,27.000,11.5000,22.000,38.000,43.0000,55.0000,62.500,97.000,75.0000,67.000,71.0000,62.500,56.000,1.000,19.000,26.0000,104.500,108.000,89.00,78.000,64.000,104.500,17.0000,68.0000,100.000,28.000,39.500,106.0000,98.000,87.0000,49.0000,39.500,42.000,9.0000,18.000,34.000,14.5000,103.000,85.0000,72.0000,99.000,14.5000,36.000,7.0000,76.0000,32.000,69.500,107.000,91.000,82.000,4.000,69.500,13.0000,20.0000,23.000,41.0000,51.500,111.000,79.000,74.0000,102.0000,51.500,45.000,31.0000,6.0000,54.0000,80.5000,101.0000,92.0000,61.000,58.000,80.5000,8.00,16.0000,10.000,95.000,2.000,24.5000,90.000,84.0000,57.0000,96.0000,24.5000,3.0000,29.000,21.0000,35.0000,47.500,110.000,88.0000,86.000,60.000,47.500
pakdd_external_cpl_g10,45.0000,59.000,39.0000,55.000,72.50,108.0000,93.000,87.000,44.000,72.50,38.0000,37.000,105.000,56.000,24.5000,96.000,85.000,86.0000,58.000,24.5000,43.000,47.000,42.0000,63.0000,77.500,101.000,81.0000,110.000,80.0000,77.500,4.000,1.000,11.000,3.0000,15.500,104.000,95.00,76.000,67.000,15.500,7.0000,46.0000,26.000,53.000,34.500,106.0000,94.000,91.0000,65.0000,34.500,33.000,21.0000,31.000,54.000,88.5000,107.000,82.0000,70.0000,36.000,88.5000,100.000,28.0000,32.0000,48.000,5.500,109.000,92.000,71.000,13.000,5.500,23.0000,30.0000,49.000,22.0000,40.500,102.000,75.000,74.0000,61.0000,40.500,69.000,50.0000,8.0000,57.0000,83.5000,103.0000,98.0000,66.000,60.000,83.5000,27.00,12.0000,19.000,9.000,10.000,51.5000,97.000,64.0000,68.0000,99.0000,51.5000,2.0000,14.000,20.0000,29.0000,17.500,111.000,90.0000,79.000,62.000,17.500
mean_ranks,53.1995,53.427,52.6785,47.886,49.57,71.0165,61.924,52.087,45.759,49.57,54.6495,57.887,57.445,57.488,54.1265,74.103,65.716,56.6475,53.671,54.1265,54.388,52.877,49.6385,50.2765,49.647,71.342,57.8595,47.611,46.1595,49.647,55.128,52.020,52.735,55.6995,52.606,74.424,63.41,53.170,55.499,52.606,55.1765,53.5765,54.567,57.293,51.843,79.1865,64.736,56.7415,49.5625,51.843,55.585,55.6305,57.766,60.178,56.1745,77.525,63.9965,56.3885,59.998,56.1745,53.226,49.1755,53.5675,47.429,43.982,77.456,57.861,47.995,39.785,43.982,51.4045,50.7335,54.462,54.4745,52.966,73.013,62.763,54.8255,50.1385,52.966,52.855,57.5805,49.7175,53.1325,52.8845,69.3365,57.5735,47.990,43.409,52.8845,55.66,52.1085,55.111,55.749,54.624,55.0165,72.626,69.8145,60.7675,53.7105,55.0165,51.5125,52.643,52.8315,57.2345,53.723,71.496,63.8125,57.221,58.367,53.723


In [None]:
performance_sub[best_performers['overall']]

In [None]:
performance_sub_ranks.filter(regex='bmdr_wT')

## Run Friedman and Post-Hoc tests for BEST PERFORMERS ONLY

In [20]:
#############################################
#RUN TESTS

friedman_results2 = {}
#nemenyi_results = {}
#conover_results = {}
#holm_results = {}
finner_results = {}
#quade_scores = {}
rankings_dict = {}
pval_dict = {}
test_results = {}

friedman_statistics = pd.DataFrame(columns=['statistic', 'pvalue'], index=metrics_list)

for w in ['overall','wT', 'wF']: #,'wT', 'wF'
  print('\n\n\n------------------------\nSETUP:', w)
  print('best performing models:', best_performers[w])
  
  #nemenyi_results[w] = {}
  #conover_results[w] = {}
  #holm_results[w] = {}
  finner_results[w] = {}

  rankings = pd.DataFrame(columns = best_performers[w], index=metrics_list)
  pvals = pd.DataFrame(columns = best_performers[w], index=metrics_list)

  for s in metrics_list:
    metric_values = []

    for d in dataset_list:
      row_values = []

      for m in best_performers[w]:
        row_values.append(performance_sub[f'{m}'].loc[performance_sub.index.str.contains(f'{d}_{s}')].mean()) #mean over 10 generations
        columns = best_performers[w]
      
      metric_values.append(row_values)

    #invert sign of measurements for metrics where more is better
    #this is done because the friedman test uses the rankdata function, which assigns ranks from lowest to highest (low score = low rank)
    if s not in ['brier','total_internal_cost','total_internal_cpl','external_cost','external_cpl', 'total_cost', 'fnr','fpr','fn','fp','bad_ratio_accepts', 'bad_ratio_AL_selects']:
      metric_values = np.array(metric_values) * -1
    
    print('\n\n------------------------\nMETRIC', s)
    print(pd.DataFrame(metric_values, columns = columns))

    #statistic, pvalue,statistic_uncorr, p_uncorr,ranking,ranking_avg, rank_cmp = friedman_test(*np.transpose(metric_values)) #this test applies the Iman-Davenport correction to
    statistic, pvalue, ranking, ranking_avg, rank_cmp = friedmanchisquare2(*np.transpose(metric_values))

    print(columns)
    print("average_ranks", ranking_avg)
    print("pivot", rank_cmp)

    temp = {'statistic':statistic, 'pvalue':pvalue}
    friedman_statistics.loc[f'{s}'] = temp
    rankings.loc[f'{s}'] = ranking_avg
    

    # POST HOC TEST HOLM
    pivot_scores = {key: rank_cmp[i] for i, key in enumerate(columns)} 
    print(pivot_scores)
    #print(pd.DataFrame(np.transpose(pivot_scores), columns = columns))
    
    #get the name of the best-ranking model to use as control
    ## REDUNDAND FEATURE; post-hoc tests automatically compare against lowest-scoring models when no control classifier is given...
    min_idx = pd.DataFrame(np.transpose(list(pivot_scores.values())), index = pivot_scores.keys()).iloc[:,0].idxmin() 
    
    '''comparisons, z_values, p_values, adj_p_values = holm_test(pivot_scores, control=columns[min_rank])
    adj_p_values = np.asarray(adj_p_values)
    holm_scores = pd.DataFrame({"p": adj_p_values, "sig": adj_p_values < 0.05}, index=comparisons)
    #print(holm_scores)
    holm_results[w][s] = copy.deepcopy(holm_scores)'''

    #comparisons, z_values, p_values, adj_p_values, keys = finner_test(pivot_scores, control=columns[min_rank])
    comparisons, z_values, p_values, adj_p_values = finner_test(pivot_scores, control = min_idx)
    adj_p_values = np.asarray(adj_p_values)
    finner_scores = pd.DataFrame({"p": adj_p_values, "sig": adj_p_values < 0.05}, index=comparisons, )
    print(p_values)
    print(adj_p_values)
    print(finner_scores)
    finner_results[w][s] = copy.deepcopy(finner_scores)

    for m in best_performers[w]:
      try:
        pvals.at[s,m] = np.round(finner_scores.loc[finner_scores.index.str.contains(f'vs {m}')]['p'].values, 4)
      except KeyError:
        pvals.at[s,m] = np.nan


    #nemenyi_results[w][s] = generate_scores(sp.posthoc_nemenyi_friedman, {}, metric_values, columns)
    #conover_results[w][s] = generate_scores(sp.posthoc_conover_friedman, {'p_adjust':'holm-sidak'}, metric_values, columns)
    #quade_scores[s] = generate_scores(sp.posthoc_quade, {}, metric_values, columns)

  print(f'Friedman results\n',friedman_statistics,'\n\n')
  friedman_results2[w] = copy.deepcopy(friedman_statistics)
  pvals = pvals.T.add_suffix('_p-values')
  pvals.loc['Friedman'] = np.round(friedman_statistics['pvalue'].to_numpy().flatten().astype(float),4)

  rankings = rankings.T.add_suffix('_avg-rank')
  rankings.loc['Friedman'] = np.round(friedman_statistics['statistic'].to_numpy().flatten().astype(float),4)
  rankings['AvgRank'] = np.round(rankings.mean(axis=1),2)
  rankings['AvgRank'].loc['Friedman'] = np.nan
  rankings['HighScore'] = rankings['AvgRank'].rank()
  rankings_dict[w] = copy.deepcopy(rankings)

  pval_dict[w] = copy.deepcopy(pvals)
  combined_results = pd.concat((rankings, pvals), axis=1)

  test_results[w] = combined_results.reindex(sorted(combined_results.columns), axis=1)







------------------------
SETUP: overall
best performing models: ['score', 'random_wT_r04', 'unc_wF_r01', 'qbc_wT_r04', 'dw_wF_r03', 'cors_wF_r01', 'density_wT_r04', 'eer_wF_r02', 'lal_wF_r02', 'quire_wT_r04', 'bmdr_wT_r04', 'spal_wF_r04']


------------------------
METRIC roc_auc
      score  random_wT_r04  unc_wF_r01  qbc_wT_r04  dw_wF_r03  cors_wF_r01  \
0 -0.918350      -0.915087   -0.916759   -0.920703  -0.919958    -0.916236   
1 -0.748293      -0.753867   -0.747846   -0.745927  -0.743562    -0.750068   
2 -0.778209      -0.777812   -0.776355   -0.779185  -0.777597    -0.777840   
3 -0.773833      -0.776912   -0.777436   -0.777290  -0.779121    -0.774126   
4 -0.839645      -0.843481   -0.841919   -0.843312  -0.843139    -0.842159   
5 -0.780813      -0.783517   -0.780313   -0.784620  -0.782407    -0.778114   
6 -0.620482      -0.625864   -0.625332   -0.627058  -0.622621    -0.620470   
7 -0.594101      -0.590028   -0.591857   -0.594965  -0.596153    -0.594124   
8 -0.711233   

In [21]:
test_results['overall']

Unnamed: 0,AvgRank,HighScore,average_precision_avg-rank,average_precision_p-values,bad_ratio_AL_selects_avg-rank,bad_ratio_AL_selects_p-values,bad_ratio_accepts_avg-rank,bad_ratio_accepts_p-values,bad_ratio_rejects_avg-rank,bad_ratio_rejects_p-values,bad_ratio_score_accepts_avg-rank,bad_ratio_score_accepts_p-values,brier_avg-rank,brier_p-values,external_cpl_avg-rank,external_cpl_p-values,f1_avg-rank,f1_p-values,fn_avg-rank,fn_p-values,fnr_avg-rank,fnr_p-values,fp_avg-rank,fp_p-values,fpr_avg-rank,fpr_p-values,h-measure_avg-rank,h-measure_p-values,pcc_avg-rank,pcc_p-values,precision_avg-rank,precision_p-values,recall_avg-rank,recall_p-values,roc_auc_avg-rank,roc_auc_p-values,total_internal_cpl_avg-rank,total_internal_cpl_p-values
score,6.11,2.0,9.7,[0.0],1.4,[],1.2,[],1.6,[],6.3,[0.0027],10.5,[0.0],7.2,[0.6626],7.7,[0.1541],5.8,[0.4128],5.8,[0.4128],7.6,[0.0906],7.6,[0.0906],7.3,[0.0423],7.8,[0.0191],7.7,[0.2149],5.8,[0.4128],7.6,[0.0677],1.4,[]
random_wT_r04,6.15,3.0,5.2,[0.0662],4.8,[0.0384],8.0,[0.0001],7.7,[0.0004],8.5,[0.0],4.7,[0.1928],5.2,[0.7098],5.2,[0.5694],8.4,[0.0345],8.4,[0.0345],4.7,[0.6989],4.7,[0.6989],4.7,[0.4201],4.2,[0.6116],4.7,[0.9505],8.4,[0.0345],5.4,[0.2834],7.8,[0.0002]
unc_wF_r01,6.27,4.0,8.6,[0.0002],9.9,[0.0],3.5,[0.1678],4.1,[0.1323],1.3,[],7.9,[0.0037],6.4,[0.6626],6.0,[0.3485],6.7,[0.2201],6.7,[0.2201],6.1,[0.2551],6.1,[0.2551],8.7,[0.0111],7.2,[0.0423],5.7,[0.6093],6.7,[0.2201],7.5,[0.0677],3.7,[0.1678]
qbc_wT_r04,5.73,1.0,2.1,[],6.3,[0.0043],9.9,[0.0],9.0,[0.0],8.1,[0.0001],2.6,[],4.6,[],4.2,[],7.0,[0.2108],7.0,[0.2108],5.2,[0.5256],5.2,[0.5256],3.4,[],4.2,[0.6116],4.6,[],7.0,[0.2108],3.5,[],9.3,[0.0]
dw_wF_r03,7.03,11.0,6.0,[0.0244],7.3,[0.0006],7.9,[0.0001],7.7,[0.0004],6.7,[0.0013],4.8,[0.188],6.9,[0.6626],8.7,[0.0563],6.7,[0.2201],6.7,[0.2201],6.8,[0.146],6.8,[0.146],6.9,[0.0542],7.0,[0.0423],7.9,[0.2149],6.7,[0.2201],7.2,[0.0677],7.9,[0.0002]
cors_wF_r01,6.48,7.0,9.9,[0.0],8.9,[0.0],2.9,[0.2917],2.9,[0.4201],2.0,[0.6642],10.0,[0.0],7.3,[0.6626],7.1,[0.1634],6.8,[0.2201],6.8,[0.2201],6.4,[0.2062],6.4,[0.2062],8.3,[0.013],6.9,[0.0423],6.5,[0.3485],6.8,[0.2201],7.9,[0.0677],2.8,[0.3853]
density_wT_r04,6.29,5.0,4.8,[0.094],4.5,[0.0545],7.1,[0.0004],7.0,[0.0011],7.6,[0.0002],5.5,[0.0978],6.7,[0.6626],5.9,[0.3485],8.6,[0.0345],8.6,[0.0345],4.5,[0.7565],4.5,[0.7565],5.4,[0.2336],5.2,[0.2834],5.1,[0.8221],8.6,[0.0345],6.6,[0.0844],7.0,[0.0007]
eer_wF_r02,6.87,10.0,6.3,[0.0201],9.9,[0.0],7.0,[0.0004],7.2,[0.0009],4.5,[0.0518],5.9,[0.0733],7.3,[0.6626],7.2,[0.1634],5.5,[0.4862],5.5,[0.4862],7.4,[0.0906],7.4,[0.0906],7.2,[0.0423],7.2,[0.0423],7.7,[0.2149],5.5,[0.4862],7.8,[0.0677],7.1,[0.0006]
lal_wF_r02,6.85,9.0,8.3,[0.0003],8.2,[0.0001],5.7,[0.0064],5.9,[0.0094],5.1,[0.0225],7.6,[0.0053],6.5,[0.6626],7.7,[0.1541],4.5,[0.8524],4.5,[0.8524],8.5,[0.0286],8.5,[0.0286],7.6,[0.0333],9.1,[0.0035],7.9,[0.2149],4.5,[0.8524],7.2,[0.0677],6.0,[0.0053]
quire_wT_r04,6.54,8.0,5.1,[0.0689],5.6,[0.0126],8.3,[0.0],8.1,[0.0002],10.2,[0.0],5.9,[0.0733],5.8,[0.6626],6.1,[0.3485],4.8,[0.7436],4.8,[0.7436],7.6,[0.0906],7.6,[0.0906],5.5,[0.2303],6.9,[0.0423],6.9,[0.2637],4.8,[0.7436],5.4,[0.2834],8.3,[0.0001]


In [22]:
for m in model_list:
  print(performance_sub_ranks.loc['cost_ranks'].filter(regex=f'{m}').idxmin())

score
random_wT_r04
unc_wF_r01
qbc_wT_r04
dw_wF_r03
cors_wF_r01
density_wT_r04
eer_wF_r02
lal_wF_r02
quire_wT_r04
bmdr_wT_r04
spal_wF_r04


In [None]:
friedman_results2['overall'] #uncorrected pvalues (without thomas)

Unnamed: 0,statistic,pvalue
roc_auc,1.419365,0.176099
brier,5.765948,0.0
h-measure,1.973738,0.038962
pcc,3.346508,0.000595
f1,1.31746,0.226099
average_precision,5.820359,0.0
precision,1.627581,0.102481
recall,2.226448,0.018558
fpr,2.38535,0.011521
fnr,2.226448,0.018558


In [None]:
friedman = np.round(friedman_results2['overall'].to_numpy().flatten().astype(float),4)
friedman

In [None]:
friedman_results2['overall'] #iman davenport pvalues (with thomas)

In [None]:
finner_results['overall']

In [None]:
holm_results['overall']

In [None]:
nemenyi_results['overall']['brier']

In [None]:
'''for k1 in nemenyi_results.keys():
  for k2 in nemenyi_results[k1].keys():
    pairwise_plotter(nemenyi_results[k1][k2])'''

#pairwise_plotter(nemenyi_results['overall']['total_internal_cpl'])
pairwise_plotter(conover_results['overall']['roc_auc'])

In [None]:
sp.sign_plot(nemenyi_results['overall']['brier'])

## Run Friedman Tests for 2x60 models (average SCORES)

In [None]:
"""
  Runs test comparing ALL models for each weight setup
"""


#model_list = ['score', 'random', 'unc', 'qbc', 'dw', 'cors', 'density', 'eer', 'lal', 'quire', 'bmdr', 'spal']
#metrics_list = ['roc_auc', 'brier', 'h-measure', 'pcc', 'balanced_accuracy', 'f1','total_internal_cost','total_internal_cpl','external_cost','external_cpl']
metrics_list = ['roc_auc', 'brier', 'h-measure', 'pcc','balanced_accuracy', 'f1', 'total_internal_cpl','external_cpl']  #'brier', 'h-measure', 'pcc',
dataset_list = ["bene2", "bene1_nobins","gmsc", "uk", "lendingclub", "hmeq", "pakdd", "australian", "german"] #,"thomas", 
#dataset_list = ["bene2", "bene1_nobins","gmsc", "uk", "hmeq", "australian", "german"]
#dataset_list = ["lendingclub", "thomas", "pakdd"]


friedman_results = {'wT':{}, 'wF':{}}
nemenyi_scores = {}
quade_scores = {}

for w in ['wT', 'wF']:
  friedman_results[w] = {}
  friedman_statistics = pd.DataFrame(columns=['statistic', 'pvalue'], index=metrics_list)

  for s in metrics_list:
    metric_values = []

    for d in dataset_list:
      row_values = []
      columns = []

      for m in model_list:
        row_values.extend(performance.filter(regex=f'{m}_{w}').loc[performance.index.str.contains(f'{d}_{s}')].mean().tolist())
        columns.extend(list(performance.filter(regex=f'{m}_{w}').columns))
      
      metric_values.append(row_values)

    #invert sign of measurements for metrics where more is better – needed because friedmanchisquare() uses scipy.stats.rankdata(), which ranks in ascending fashion (low value -> low rank)
    if s not in ['brier','total_internal_cost','total_internal_cpl','external_cost','external_cpl']:
      metric_values = np.array(metric_values) * -1

    print('\n\n\n------------------------\nMETRIC', s)

    #statistic, pvalue,statistic_uncorr, p_uncorr,ranking,ranking_avg, rank_cmp = friedman_test(*np.transpose(metric_values))
    statistic, pvalue, ranking, ranking_avg, rank_cmp = friedmanchisquare2(*np.transpose(metric_values))

    print('\n-------------------------------\nscipy Friedman MODDED\n-------------------------------\n')
    for pr in [statistic, pvalue, ranking_avg, rank_cmp]: #, ranking,ranking_avg, rank_cmp]:
      print(pr)

    temp = {'statistic':statistic, 'pvalue':pvalue}
    friedman_statistics.loc[f'{s}'] = temp



    # POST HOC TEST HOLM
    ranks = {key: rank_cmp[i] for i, key in enumerate(columns)} 
    min_rank = pd.DataFrame(np.transpose(list(ranks.values())), index = ranks.keys()).iloc[:,0].argmin()
    
    comparisons, z_values, p_values, adj_p_values = holm_test(ranks, control=columns[min_rank])
    adj_p_values = np.asarray(adj_p_values)
    holm_scores = pd.DataFrame({"p": adj_p_values, "sig": adj_p_values < 0.05}, index=comparisons)
    print(holm_scores)    


    comparisons, z_values, p_values, adj_p_values = finner_test(pivot_scores, control=columns[min_rank])
    adj_p_values = np.asarray(adj_p_values)
    finner_scores = pd.DataFrame({"p": adj_p_values, "sig": adj_p_values < 0.05}, index=comparisons)
    print(finner_scores)
    #finner_results[w][s] = copy.deepcopy(finner_scores)


    #nemenyi_scores[s] = generate_scores(sp.posthoc_nemenyi_friedman, {}, metric_values, columns)
    #quade_scores[s] = generate_scores(sp.posthoc_quade, {}, metric_values, columns)


  print(np.shape(metric_values))
  print(f'Friedman results for {w}\n',friedman_statistics,'\n\n')
  friedman_results[w][r] = friedman_statistics

'''for key in nemenyi_scores.keys():
  pairwise_plotter(nemenyi_scores[key])'''





------------------------
METRIC roc_auc

-------------------------------
scipy Friedman MODDED
-------------------------------

275.40295307818246
1.1040457182485505e-29
[24.77777777777778, 24.77777777777778, 24.77777777777778, 24.77777777777778, 24.77777777777778, 51.111111111111114, 34.666666666666664, 25.88888888888889, 17.666666666666668, 21.444444444444443, 54.22222222222222, 38.0, 31.88888888888889, 28.77777777777778, 22.88888888888889, 55.111111111111114, 32.111111111111114, 21.555555555555557, 12.88888888888889, 16.0, 55.44444444444444, 42.0, 28.555555555555557, 25.555555555555557, 22.666666666666668, 47.888888888888886, 38.111111111111114, 24.555555555555557, 16.88888888888889, 17.444444444444443, 53.77777777777778, 35.22222222222222, 24.555555555555557, 16.77777777777778, 15.333333333333334, 54.22222222222222, 39.888888888888886, 29.666666666666668, 22.22222222222222, 21.77777777777778, 54.0, 38.111111111111114, 28.666666666666668, 23.88888888888889, 18.444444444444443, 47

ValueError: ignored

In [None]:
columns = []
columns.append(performance.filter(regex='bmdr_wT').columns)
type(columns)

list

In [None]:
 performance.filter(regex=f'{w}_{r}').columns

Index(['bmdr_wF_r05', 'cors_wF_r05', 'density_wF_r05', 'dw_wF_r05',
       'eer_wF_r05', 'lal_wF_r05', 'qbc_wF_r05', 'quire_wF_r05',
       'random_wF_r05', 'score_wF_r05', 'spal_wF_r05', 'unc_wF_r05'],
      dtype='object')

In [None]:
pd.DataFrame(np.transpose(list(ranks.values())), index = ranks.keys())

Unnamed: 0,0
score_wF_r05,2.668498
random_wF_r05,3.780372
unc_wF_r05,3.520934
qbc_wF_r05,3.261497
dw_wF_r05,3.854496
cors_wF_r05,3.632122
density_wF_r05,3.039122
eer_wF_r05,3.15031
lal_wF_r05,4.521621
quire_wF_r05,2.446123


In [None]:
ranks

{'bmdr_wF_r05': 2.816747403312185,
 'cors_wF_r05': 3.632121651639396,
 'density_wF_r05': 3.0391221983105146,
 'dw_wF_r05': 3.8544964466377265,
 'eer_wF_r05': 3.15030959580968,
 'lal_wF_r05': 4.521620831632717,
 'qbc_wF_r05': 3.261496993308845,
 'quire_wF_r05': 2.446122744981634,
 'random_wF_r05': 3.780371514971616,
 'score_wF_r05': 2.6684975399799646,
 'spal_wF_r05': 3.780371514971616,
 'unc_wF_r05': 3.5209342541402306}

In [None]:
friedman_results

{'wF': {'r01':                     statistic    pvalue
  roc_auc             11.725275  0.384644
  brier               21.263736  0.030739
  h-measure            5.241758  0.918895
  pcc                  7.263736  0.777341
  balanced_accuracy   11.021978  0.441424
  f1                   9.461538   0.57937
  total_internal_cpl  40.538462  0.000029
  external_cpl        11.021978  0.441424,
  'r02':                     statistic    pvalue
  roc_auc              7.967033  0.716254
  brier               21.857143  0.025504
  h-measure            6.956044  0.802632
  pcc                  8.868132  0.634063
  balanced_accuracy    9.725275  0.555255
  f1                   5.285714  0.916544
  total_internal_cpl  43.175824   0.00001
  external_cpl         9.725275  0.555255,
  'r03':                     statistic    pvalue
  roc_auc              6.802198  0.814868
  brier               19.769231  0.048608
  h-measure            4.296703   0.96042
  pcc                  9.527473  0.573323
  bal

## run separate tests for each combination of weights and ratio

In [None]:
#Friedman Test with post-hoc test
#model_list = ['score', 'random', 'unc', 'qbc', 'dw', 'cors', 'density', 'eer', 'lal', 'quire', 'bmdr', 'spal']
#metrics_list = ['roc_auc', 'brier', 'h-measure', 'pcc', 'balanced_accuracy', 'f1','total_internal_cost','total_internal_cpl','external_cost','external_cpl']
metrics_list = ['roc_auc', 'brier', 'h-measure', 'pcc','balanced_accuracy', 'f1', 'total_internal_cpl','external_cpl']  #'brier', 'h-measure', 'pcc',
dataset_list = ["bene2", "bene1_nobins","gmsc", "uk", "lendingclub", "hmeq", "pakdd"] #"australian", "german", "thomas", 
#dataset_list = ["bene2", "bene1_nobins","gmsc", "uk", "hmeq", "australian", "german"]
#dataset_list = ["lendingclub", "thomas", "pakdd"]


friedman_results = {'wT':{}, 'wF':{}}
nemenyi_scores = {}
quade_scores = {}

average_ratios = False #this feature does not work correctly, if this is enabled it is unnecessary to loop over ratios. 

for w in ['wT', 'wF']:
  friedman_results[w] = {}
  
  for r in ["r01","r02","r03","r04","r05"]:
    friedman_statistics = pd.DataFrame(columns=['statistic', 'pvalue'], index=metrics_list)

    for s in metrics_list:
      metric_values = []

      for d in dataset_list:
        row_values = []
        if average_ratios:
          columns = model_list
        else:
          columns = []

        for m in model_list:
          if average_ratios:
            row_values.append(performance.filter(regex=f'{m}_{w}').loc[performance.index.str.contains(f'{d}_{s}')].mean().mean())
          else:
            row_values.extend(performance.filter(regex=f'{m}_{w}_{r}').loc[performance.index.str.contains(f'{d}_{s}')].mean().tolist())
            columns.append(f'{m}_{w}_{r}')
        
        metric_values.append(row_values)

      #invert sign of measurements for metrics where more is better – needed because friedmanchisquare() uses scipy.stats.rankdata(), which ranks in ascending fashion (low value -> low rank)
      if s not in ['brier','total_internal_cost','total_internal_cpl','external_cost','external_cpl']:
        metric_values = np.array(metric_values) * -1

      print('\n\n\n------------------------\nMETRIC', s)

      #statistic, pvalue,statistic_uncorr, p_uncorr,ranking,ranking_avg, rank_cmp = friedman_test(*np.transpose(metric_values))
      statistic, pvalue, ranking, ranking_avg, rank_cmp = friedmanchisquare2(*np.transpose(metric_values))

      print('\n-------------------------------\nscipy Friedman MODDED\n-------------------------------\n')
      for pr in [statistic, pvalue, ranking_avg, rank_cmp]: #, ranking,ranking_avg, rank_cmp]:
        print(pr)

      print('\n-------------------------------\nCustom STAC\n-------------------------------\n')
      print(friedman_test(*np.transpose(metric_values)))

      temp = {'statistic':statistic, 'pvalue':pvalue}
      friedman_statistics.loc[f'{s}'] = temp



      # POST HOC TEST HOLM
      ranks = {key: rank_cmp[i] for i, key in enumerate(columns)} 
      print('ranking_avg', ranking_avg)
      print(pd.DataFrame(np.transpose(list(ranks.values())), index = ranks.keys()).iloc[:,0])
      min_rank = pd.DataFrame(np.transpose(list(ranks.values())), index = ranks.keys()).iloc[:,0].argmin()
      
      comparisons, z_values, p_values, adj_p_values = holm_test(ranks, control=columns[min_rank])
      adj_p_values = np.asarray(adj_p_values)
      holm_scores = pd.DataFrame({"p": adj_p_values, "sig": adj_p_values < 0.05}, index=comparisons)
      print(holm_scores)    

      #nemenyi_scores[s] = generate_scores(sp.posthoc_nemenyi_friedman, {}, metric_values, columns)
      #quade_scores[s] = generate_scores(sp.posthoc_quade, {}, metric_values, columns)


    print(np.shape(metric_values))
    print(f'Friedman results for {w}\n',friedman_statistics,'\n\n')
    friedman_results[w][r] = friedman_statistics

'''for key in nemenyi_scores.keys():
  pairwise_plotter(nemenyi_scores[key])'''





------------------------
METRIC roc_auc

-------------------------------
scipy Friedman MODDED
-------------------------------

25.571428571428612
0.0075125990565410835
[1.4285714285714286, 5.857142857142857, 8.0, 7.714285714285714, 8.0, 6.428571428571429, 6.142857142857143, 9.285714285714286, 7.428571428571429, 4.428571428571429, 7.714285714285714, 5.571428571428571]
[0.7412493166611012, 3.0391221983105146, 4.1509961733021665, 4.002746309969947, 4.1509961733021665, 3.3356219249749555, 3.1873720616427352, 4.818120558297158, 3.8544964466377265, 2.297872881649414, 4.002746309969947, 2.8908723349782943]

-------------------------------
Custom STAC
-------------------------------

(2.9833333333333334, 0.002844721456970456, 25.571428571428573, 0.0075125990565411945, [[1.0, 6.0, 12.0, 10.0, 3.0, 2.0, 7.0, 11.0, 9.0, 5.0, 8.0, 4.0], [3.0, 8.0, 7.0, 9.0, 11.0, 1.0, 5.0, 2.0, 4.0, 10.0, 6.0, 12.0], [1.0, 7.0, 10.0, 6.0, 11.0, 8.0, 2.0, 9.0, 12.0, 3.0, 5.0, 4.0], [1.0, 2.0, 10.0, 4.0, 9.0, 7.

'for key in nemenyi_scores.keys():\n  pairwise_plotter(nemenyi_scores[key])'