In [1]:
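# How changing the dataset sample size affects the calibration methods.
# Fix the training dataset size and vary the number of calibration-set samples - the best method is the one that reaches the best calibration with the least data.
# NOTE(review): the cells below sweep `data_name` (see `exp_key`), not a sample size - confirm this header still describes the experiment.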

# imports
import sys
import pandas as pd
sys.path.append('../../') # to access the files in higher directories
sys.path.append('../') # to access the files in higher directories
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import core_exp as cx
import core_calib as cal

In [2]:
# Candidate values stored under params["search_space"] (RF hyper opt).
_rf_search_space = dict(
    n_estimators=[100],
    max_depth=[2, 3, 4, 5, 6, 7, 8, 10, 15, 20, 30, 40, 50, 60, 100],
    criterion=["gini", "entropy"],
    max_features=["sqrt", "log2"],
    min_samples_split=[2, 3, 4, 5],
    min_samples_leaf=[1, 2, 3],
    oob_score=[False],
)

# Single configuration dict passed to cx.run_exp and read by the result cells.
# Key order is kept exactly as before in case a consumer iterates over it.
params = dict(
    # exp
    runs=5,
    exp_name="real_OOB_False",
    cv_folds=5,
    plot=True,
    calib_methods=[
        "RF_d", "RF_opt",
        "Platt", "ISO", "Beta", "CRF", "VA",
        "RF_ens_k", "RF_ens_r", "RF_large",
    ],
    metrics=["acc", "brier", "ece", "logloss", "auc"],

    # calib param
    bin_strategy="uniform",
    ece_bins=20,
    boot_size=1000,
    boot_count=5,

    # RF hyper opt
    hyper_opt=True,
    opt_cv=5,
    opt_n_iter=20,
    opt_top_K=5,
    search_space=_rf_search_space,

    n_estimators=100,
    oob=False,
)

# The experiment is swept over this key: one run per entry of exp_values.
exp_key = "data_name"
exp_values = [
    "vertebral", "wilt", "parkinsons", "heart", "wdbc",
    "bank", "ionosphere", "HRCompetencyScores", "spambase", "QSAR",
]
# Dataset names currently left out of the sweep (previously commented out):
#   "diabetes", "breast", "SPF", "hillvalley", "pc4", "scene",
#   "Sonar_Mine_Rock_Data", "Customer_Churn", "jm1", "eeg", "madelon",
#   "phoneme", "nomao"
# For a quick single-dataset run: exp_values = ["wilt"]

In [3]:
# Run the experiment sweep. cx.run_exp returns two values, unpacked here as the
# results dictionary and the data list used by the table-building cell.
# The recorded output shows one "exp_param <name> done" line per exp_values entry.
calib_results_dict, data_list = cx.run_exp(
    exp_key,
    exp_values,
    params,
)

exp_param vertebral done
exp_param wilt done
exp_param parkinsons done
exp_param heart done
exp_param wdbc done
exp_param bank done
exp_param ionosphere done
exp_param HRCompetencyScores done
exp_param spambase done
exp_param QSAR done


In [4]:
# Build the summary tables from the experiment results. `tables` is indexed
# by metric name in the cells that follow (e.g. tables["brier"]).
metric_names = params["metrics"]
method_names = params["calib_methods"]

tables = cal.mean_and_ranking_table(
    calib_results_dict,
    metric_names,
    method_names,
    data_list,
    mean_and_rank=True,
    std=True,
)

In [5]:
import os

# Report the top-ranked calibration method per metric and save each metric
# table as a CSV file under ./results/<exp_name>/.
results_dir = f"./results/{params['exp_name']}"

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before the first write (no-op if it already does).
os.makedirs(results_dir, exist_ok=True)

for metric in params["metrics"]:
    # argmin() gives the *position* of the smallest value in the "Rank" row;
    # it is mapped back to a method name through params["calib_methods"].
    # NOTE(review): assumes the table columns follow the calib_methods order - confirm.
    print(metric, " ", params["calib_methods"][tables[metric].loc["Rank"].argmin()])
    tables[metric].to_csv(f"{results_dir}/{metric}.csv")


acc   RF_opt
brier   RF_ens_r
ece   RF_d
logloss   RF_ens_r
auc   RF_ens_r


In [6]:
# Display the Brier-score table rounded to four decimal places.
tables["brier"].round(decimals=4)


Unnamed: 0_level_0,RF_d,RF_opt,Platt,ISO,Beta,CRF,VA,RF_ens_k,RF_ens_r,RF_large
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
vertebral,0.1072,0.109,0.1129,0.1167,0.1129,0.1105,0.1141,0.1067,0.107,0.1081
wilt,0.0129,0.0127,0.0134,0.0132,0.0131,0.0138,0.0128,0.0129,0.0127,0.0126
parkinsons,0.0711,0.0932,0.0922,0.0991,0.0968,0.0947,0.0946,0.0827,0.0899,0.0898
heart,0.1305,0.1276,0.1276,0.1351,0.1285,0.1295,0.1319,0.1258,0.1262,0.1272
wdbc,0.032,0.031,0.0335,0.0357,0.0347,0.0341,0.0355,0.0307,0.0305,0.0305
bank,0.0058,0.0058,0.0063,0.0066,0.0067,0.0067,0.0068,0.0059,0.0055,0.0056
ionosphere,0.0511,0.0519,0.0534,0.0563,0.055,0.0537,0.0551,0.0525,0.0523,0.0522
HRCompetencyScores,0.0628,0.0616,0.0659,0.0736,0.0719,0.0657,0.0712,0.0626,0.0622,0.0624
spambase,0.0391,0.0396,0.038,0.0388,0.0383,0.0396,0.0387,0.0408,0.039,0.0391
QSAR,0.0955,0.0952,0.0989,0.1021,0.0997,0.0989,0.101,0.0952,0.0948,0.0949


In [7]:
# Display the log-loss table rounded to four decimal places.
tables["logloss"].round(decimals=4)

Unnamed: 0_level_0,RF_d,RF_opt,Platt,ISO,Beta,CRF,VA,RF_ens_k,RF_ens_r,RF_large
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
vertebral,0.3329,0.3393,0.356,1.0261,0.3474,0.3384,0.3548,0.3308,0.3323,0.3351
wilt,0.0643,0.0577,0.0557,0.1299,0.0674,0.0679,0.0494,0.0547,0.0552,0.0548
parkinsons,0.239,0.2994,0.3012,1.2057,0.3257,0.3016,0.3051,0.2724,0.2915,0.2913
heart,0.4088,0.4019,0.4036,1.2978,0.4079,0.4056,0.4145,0.3981,0.3993,0.4017
wdbc,0.139,0.1365,0.1259,0.4258,0.1296,0.1422,0.1328,0.1145,0.1141,0.114
bank,0.0288,0.0272,0.0266,0.1111,0.0379,0.0286,0.0307,0.0292,0.0267,0.0269
ionosphere,0.2241,0.1897,0.1949,0.8489,0.2467,0.2116,0.1958,0.1949,0.194,0.1932
HRCompetencyScores,0.3324,0.3283,0.2375,1.0722,0.3331,0.2796,0.2512,0.2277,0.2257,0.2672
spambase,0.1791,0.1718,0.1407,0.1898,0.1569,0.1655,0.1426,0.1577,0.1574,0.159
QSAR,0.3497,0.339,0.3238,0.6432,0.3413,0.3343,0.3296,0.3162,0.3148,0.3264


In [8]:
# Display the ECE table rounded to four decimal places.
tables["ece"].round(decimals=4)

Unnamed: 0_level_0,RF_d,RF_opt,Platt,ISO,Beta,CRF,VA,RF_ens_k,RF_ens_r,RF_large
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
vertebral,0.004,0.0041,0.0083,0.0125,0.0045,0.0059,0.0047,0.0053,0.0059,0.0063
wilt,0.0174,0.0188,0.0272,0.008,0.0035,0.0219,0.0065,0.0208,0.0204,0.0201
parkinsons,0.0129,0.0122,0.0096,0.0298,0.0149,0.0105,0.0276,0.013,0.0107,0.0149
heart,0.0025,0.0051,0.0039,0.0037,0.0046,0.0032,0.0028,0.0051,0.0055,0.0053
wdbc,0.011,0.0114,0.0247,0.0199,0.0159,0.0087,0.0138,0.0114,0.0111,0.0119
bank,0.0209,0.0137,0.029,0.0542,0.0726,0.0259,0.0304,0.016,0.0184,0.0196
ionosphere,0.0137,0.0119,0.0196,0.0491,0.0134,0.0366,0.0165,0.0158,0.0152,0.0154
HRCompetencyScores,0.0193,0.0162,0.0068,0.0414,0.0174,0.0166,0.0098,0.0179,0.0166,0.0155
spambase,0.0101,0.0095,0.0021,0.0008,0.0016,0.0174,0.0009,0.011,0.0102,0.0107
QSAR,0.0032,0.0036,0.0015,0.0028,0.0015,0.0019,0.002,0.0032,0.0031,0.0029


In [9]:
# Display the accuracy table rounded to four decimal places.
tables["acc"].round(decimals=4)

Unnamed: 0_level_0,RF_d,RF_opt,Platt,ISO,Beta,CRF,VA,RF_ens_k,RF_ens_r,RF_large
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
vertebral,0.8368,0.8374,0.8316,0.8265,0.8355,0.8258,0.8303,0.8303,0.8303,0.8271
wilt,0.983,0.9835,0.9835,0.983,0.983,0.9819,0.9828,0.9831,0.9835,0.9836
parkinsons,0.9097,0.8697,0.8687,0.8615,0.8585,0.8656,0.8656,0.8913,0.8759,0.8738
heart,0.8159,0.8232,0.8178,0.8079,0.8119,0.8198,0.8106,0.8271,0.8245,0.8225
wdbc,0.9589,0.9614,0.9512,0.9522,0.9526,0.9529,0.9522,0.961,0.9607,0.9603
bank,0.9931,0.9926,0.9915,0.9911,0.9908,0.9914,0.9907,0.9923,0.9929,0.993
ionosphere,0.9339,0.9305,0.9334,0.9259,0.93,0.9322,0.9277,0.9357,0.9374,0.9362
HRCompetencyScores,0.9273,0.928,0.9193,0.914,0.9107,0.92,0.912,0.9287,0.926,0.9247
spambase,0.9543,0.9522,0.9503,0.95,0.9507,0.9509,0.9502,0.9512,0.9534,0.9535
QSAR,0.8694,0.8715,0.8624,0.8578,0.8599,0.8645,0.8571,0.8684,0.8686,0.869


In [10]:
import numpy as np
import scipy.stats as stats

# Pull the "Rank" row of each metric table out as a NumPy array
# (one entry per table column).
# Only the brier and acc ranks are compared below; ece_ranks and
# logloss_ranks are extracted but not used in this cell.
ece_ranks, brier_ranks, logloss_ranks, acc_ranks = (
    np.array(tables[name].loc["Rank"])
    for name in ("ece", "brier", "logloss", "acc")
)

# Kendall rank correlation between the Brier ranking and the accuracy ranking.
tau, p_value = stats.kendalltau(brier_ranks, acc_ranks)
print(f"tau {tau} p_value {p_value}")

tau 0.7191465199607916 p_value 0.004057136032371292
