In [1]:
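# How changing the dataset sample size affects the calibration methods
# Fix the training dataset size and vary the number of calibration-set samples - the best method is the one that reaches the best calibration with the least data
# NOTE(review): the sweep configured in this notebook iterates over "data_name" (datasets), not sample sizes - confirm this header is still accurate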

# imports
import sys
from pathlib import Path
import pandas as pd
sys.path.append('../../') # to access the files in higher directories
sys.path.append('../') # to access the files in higher directories
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import core_exp as cx
import core_calib as cal

In [2]:
# Experiment configuration. The whole dict is handed to cx.run_exp; this notebook
# itself only reads "metrics", "calib_methods" and "exp_name" afterwards. The exact
# meaning of the remaining keys is defined in core_exp / core_calib, not here.
params = {
    # Experiment-level settings
    "runs": 5,
    "exp_name": "real",  # also used as the sub-directory name under ./results/
    "cv_folds": 3,
    "plot": True,
    # Calibration methods to compare; the result tables use these names as columns.
    "calib_methods": ["RF_d", "RF_opt",
                      "Platt", # "Platt_d", 
                      "ISO", "Beta", "CRF", "VA",
                      "RF_ens_k"#, "RF_large",
                      ],
    
    # One result table is produced per metric listed here.
    "metrics": ["acc", "brier", "ece", "logloss", "auc"],

    # Calibration parameters
    "bin_strategy": "uniform",
    "ece_bins": 20,
    "boot_size": 1000,
    "boot_count": 5,

    # Random-forest hyper-parameter optimisation
    # (presumably a randomized search with opt_n_iter candidates and opt_cv folds - TODO confirm in core_exp)
    "hyper_opt": True,
    "opt_cv":5, 
    "opt_n_iter":40,
    "opt_top_K": 5,
    "search_space": {
                    "n_estimators": [20],
                    "max_depth": [2,3,4,5,6,7,8,10,15,20,30,40,50,60,100],
                    "criterion": ["gini", "entropy"],
                    "max_features": ["sqrt", "log2"],
                    "min_samples_split": [2,3,4,5],
                    "min_samples_leaf": [1,2,3],
                    },
    
    # NOTE(review): same value as the single entry in search_space["n_estimators"];
    # keep the two in sync when changing the forest size.
    "n_estimators": 20
}

# Sweep definition passed to cx.run_exp: exp_key names the setting that varies and
# exp_values lists the values it takes (here: dataset names).
# Presumably run_exp performs one experiment per entry - TODO confirm in core_exp.
exp_key = "data_name"
exp_values = [
              "vertebral", 
              "wilt",
              "parkinsons", 
              "heart",
              "wdbc",
              "bank", 
              "ionosphere", 
              "HRCompetencyScores",
              "spambase", 
              "QSAR", 
              # Datasets below are currently excluded from the run.
              # "diabetes", 
              # "breast", 
              # "SPF",
              # "hillvalley",
              # "pc4",
              # "scene",
              # "Sonar_Mine_Rock_Data",
              # "Customer_Churn",
              # "jm1",
              # "eeg",
              # "madelon",
              # "phoneme",
              # "nomao",
              ]
# exp_values = ["wilt"]

In [3]:
# Run the configured sweep. run_exp returns two objects that are fed unchanged into
# cal.mean_and_ranking_table in the next cell.
calib_results_dict, data_list = cx.run_exp(exp_key, exp_values, params)

In [4]:
# Build the summary tables from the experiment results. The result is indexed by
# metric name further down (tables["brier"], tables["ece"], ...).
tables = cal.mean_and_ranking_table(
    calib_results_dict,
    params["metrics"],
    params["calib_methods"],
    data_list,
    mean_and_rank=True,
    std=True,
)

In [5]:
# Report the top-ranked calibration method for every metric and save each metric
# table as CSV under ./results/<exp_name>/.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output directory exists before writing; otherwise a fresh checkout fails with
# OSError only after the (expensive) experiment has already finished.
results_dir = Path("./results") / params["exp_name"]
results_dir.mkdir(parents=True, exist_ok=True)

for metric in params["metrics"]:
    # argmin gives the *position* of the smallest value in the "Rank" row; mapping
    # it back through params["calib_methods"] assumes the table columns follow
    # that list's order.
    best_method = params["calib_methods"][tables[metric].loc["Rank"].argmin()]
    print(metric, " ", best_method)
    tables[metric].to_csv(results_dir / f"{metric}.csv")


acc   RF_ens_k
brier   RF_ens_k
ece   VA
logloss   RF_ens_k
auc   RF_ens_k


In [6]:
# Show the Brier-score table, rounded to 4 decimal places.
tables["brier"].round(4)


Unnamed: 0_level_0,RF_d,RF_opt,Platt,ISO,Beta,CRF,VA,RF_ens_k
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
vertebral,0.1158,0.1126,0.1277,0.1287,0.1257,0.128,0.1271,0.1079
wilt,0.0145,0.0141,0.0155,0.0153,0.0162,0.0164,0.0152,0.0138
parkinsons,0.0876,0.102,0.1085,0.1156,0.1121,0.1058,0.1108,0.0884
heart,0.1342,0.1346,0.135,0.1395,0.1362,0.1351,0.138,0.128
wdbc,0.0337,0.0315,0.0354,0.0386,0.0362,0.0369,0.0383,0.0325
bank,0.0081,0.0078,0.0112,0.012,0.0116,0.0122,0.0116,0.0071
ionosphere,0.0581,0.0576,0.0615,0.0627,0.0618,0.0641,0.063,0.0565
HRCompetencyScores,0.0677,0.0655,0.0679,0.0699,0.0736,0.069,0.0701,0.0632
spambase,0.0433,0.0435,0.0457,0.0463,0.0496,0.0475,0.0462,0.0432
QSAR,0.1022,0.1019,0.108,0.1095,0.1094,0.1086,0.1093,0.0981


In [7]:
# Show the log-loss table, rounded to 4 decimal places.
tables["logloss"].round(4)

Unnamed: 0_level_0,RF_d,RF_opt,Platt,ISO,Beta,CRF,VA,RF_ens_k
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
vertebral,0.5016,0.4096,0.3986,0.681,0.4139,0.4537,0.3922,0.3371
wilt,0.1,0.0902,0.0642,0.0945,0.075,0.0992,0.0596,0.0582
parkinsons,0.3135,0.3188,0.3472,1.2971,0.3979,0.4011,0.3481,0.2846
heart,0.4562,0.4189,0.4228,0.7489,0.4662,0.466,0.429,0.4057
wdbc,0.2328,0.1811,0.1341,0.3795,0.2095,0.2406,0.1389,0.1324
bank,0.0362,0.0339,0.0441,0.1393,0.0443,0.053,0.0437,0.0336
ionosphere,0.3508,0.2804,0.2202,0.5786,0.2766,0.3201,0.225,0.2085
HRCompetencyScores,0.598,0.3151,0.2411,0.5568,0.3077,0.3666,0.2449,0.2268
spambase,0.263,0.2091,0.1656,0.197,0.2188,0.2253,0.1673,0.1713
QSAR,0.4878,0.3915,0.3527,0.4945,0.4015,0.4379,0.3561,0.3388


In [8]:
# Show the ECE table, rounded to 4 decimal places.
tables["ece"].round(4)

Unnamed: 0_level_0,RF_d,RF_opt,Platt,ISO,Beta,CRF,VA,RF_ens_k
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
vertebral,0.004,0.004,0.0062,0.0053,0.0023,0.004,0.0035,0.006
wilt,0.0163,0.0199,0.0141,0.0062,0.0179,0.0305,0.0052,0.0241
parkinsons,0.0061,0.0068,0.0101,0.0322,0.0118,0.0037,0.0187,0.0125
heart,0.0055,0.0042,0.0023,0.0041,0.0036,0.0042,0.0027,0.0061
wdbc,0.0106,0.0149,0.003,0.0251,0.0071,0.0053,0.011,0.015
bank,0.0195,0.0199,0.0256,0.042,0.0181,0.0283,0.0229,0.0171
ionosphere,0.0087,0.0099,0.0106,0.0153,0.015,0.0103,0.0095,0.0128
HRCompetencyScores,0.0169,0.0182,0.0074,0.0297,0.0199,0.0087,0.007,0.0223
spambase,0.0097,0.01,0.0013,0.0015,0.0048,0.0083,0.0011,0.0111
QSAR,0.0014,0.0018,0.0006,0.0012,0.0024,0.0025,0.0012,0.0042


In [9]:
# Show the accuracy table, rounded to 4 decimal places.
tables["acc"].round(4)

Unnamed: 0_level_0,RF_d,RF_opt,Platt,ISO,Beta,CRF,VA,RF_ens_k
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
vertebral,0.818,0.831,0.8084,0.8065,0.8149,0.8123,0.813,0.8349
wilt,0.9807,0.9814,0.9806,0.9805,0.9786,0.9781,0.9808,0.9814
parkinsons,0.88,0.8513,0.8472,0.8318,0.839,0.8544,0.8318,0.8677
heart,0.8033,0.8092,0.8059,0.7974,0.798,0.804,0.798,0.8224
wdbc,0.9582,0.9649,0.9557,0.9455,0.9543,0.9529,0.9466,0.9599
bank,0.9918,0.9911,0.9853,0.9851,0.9848,0.9845,0.9853,0.9926
ionosphere,0.9276,0.9288,0.9208,0.9185,0.9214,0.9214,0.9191,0.9339
HRCompetencyScores,0.9213,0.928,0.9113,0.912,0.9013,0.912,0.91,0.9273
spambase,0.9485,0.9488,0.9405,0.9411,0.9371,0.9411,0.9409,0.9487
QSAR,0.8563,0.8584,0.8516,0.8495,0.8502,0.8521,0.8499,0.8646


In [10]:
# Kendall's tau between the "Rank" rows of the Brier and accuracy tables, i.e. how
# closely the two metrics agree on the ordering of the calibration methods.
from scipy import stats
import numpy as np

# Pull the "Rank" row of each metric table out as a NumPy array.
ece_ranks, brier_ranks, logloss_ranks, acc_ranks = (
    np.array(tables[metric_name].loc["Rank"])
    for metric_name in ("ece", "brier", "logloss", "acc")
)

tau, p_value = stats.kendalltau(brier_ranks, acc_ranks)
print(f"tau {tau} p_value {p_value}")

tau 0.836501912571304 p_value 0.0041367370986766456
