In [1]:
# How changing the dataset sample size effects the calibration methods
# Fix training dataset size and change the calib set samples - best method is one that gets max calib with least data

# imports
import sys
import pandas as pd
sys.path.append('../../') # to access the files in higher directories
sys.path.append('../') # to access the files in higher directories
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import core_exp as cx
import core_calib as cal

In [2]:
params = {
    # exp
    "runs": 15,
    "exp_name": "split_test_real",
    "split": "random_split", #CV
    "test_split": 0.3,
    "calib_split": 0.1,
    "cv_folds": 5,
    "plot": True,
    "calib_methods": ["RF_d", "RF_opt",
                      "Platt", "ISO", "Beta", "CRF", "VA",
                      "RF_ens_k", "RF_ens_r", "RF_large",
                      ],
    
    "metrics": ["acc", "brier", "ece", "logloss", "auc"],

    # calib param
    "bin_strategy": "uniform",
    "ece_bins": 20,
    "boot_size": 1000,
    "boot_count": 5,

    # RF hyper opt
    "hyper_opt": True,
    "opt_cv":5, 
    "opt_n_iter":20,
    "opt_top_K": 5,
    "search_space": {
                    "n_estimators": [100],
                    "max_depth": [2,3,4,5,6,7,8,10,15,20,30,40,50,60,100],
                    "criterion": ["gini", "entropy"],
                    "max_features": ["sqrt", "log2"],
                    "min_samples_split": [2,3,4,5],
                    "min_samples_leaf": [1,2,3],
                    "oob_score": [False]

                    },
    
    "n_estimators": 100,
    "oob": False,
}

exp_key = "data_name"
exp_values = [
              "vertebral", 
              "wilt",
              "parkinsons", 
              "heart",
              "wdbc",
            #   "bank", 
            #   "ionosphere", 
            #   "HRCompetencyScores",
              "spambase", 
            #   "QSAR", 
            #   "diabetes", 
            #   "breast", 
            #   "SPF",
            #   "hillvalley",
            #   "pc4",
            #   "scene",
            #   "Sonar_Mine_Rock_Data",
            #   "Customer_Churn",
            #   "jm1",
            #   "eeg",
              # "madelon",
              # "phoneme",
              # "nomao",
              ]
# exp_values = ["wilt"]

In [3]:
calib_results_dict, data_list = cx.run_exp(exp_key, exp_values, params)

exp_param vertebral done
exp_param wilt done
exp_param parkinsons done
exp_param heart done
exp_param wdbc done
exp_param spambase done


In [4]:
tables = cal.mean_and_ranking_table(calib_results_dict, 
                                    params["metrics"], 
                                    params["calib_methods"], 
                                    data_list, 
                                    mean_and_rank=True, 
                                    std=True)

In [5]:
import matplotlib.pyplot as plt
import numpy as np

def vialin_plot(results_dict, metrics, calib_methods, data_list):

    # save results as txt
    df_dict = {}
    for data in data_list:
        for metric in metrics:
            df = pd.DataFrame(columns=calib_methods)
            for method in calib_methods:
                df[method] = np.array(results_dict[data+ "_" + method + "_"+ metric])
            print("df", df.head())
            fig, ax1 = plt.subplots()
            ax1.violinplot(df, showmeans=True) 
            ax1.set_xticks(np.arange(len(calib_methods)+1), labels=[""]+ calib_methods)
            # Rotate the tick labels by 90 degrees
            plt.xticks(rotation = 90) 
            plt.savefig(f"results/vialin_plot/{data}_{metric}.pdf", format='pdf', transparent=True)
            plt.close()        
    return df_dict

In [6]:
vialin_plot(calib_results_dict, 
                params["metrics"], 
                params["calib_methods"], 
                data_list,)

df        RF_d    RF_opt     Platt       ISO      Beta       CRF        VA  RF_ens_k  RF_ens_r  RF_large
0  0.849462  0.838710  0.849462  0.817204  0.817204  0.827957  0.817204  0.860215  0.849462  0.849462
1  0.817204  0.827957  0.860215  0.860215  0.860215  0.827957  0.860215  0.827957  0.817204  0.827957
2  0.860215  0.860215  0.795699  0.806452  0.806452  0.860215  0.806452  0.849462  0.838710  0.870968
3  0.870968  0.881720  0.870968  0.881720  0.870968  0.881720  0.870968  0.838710  0.817204  0.838710
4  0.784946  0.784946  0.806452  0.817204  0.817204  0.774194  0.817204  0.784946  0.774194  0.774194
df        RF_d    RF_opt     Platt       ISO      Beta       CRF        VA  RF_ens_k  RF_ens_r  RF_large
0  0.102274  0.101916  0.108483  0.113530  0.101194  0.102119  0.110612  0.096642  0.097773  0.097125
1  0.097719  0.095889  0.108745  0.105884  0.100839  0.098616  0.112141  0.100127  0.100659  0.099362
2  0.110825  0.108784  0.131605  0.194114  0.178420  0.110813  0.140914  0.1

{}

In [7]:
for metric in params["metrics"]:
    print(metric, " ", params["calib_methods"][tables[metric].loc["Rank"].argmin()])
    tables[metric].round(5).to_latex(f"./results/{params['exp_name']}/{metric}.csv")


acc   RF_d
brier   RF_large
ece   RF_d
logloss   RF_large
auc   RF_large


  tables[metric].round(5).to_latex(f"./results/{params['exp_name']}/{metric}.csv")
  tables[metric].round(5).to_latex(f"./results/{params['exp_name']}/{metric}.csv")
  tables[metric].round(5).to_latex(f"./results/{params['exp_name']}/{metric}.csv")
  tables[metric].round(5).to_latex(f"./results/{params['exp_name']}/{metric}.csv")
  tables[metric].round(5).to_latex(f"./results/{params['exp_name']}/{metric}.csv")


In [8]:
for metric in params["metrics"]:
    print(metric, " ", params["calib_methods"][tables[metric].loc["Rank"].argmin()])
    tables[metric+ "_std"].round(5).to_latex(f"./results/{params['exp_name']}/{metric}_std.csv")


acc   RF_d
brier   RF_large
ece   RF_d
logloss   RF_large
auc   RF_large


  tables[metric+ "_std"].round(5).to_latex(f"./results/{params['exp_name']}/{metric}_std.csv")
  tables[metric+ "_std"].round(5).to_latex(f"./results/{params['exp_name']}/{metric}_std.csv")
  tables[metric+ "_std"].round(5).to_latex(f"./results/{params['exp_name']}/{metric}_std.csv")
  tables[metric+ "_std"].round(5).to_latex(f"./results/{params['exp_name']}/{metric}_std.csv")
  tables[metric+ "_std"].round(5).to_latex(f"./results/{params['exp_name']}/{metric}_std.csv")


In [9]:
tables["brier"].round(4)


Unnamed: 0_level_0,RF_d,RF_opt,Platt,ISO,Beta,CRF,VA,RF_ens_k,RF_ens_r,RF_large
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
vertebral,0.1065,0.1075,0.1187,0.1392,0.1329,0.1173,0.1216,0.1062,0.1072,0.1066
wilt,0.013,0.0127,0.0133,0.0134,0.0126,0.0135,0.0127,0.0128,0.0126,0.0126
parkinsons,0.0763,0.0778,0.0877,0.0957,0.0982,0.081,0.0938,0.0792,0.0776,0.0776
heart,0.1322,0.1291,0.1361,0.1531,0.1389,0.1299,0.1432,0.1286,0.1289,0.1285
wdbc,0.0343,0.0331,0.0357,0.0379,0.0366,0.0327,0.0399,0.0336,0.0334,0.0331
spambase,0.0416,0.0412,0.0389,0.0409,0.0416,0.0403,0.0404,0.0428,0.0407,0.0409
Mean,0.0673,0.0669,0.0717,0.0801,0.0768,0.0691,0.0753,0.0672,0.0667,0.0666
Rank,5.0,4.5,6.1667,8.6667,7.8333,5.0,7.1667,4.8333,3.3333,2.5


In [10]:
tables["brier_std"].round(5)


Unnamed: 0_level_0,RF_d_std,RF_opt_std,Platt_std,ISO_std,Beta_std,CRF_std,VA_std,RF_ens_k_std,RF_ens_r_std,RF_large_std
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
vertebral,0.01746,0.01773,0.02219,0.03838,0.04055,0.02444,0.02109,0.017,0.01695,0.01747
wilt,0.00158,0.00173,0.00248,0.00249,0.0021,0.00192,0.00156,0.00159,0.00153,0.00155
parkinsons,0.01474,0.01515,0.02233,0.03615,0.03758,0.02562,0.02623,0.01601,0.01549,0.01569
heart,0.01641,0.01565,0.01964,0.02355,0.02183,0.02175,0.02179,0.01514,0.0161,0.01674
wdbc,0.00902,0.00879,0.01182,0.01641,0.01569,0.01198,0.01273,0.00892,0.0088,0.00855
spambase,0.00266,0.00247,0.00298,0.00284,0.00908,0.00288,0.00275,0.00242,0.00236,0.00235


In [11]:
tables["logloss"].round(4)

Unnamed: 0_level_0,RF_d,RF_opt,Platt,ISO,Beta,CRF,VA,RF_ens_k,RF_ens_r,RF_large
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
vertebral,0.3302,0.3309,0.3795,2.7781,1.1811,0.3601,0.3821,0.3283,0.3309,0.3289
wilt,0.0619,0.0598,0.0545,0.2117,0.0636,0.0596,0.0491,0.054,0.0516,0.0516
parkinsons,0.2501,0.2523,0.3023,1.7412,1.1978,0.2585,0.3148,0.2571,0.2531,0.2539
heart,0.4395,0.4086,0.4348,2.2859,0.785,0.4105,0.4479,0.4067,0.4068,0.4062
wdbc,0.2117,0.1717,0.1474,0.6452,0.2786,0.1787,0.1587,0.1384,0.1253,0.1356
spambase,0.1733,0.1776,0.1439,0.2501,0.1683,0.1637,0.1482,0.1632,0.1629,0.1604
Mean,0.2445,0.2335,0.2437,1.3187,0.6124,0.2385,0.2501,0.2246,0.2218,0.2228
Rank,5.8333,5.3333,5.0,10.0,8.6667,6.0,5.3333,3.3333,3.0,2.5


In [12]:
tables["ece"].round(4)

Unnamed: 0_level_0,RF_d,RF_opt,Platt,ISO,Beta,CRF,VA,RF_ens_k,RF_ens_r,RF_large
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
vertebral,0.0072,0.0057,0.004,0.0346,0.0139,0.0121,0.0222,0.0085,0.0074,0.0066
wilt,0.0232,0.024,0.0366,0.0147,0.0057,0.0286,0.0091,0.0261,0.0241,0.0262
parkinsons,0.0096,0.0123,0.0128,0.0761,0.0307,0.0211,0.0231,0.0099,0.008,0.0125
heart,0.0033,0.0079,0.0053,0.0609,0.0082,0.0051,0.0182,0.0073,0.0067,0.0062
wdbc,0.0123,0.0132,0.0163,0.019,0.014,0.0134,0.0091,0.011,0.01,0.0135
spambase,0.0103,0.01,0.0041,0.0044,0.0061,0.0077,0.0026,0.0106,0.0103,0.0106
Mean,0.011,0.0122,0.0132,0.0349,0.0131,0.0146,0.014,0.0122,0.0111,0.0126
Rank,3.6667,4.8333,5.1667,7.6667,6.3333,6.0,5.0,5.6667,4.5,6.1667


In [13]:
tables["acc"].round(4)

Unnamed: 0_level_0,RF_d,RF_opt,Platt,ISO,Beta,CRF,VA,RF_ens_k,RF_ens_r,RF_large
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
vertebral,0.8387,0.8315,0.8272,0.8251,0.8272,0.8258,0.8287,0.8315,0.8244,0.8258
wilt,0.9829,0.983,0.9826,0.9828,0.984,0.982,0.9826,0.9827,0.9829,0.9828
parkinsons,0.8949,0.8927,0.8859,0.8678,0.8757,0.8904,0.8655,0.8927,0.8949,0.896
heart,0.8168,0.8154,0.8176,0.7941,0.8147,0.822,0.8029,0.8176,0.8161,0.8176
wdbc,0.9587,0.9591,0.9552,0.9524,0.954,0.9575,0.9501,0.9598,0.9598,0.9626
spambase,0.9504,0.9503,0.9485,0.9482,0.946,0.9489,0.9485,0.9472,0.9504,0.9498
Mean,0.9071,0.9053,0.9028,0.8951,0.9003,0.9044,0.8964,0.9053,0.9048,0.9058
Rank,3.25,3.8333,6.25,8.5,6.6667,6.0,8.0833,4.6667,4.25,3.5


In [14]:
import scipy.stats as stats
import numpy as np

ece_ranks = np.array(tables["ece"].loc["Rank"])
brier_ranks = np.array(tables["brier"].loc["Rank"])
logloss_ranks = np.array(tables["logloss"].loc["Rank"])
acc_ranks = np.array(tables["acc"].loc["Rank"])

tau, p_value = stats.kendalltau(brier_ranks, acc_ranks)
print(f"tau {tau} p_value {p_value}")

tau 0.7191465199607915 p_value 0.004057136032371292
