In [1]:
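# How changing the dataset sample size affects the calibration methods.
# Fix the training dataset size and vary the number of calibration-set samples; the best method is the one that achieves the best calibration with the least data.
# NOTE(review): params below set "exp_key" to "data_name" (a sweep over datasets), not a sample size - confirm this header still describes the experiment.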

# imports
import sys
import numpy as np
import pandas as pd
# Make modules in the parent and grandparent directories importable
# (the project imports that follow rely on these entries being present).
sys.path.extend(['../../', '../'])

# Widen pandas' display limits so large result tables are shown in full.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

import core_exp as cx
import core_calib as cal
import core_tools as ct

In [2]:
# Experiment configuration. The finished dict is handed to ct.save_params
# at the end of this cell; later cells read individual entries from it
# (e.g. "exp_key", "exp_values", "metrics", "calib_methods", "exp_name").
params = dict(
    # --- experiment ---
    seed=0,
    runs=5,
    # previously used name: "main_run5_cv5_21data_100trees_40opt_fast"
    exp_name=ct.generate_readable_short_id("bostrom_laplace0"),
    path="../../",
    split="CV",  # options noted by the author: CV, random_split
    cv_folds=5,
    plot=False,
    calib_methods=[
        "RF_d", "RF_opt",
        "Platt", "ISO", "Beta", "CRF", "VA",
        "tlr", "Rank",  # "Elkan" currently disabled
        # "RF_ens_k", "RF_ens_r",  (currently disabled)
        "RF_large",
    ],

    metrics=["acc", "brier", "ece", "logloss"],

    # --- calibration parameters ---
    bin_strategy="uniform",
    ece_bins=20,
    boot_size=1000,
    boot_count=5,

    # --- random-forest hyper-parameter optimisation ---
    hyper_opt=True,
    opt_cv=5,
    opt_n_iter=50,
    opt_top_K=5,
    search_space=dict(
        n_estimators=[10],
        max_depth=list(range(2, 100)),
        criterion=["gini", "entropy"],
        max_features=['sqrt', 'log2', None],
        min_samples_split=list(range(2, 11)),
        min_samples_leaf=list(range(1, 11)),
        # bootstrap=[True, False],  # whether bootstrap samples are used (disabled)
        class_weight=[None, 'balanced', 'balanced_subsample'],  # weight for each class
        oob_score=[False],
    ),

    oob=False,
    laplace=0,  # author's note: if set to 1, OOB must be false

    # --- what the experiment iterates over ---
    exp_key="data_name",
    exp_values=[
        "vertebral",
        "wilt",
        "parkinsons",
        "heart",
        "wdbc",
        "bank",
        "ionosphere",
        "HRCompetencyScores",
        "spambase",
        "QSAR",
        "diabetes",
        "breast",
        "SPF",
        "hillvalley",
        "pc4",
        "scene",
        "Sonar_Mine_Rock_Data",
        "Customer_Churn",
        "jm1",
        "eeg",
        # "madelon",   (disabled)
        # "phoneme",   (disabled)
        # "nomao",     (disabled)
    ],
)

ct.save_params(params)

In [3]:
# Run the experiment: cx.run_exp receives the key being varied, the list of
# values for that key, and the full params dict, and returns a pair that is
# unpacked into calib_results_dict and data_list. The captured output shows
# one "exp_param <value> done" line per entry of params["exp_values"].
calib_results_dict, data_list = cx.run_exp(params["exp_key"], params["exp_values"], params)
# Hand the results to ct.save_results together with the experiment name.
ct.save_results(calib_results_dict, params['exp_name'])

exp_param vertebral done
exp_param wilt done
exp_param parkinsons done
exp_param heart done
exp_param wdbc done
exp_param bank done
exp_param ionosphere done
exp_param HRCompetencyScores done




exp_param spambase done
exp_param QSAR done
exp_param diabetes done
exp_param breast done
exp_param SPF done
exp_param hillvalley done
exp_param pc4 done
exp_param scene done
exp_param Sonar_Mine_Rock_Data done
exp_param Customer_Churn done




exp_param jm1 done




exp_param eeg done


In [4]:
# Build the summary tables from the experiment results for the configured
# metrics and calibration methods. The result is indexed by metric name in
# the following cell (tables["brier"]).
# NOTE(review): mean_and_rank / std presumably toggle extra mean, rank and
# standard-deviation output - confirm against cal.mean_and_ranking_table.
tables = cal.mean_and_ranking_table(calib_results_dict, 
                                    params["metrics"], 
                                    params["calib_methods"], 
                                    data_list, 
                                    mean_and_rank=True, 
                                    std=True)
# Pass the tables to ct.save_metrics_to_csv along with the metric list and experiment name.
ct.save_metrics_to_csv(tables, params["metrics"], params['exp_name'])

In [5]:
# Display the "brier" table rounded to 4 decimals; in the captured output
# rows are datasets and columns are the calibration methods.
tables["brier"].round(4)


Unnamed: 0_level_0,RF_d,RF_opt,Platt,ISO,Beta,CRF,VA,tlr,Rank,RF_large
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
vertebral,0.1172,0.1098,0.1234,0.1283,0.1244,0.1229,0.1232,0.1273,0.1459,0.1131
wilt,0.0144,0.0125,0.0138,0.0138,0.0148,0.0131,0.0134,0.0143,0.0173,0.0119
parkinsons,0.0817,0.0958,0.1029,0.1083,0.107,0.1027,0.1017,0.1086,0.1295,0.0911
heart,0.1457,0.1393,0.1411,0.1494,0.1427,0.1438,0.143,0.144,0.1533,0.1315
wdbc,0.0366,0.035,0.0389,0.0404,0.0398,0.0392,0.0398,0.0393,0.0559,0.0328
bank,0.0072,0.0076,0.0077,0.0091,0.0087,0.008,0.0088,0.0098,0.0245,0.0062
ionosphere,0.0615,0.0632,0.0669,0.0748,0.0685,0.0656,0.0708,0.0685,0.0872,0.0591
HRCompetencyScores,0.071,0.0703,0.0752,0.0791,0.08,0.0745,0.0764,0.0774,0.0884,0.0689
spambase,0.0457,0.0459,0.0449,0.0455,0.0505,0.0461,0.0453,0.0496,0.0461,0.0415
QSAR,0.1076,0.1043,0.1098,0.1129,0.1148,0.1095,0.1118,0.1157,0.115,0.0983


In [6]:
# Run ct.res_statistics over the tables for every configured metric; the
# captured output reports a Friedman test statistic and p-value per metric.
# NOTE(review): the third argument ("results/<exp_name>") looks like an
# output location - confirm how ct.res_statistics uses it.
ct.res_statistics(tables, params["metrics"], f"results/{params['exp_name']}")

metric acc
Friedman Test Statistic: 95.19446172993159
P-value: 1.4692708214968574e-16
The differences between groups are significant.
metric brier
Friedman Test Statistic: 102.04194528875382
P-value: 6.075313130907481e-18
The differences between groups are significant.
metric ece
Friedman Test Statistic: 46.83282674772042
P-value: 4.2162965408832665e-07
The differences between groups are significant.
metric logloss
Friedman Test Statistic: 118.58662613981765
P-value: 2.6005797554318194e-21
The differences between groups are significant.
