In [1]:
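# How changing the dataset sample size affects the calibration methods.
# Fix the training dataset size and vary the number of calibration-set samples;
# the best method is the one that achieves the best calibration with the least data.
# NOTE(review): the params cell in this notebook sweeps over "data_name" (datasets),
# not over the calibration-set size - confirm this header still matches the experiment.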

# imports
import sys
import numpy as np
import pandas as pd
# Make modules that live one and two directory levels above this notebook
# importable (the paths are relative to the current working directory).
sys.path.extend(['../../', '../'])

# Raise pandas' display limits so wide/long result tables are not truncated.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

import core_exp as cx
import core_calib as cal
import core_tools as ct

import warnings
from sklearn.exceptions import DataConversionWarning

# Ignore all warnings from scikit-learn
warnings.filterwarnings("ignore", module="sklearn")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
# Add more categories if needed



In [2]:
# Experiment configuration. This one dict is handed to ct.save_params and
# cx.run_exp, and its "metrics", "calib_methods", "calib_method_colors" and
# "exp_name" entries are read again by the reporting cells of this notebook.
params = {
    # exp
    "seed": 0,
    "runs": 5,
    # NOTE(review): presumably returns a fresh id on every execution, so each
    # run is saved under a new experiment name - confirm in core_tools.
    "exp_name": ct.generate_readable_short_id("test"), #"main_run5_cv5_21data_100trees_40opt_fast",
    "path": "../../",
    "split": "CV", #CV, random_split
    "cv_folds": 5,
    "plot": False,

    # Calibration methods that are active in this run (commented entries are disabled).
    "calib_methods": [
                      # "RF_d",
                      "RF_opt", 
                      # "RF_large", 
                      "Platt", "ISO", "Beta", "VA",
                      "CT", "PPA",
                      # "Rank",
                      "Shaker"
                      ],

    # Plot colour per calibration method; the keys mirror "calib_methods" above,
    # so enable/disable entries in both places together.
    "calib_method_colors": {
        # "RF_d": "blue", 
        "RF_opt": "orange", 
        # "RF_large": "red",
        "Platt": "Brown", 
        "ISO": "purple", 
        "Beta": "magenta", 
        "VA": "gray",
        "CT": "slategray",
        "PPA": "olive", 
        # "Rank": "silver",
        "Shaker": "gold",         

    },

    # Alternative configuration (disabled): colours for a comparison of base learners.
    # "calib_method_colors": {
    #     "RF_d": "blue", 
    #     "RF_opt": "orange", 
    #     "RF_large": "red",
    #     "DNN_ens": "olive", 
    #     "XGB": "olive", 
    #     "DT": "black", 
    #     "LR": "black", 
    #     "SVM": "black", 
    #     "DNN": "black",
    #     "GNB": "black",
    # },

    # Alternative configuration (disabled): method list matching the colours above.
    # "calib_methods": [
    #     "RF_d",
    #     # "Shaker"
    #     # "RF_opt", 
    #     # "RF_large",
    #     # "DNN_ens", 
    #     # "XGB_opt",
    #     # "DT_opt", 
    #     # "LR_opt", "SVM_opt", "DNN_opt", 
    #     # "GNB_opt"          
    #                   ],


    # Metric names; the result tables are indexed by these (e.g. tables["brier"]).
    "metrics": ["acc", "brier", "ece", "logloss", "time"],

    # calib param   
    "bin_strategy": "uniform",
    "ece_bins": 20,
    "boot_size": 1000, 
    "boot_count": 5,

    # RF hyper opt
    "hyper_opt": True,
    "opt_cv":5, 
    "opt_n_iter":50,
    "opt_top_K": 5,
    # Candidate values per hyper-parameter.
    # NOTE(review): only n_estimators=10 is offered here, while the archived
    # experiment name above mentions "100trees" - confirm this is intended.
    "search_space": {
                    "n_estimators": [10],
                    # depths 2..99 plus None
                    "max_depth":  np.arange(2, 100).tolist() + [None], # [None, 5, 10, 15, 20, 30, 50], #
                    "criterion": ["gini", "entropy"],
                    "max_features": ['sqrt', 'log2', None],
                    "min_samples_split": np.arange(2, 11).tolist(),  # 2..10
                    "min_samples_leaf":  np.arange(1, 11).tolist(),  # 1..10
                    'class_weight': [None, 'balanced', 'balanced_subsample'],  # Weight for each class
                    "oob_score": [False],
                    "laplace": [0,1]
                    },
    
    "oob": False,
    "laplace":1, # if set to 1, OOB must be false
    "curt_v": np.arange(1, 100).tolist(),  # 1..99

    # The swept parameter and its values; both are passed to cx.run_exp.
    # Here the sweep is over dataset names, with only two datasets enabled.
    "exp_key": "data_name",
    "exp_values": [
                  "cm1",
                  "datatrieve",
                  # "kc1_class_level_defectiveornot",
                  # "kc1",
                  # "kc2",
                  # "kc3",
                  # "pc1",
                  # "spect",
                  # "spectf",
                  # "vertebral", 
                  # "wilt",
                  # "parkinsons", 
                  # "heart",
                  # "wdbc",
                  # "bank", 
                  # "ionosphere", 
                #   "HRCompetencyScores",
                #   "spambase", 
                #   "QSAR", 
                #   "diabetes", 
                #   "breast", 
                #   "SPF",
                #   "hillvalley",
                #   "pc4",
                #   "scene",
                #   "Sonar_Mine_Rock_Data",
                #   "Customer_Churn",
                #   "jm1",
                #   "eeg",
                #   "phoneme",


                  # "madelon",
                  # "nomao",
                  ]
}

# Store the configuration alongside the results of this run.
# NOTE(review): where and in which format it is written is decided inside core_tools.
ct.save_params(params)

In [3]:
# Run the configured sweep over params["exp_values"]; the recorded output
# shows one "exp_param <value> done" line per swept value.
# NOTE(review): data_list is presumably the list of processed datasets - confirm in core_exp.
calib_results_dict, data_list = cx.run_exp(
    params["exp_key"],
    params["exp_values"],
    params,
)

# Persist the raw results under this run's experiment name.
ct.save_results(calib_results_dict, params["exp_name"])

exp_param cm1 done
exp_param datatrieve done


In [4]:
# Build the result tables, which are indexed by metric name (e.g. tables["brier"]).
# With mean_and_rank=True the displayed table carries "Mean" and "Rank"
# summary rows below the per-dataset rows.
tables = cal.mean_and_ranking_table(
    calib_results_dict,
    params["metrics"],
    params["calib_methods"],
    data_list,
    mean_and_rank=True,
    std=True,
)

# Export the tables to CSV under this run's experiment name.
ct.save_metrics_to_csv(tables, params["metrics"], params["exp_name"])

In [9]:
# Display the Brier-score table: one row per dataset plus the "Mean" and "Rank" rows.
# NOTE(review): this cell has execution count 9 while its neighbours are 4 and 5,
# i.e. it was run out of order - restart the kernel and run all cells before sharing.
tables["brier"]

Unnamed: 0_level_0,RF_opt,Platt,ISO,Beta,VA,CT,PPA,Shaker
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
cm1,0.157071,0.086856,0.092439,0.087311,0.090649,0.094908,0.095265,0.084664
datatrieve,0.153705,0.080371,0.080171,0.082509,0.090796,0.082831,0.090372,0.075995
Mean,0.155388,0.083613,0.086305,0.08491,0.090723,0.08887,0.092818,0.080329
Rank,8.0,2.5,3.5,3.5,5.5,5.5,6.5,1.0


In [5]:
# Run the per-metric statistics (the recorded output reports a Friedman test
# statistic and p-value for each metric).
# NOTE(review): the recorded output prints "The differences between groups are
# significant." for every metric although all reported p-values exceed 0.05
# (e.g. 0.222 for acc) - check the significance threshold/branch inside
# ct.res_statistics before trusting that message. Only two datasets are in this run.
ct.res_statistics(
    tables,
    params["metrics"],
    f"results/{params['exp_name']}",
    colors=params["calib_method_colors"],
)

metric acc
Friedman Test Statistic: 9.445783132530115
P-value: 0.22221445119642474
The differences between groups are significant.
metric brier
Friedman Test Statistic: 12.166666666666657
P-value: 0.09520945061443753
The differences between groups are significant.
metric ece
Friedman Test Statistic: 8.833333333333329
P-value: 0.26484461657355635
The differences between groups are significant.
metric logloss
Friedman Test Statistic: 13.5
P-value: 0.06082339471022093
The differences between groups are significant.
metric time
Friedman Test Statistic: 11.166666666666657
P-value: 0.1315125183368965
The differences between groups are significant.


In [6]:
# Export the same metric tables as LaTeX under this run's experiment name.
ct.save_metrics_to_latex(
    tables,
    params["metrics"],
    params["exp_name"],
)

In [7]:
# Inspect the raw result keys; in the recorded output they follow the pattern
# "<dataset>_<method>_<metric>", e.g. 'cm1_RF_opt_runtime' or 'cm1_Platt_brier'.
# The full listing is long - consider showing only a slice before sharing the notebook.
calib_results_dict.keys()

dict_keys(['cm1_RF_opt_runtime', 'cm1_Platt_runtime', 'cm1_ISO_runtime', 'cm1_PPA_runtime', 'cm1_Shaker_runtime', 'cm1_VA_runtime', 'cm1_Beta_runtime', 'cm1_CT_runtime', 'cm1_RF_opt_time', 'cm1_Platt_time', 'cm1_ISO_time', 'cm1_Beta_time', 'cm1_VA_time', 'cm1_CT_time', 'cm1_PPA_time', 'cm1_Shaker_time', 'cm1_RF_opt_acc', 'cm1_Platt_acc', 'cm1_ISO_acc', 'cm1_Beta_acc', 'cm1_VA_acc', 'cm1_CT_acc', 'cm1_PPA_acc', 'cm1_Shaker_acc', 'cm1_RF_opt_ece', 'cm1_Platt_ece', 'cm1_ISO_ece', 'cm1_Beta_ece', 'cm1_VA_ece', 'cm1_CT_ece', 'cm1_PPA_ece', 'cm1_Shaker_ece', 'cm1_RF_opt_brier', 'cm1_Platt_brier', 'cm1_ISO_brier', 'cm1_Beta_brier', 'cm1_VA_brier', 'cm1_CT_brier', 'cm1_PPA_brier', 'cm1_Shaker_brier', 'cm1_RF_opt_logloss', 'cm1_Platt_logloss', 'cm1_ISO_logloss', 'cm1_Beta_logloss', 'cm1_VA_logloss', 'cm1_CT_logloss', 'cm1_PPA_logloss', 'cm1_Shaker_logloss', 'datatrieve_RF_opt_runtime', 'datatrieve_Platt_runtime', 'datatrieve_ISO_runtime', 'datatrieve_PPA_runtime', 'datatrieve_Shaker_runtime', '