In [1]:
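# How changing the dataset sample size affects the calibration methods
# Fix the training dataset size and vary the number of calibration-set samples - the best method is the one that reaches the best calibration with the least data
# NOTE(review): the params below sweep "data_name" over synthetic_fx1..synthetic_fx30 (exp_name "Synthetic_data_ECEvsBS"), not the calibration-set size - confirm this header still describes the notebook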

# imports
import sys
import numpy as np
import pandas as pd
# The two parent directories are appended to sys.path BEFORE the core_* imports below;
# presumably that is where core_exp / core_calib / core_tools live - keep this order.
sys.path.append('../../') # to access the files in higher directories
sys.path.append('../') # to access the files in higher directories
# Raise pandas' display limits (rows, columns, line width) for printed frames.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Project-local modules (resolved through the sys.path entries added above).
import core_exp as cx
import core_calib as cal
import core_tools as ct

import warnings
# NOTE(review): DataConversionWarning is imported but not referenced in the cells visible here.
from sklearn.exceptions import DataConversionWarning

# Silence warnings for the rest of the session.
# The first filter only matches warnings raised from modules whose name starts with "sklearn".
warnings.filterwarnings("ignore", module="sklearn")
# The next three filters are NOT limited to scikit-learn: they ignore these categories from every module.
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
# Add more categories if needed



In [2]:
# Single configuration dictionary for the whole experiment. It is passed to
# cx.run_exp and indexed by the later cells (metrics, calib_methods, exp_name, ...).
params = {
    # ---- experiment setup ----
    "seed": 0,
    "runs": 5,
    "exp_name": ct.generate_readable_short_id("Synthetic_data_ECEvsBS"),  # "main_run5_cv5_21data_100trees_40opt_fast",
    "path": "../../",
    "split": "CV",  # options: CV, random_split
    "cv_folds": 10,
    "plot": False,

    # One colour per calibration method (handed to ct.res_statistics later on).
    "calib_method_colors": dict(
        RF_d="blue",
        RF_opt="orange",
        RF_large="red",
        Platt="Brown",
        ISO="purple",
        Beta="magenta",
        VA="gray",
        CT="slategray",
        PPA="olive",
        Rank="silver",
    ),

    # Methods under comparison, listed in the same order as the colour map above.
    "calib_methods": [
        "RF_d",
        "RF_opt",
        "RF_large",
        "Platt",
        "ISO",
        "Beta",
        "VA",
        "CT",
        "PPA",
        "Rank",
    ],

    "metrics": ["acc", "brier", "ece", "logloss", "tce_mse", "time"],

    "plot_data": False,
    "data_size": 1000,
    "n_features": 10,

    # ---- calibration parameters ----
    "bin_strategy": "uniform",
    "ece_bins": 20,
    "boot_size": 1000,
    "boot_count": 5,

    # ---- random-forest hyper-parameter optimisation ----
    "hyper_opt": True,
    "opt_cv": 5,
    "opt_n_iter": 50,
    "opt_top_K": 5,
    "search_space": {
        "n_estimators": [10],
        "max_depth": list(range(2, 100)) + [None],  # alternative: [None, 5, 10, 15, 20, 30, 50]
        "criterion": ["gini", "entropy"],
        "max_features": ["sqrt", "log2", None],
        "min_samples_split": list(range(2, 11)),
        "min_samples_leaf": list(range(1, 11)),
        "class_weight": [None, "balanced", "balanced_subsample"],  # weight for each class
        "oob_score": [False],
        "laplace": [0, 1],
    },

    "oob": False,
    "laplace": 1,  # if set to 1, OOB must be false
    "curt_v": list(range(1, 100)),

    # ---- swept parameter: one experiment per value of params[exp_key] ----
    "exp_key": "data_name",
    # synthetic_fx1 ... synthetic_fx30, in ascending order
    "exp_values": [f"synthetic_fx{idx}" for idx in range(1, 31)],
}

# Hand the finished configuration to the project helper.
ct.save_params(params)

In [3]:
# Run the experiment: cx.run_exp receives the swept key ("data_name"), the list of
# values to sweep and the full params dict, and returns the results dict plus data_list.
# The recorded output below shows one "exp_param <value> done" line per swept value.
calib_results_dict, data_list = cx.run_exp(params["exp_key"], params["exp_values"], params)
# Store the raw results under this run's experiment name.
ct.save_results(calib_results_dict, params['exp_name'])

exp_param synthetic_fx1 done
exp_param synthetic_fx2 done
exp_param synthetic_fx3 done
exp_param synthetic_fx4 done
exp_param synthetic_fx5 done
exp_param synthetic_fx6 done
exp_param synthetic_fx7 done
exp_param synthetic_fx8 done
exp_param synthetic_fx9 done
exp_param synthetic_fx10 done
exp_param synthetic_fx11 done
exp_param synthetic_fx12 done
exp_param synthetic_fx13 done
exp_param synthetic_fx14 done
exp_param synthetic_fx15 done
exp_param synthetic_fx16 done
exp_param synthetic_fx17 done
exp_param synthetic_fx18 done
exp_param synthetic_fx19 done
exp_param synthetic_fx20 done
exp_param synthetic_fx21 done
exp_param synthetic_fx22 done
exp_param synthetic_fx23 done
exp_param synthetic_fx24 done
exp_param synthetic_fx25 done
exp_param synthetic_fx26 done
exp_param synthetic_fx27 done
exp_param synthetic_fx28 done
exp_param synthetic_fx29 done
exp_param synthetic_fx30 done


In [4]:
# Build the summary tables from the raw results, for the configured metrics and
# calibration methods over data_list.
# NOTE(review): mean_and_rank=True / std=True presumably add mean, rank and
# standard-deviation columns - confirm against core_calib.mean_and_ranking_table.
tables = cal.mean_and_ranking_table(calib_results_dict, 
                                    params["metrics"], 
                                    params["calib_methods"], 
                                    data_list, 
                                    mean_and_rank=True, 
                                    std=True)
# Export the tables as CSV, keyed by metric and experiment name.
ct.save_metrics_to_csv(tables, params["metrics"], params['exp_name'])

In [5]:
# Statistical comparison of the methods for every metric. The recorded output below
# shows, per metric, a Friedman test statistic, its p-value and a significance verdict.
# NOTE(review): the third argument looks like an output location under results/<exp_name>,
# and colors is the per-method colour map - confirm against core_tools.res_statistics.
ct.res_statistics(tables, params["metrics"], f"results/{params['exp_name']}", colors=params["calib_method_colors"])

metric acc
Friedman Test Statistic: 163.4924165824064
P-value: 1.39581452931678e-30
The differences between groups are significant.
metric brier
Friedman Test Statistic: 219.4181818181819
P-value: 2.7739641804411955e-42
The differences between groups are significant.
metric ece
Friedman Test Statistic: 168.6400000000001
P-value: 1.1846416782860017e-31
The differences between groups are significant.
metric logloss
Friedman Test Statistic: 221.85454545454536
P-value: 8.524757559917429e-43
The differences between groups are significant.
metric tce_mse
Friedman Test Statistic: 238.51636363636362
P-value: 2.6407004379058386e-46
The differences between groups are significant.
metric time
Friedman Test Statistic: 258.8072727272727
P-value: 1.376249178713831e-50
The differences between groups are significant.


In [6]:
# Export the same summary tables in LaTeX form, keyed by metric and experiment name.
ct.save_metrics_to_latex(tables, params["metrics"], params['exp_name'])