In [1]:
# This is the code used for the paper results.
# How changing the dataset sample size affects the calibration methods.
# Fix the training dataset size and vary the number of calibration set samples - the best method is the one that reaches maximum calibration with the least data.

# imports
import sys
import os
import numpy as np
import pandas as pd
sys.path.append('../../') # to access the files in higher directories
sys.path.append('../') # to access the files in higher directories
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import core_exp as cx
import core_calib as cal
import core_tools as ct
import matplotlib.pyplot as plt


In [2]:
# Experiment configuration. The dict is persisted via ct.save_params at the end of
# this cell and is passed whole to cx.run_exp in the next cell. The keys are read by
# the project's core_* modules, which are not visible here, so notes on their meaning
# are hedged where the notebook itself does not establish them.
params = {
    # exp
    "seed": 0,
    "runs": 10,
    # exp_name is also used below to build the output folder "results/<exp_name>/..."
    "exp_name": ct.generate_readable_short_id("synthetic_mg_100tree"),
    "path": "../../",
    "split": "random_split", # options noted by the author: CV, random_split

    # NOTE(review): 0.0588 * 1700 (data_size) is ~100, so this presumably reserves
    # about 100 samples for testing - confirm how core_exp applies the fraction.
    "test_split": 0.0588,
    "calib_split": 0.5,
    
    "plot": True,
    
    # Calibration methods to evaluate; the same names key "calib_method_colors" below
    # and are used to select columns of the result tables when plotting.
    "calib_methods": [
                      "RF_d",
                      "RF_opt", 
                      "RF_large", 
                      "Platt", "ISO", "Beta", "VA",
                      "CT", "PPA",
                      "Rank"
                      ],

    # Line colour per calibration method (passed as `color=` to the pandas plot call).
    "calib_method_colors": {
        "RF_d": "blue", 
        "RF_opt": "orange", 
        "RF_large": "red",
        "Platt": "Brown", 
        "ISO": "purple", 
        "Beta": "magenta", 
        "VA": "gray",
        "CT": "slategray",
        "PPA": "olive", 
        "Rank": "silver"     
    },

    
    # Metric keys; the result tables produced later are indexed by these names.
    "metrics": ["acc", "brier", "ece", "logloss", "tce_kl", "tce_mse", "prob_ent"], 

    #data
    "data_name": "synthetic_mg",
    "plot_data": True,
    "data_size": 1700,
    "n_features": 2,

    # NOTE(review): "bais_accuracy" looks like a misspelling of "bias_accuracy"; it is
    # left as-is because the key is presumably looked up by this exact string elsewhere.
    "bais_accuracy": 0,
    
    # Min/max bounds for the per-class mean and covariance of the synthetic data
    # (presumably sampled within these ranges by the data generator - TODO confirm).
    "class1_mean_min":0, 
    "class1_mean_max":1,
    "class2_mean_min":1, 
    "class2_mean_max":3, 

    "class1_cov_min":1, 
    "class1_cov_max":2,
    "class2_cov_min":1, 
    "class2_cov_max":2, 

    # Alternative (disabled) setting: same means, but class 1 covariance in [4, 5].
    # "class1_mean_min":0, 
    # "class1_mean_max":1,
    # "class2_mean_min":1, 
    # "class2_mean_max":3, 

    # "class1_cov_min":4, 
    # "class1_cov_max":5,
    # "class2_cov_min":1, 
    # "class2_cov_max":2, 


    # calib param
    "bin_strategy": "uniform",
    "ece_bins": 20,
    "boot_size": 1000, 
    "boot_count": 5,

    # RF hyper opt
    "hyper_opt": True,
    "opt_cv":5, 
    "opt_n_iter":50,
    "opt_top_K": 5,
    # Candidate values per hyper-parameter; n_estimators is fixed at 100 and
    # oob_score at False. Ranges: max_depth 2..99, min_samples_split 2..10,
    # min_samples_leaf 1..10.
    "search_space": {
                    "n_estimators": [100],
                    "max_depth": np.arange(2, 100).tolist(),
                    "criterion": ["gini", "entropy"],
                    "max_features": ['sqrt', 'log2', None],
                    "min_samples_split": np.arange(2, 11).tolist(),
                    "min_samples_leaf":  np.arange(1, 11).tolist(),
                    # 'bootstrap': [True, False],                # Whether bootstrap samples are used
                    'class_weight': [None, 'balanced', 'balanced_subsample'],  # Weight for each class
                    "oob_score": [False],
                    },
    
    "oob": False,
    "laplace": 0, # if set to 1, OOB must be false
    "curt_v": np.arange(1, 100).tolist(), # integers 1..99

    # The swept variable and its values: np.arange(2, 100, 5) -> 2, 7, ..., 97 (20 values).
    # The plots label this axis "Calibration set size %", so these are presumably
    # percentages - TODO confirm against core_exp.
    "exp_key": "calib_size",
    "exp_values":  np.arange(2, 100, 5).tolist()
}

# Persist the configuration so the run can be traced back to its settings.
ct.save_params(params)

In [3]:
# Sweep params["exp_key"] over params["exp_values"]; run_exp hands back the
# per-setting calibration results together with the list of data settings.
exp_outputs = cx.run_exp(params["exp_key"], params["exp_values"], params)
calib_results_dict, data_list = exp_outputs

# Store the raw results under this experiment's name.
ct.save_results(calib_results_dict, params['exp_name'])

KeyboardInterrupt: 

In [None]:

# Methods to leave out of the plots. Empty means "plot everything"; add names from
# params["calib_methods"] here (e.g. "RF_d", "RF_opt", "RF_large", "Platt", "Beta",
# "CT", "PPA") to hide them.
excluded_from_plots = set()

# Independent list (not an alias of params["calib_methods"]), original order kept.
plot_calib_methods = [
    method for method in params["calib_methods"] if method not in excluded_from_plots
]
plot_calib_methods

['RF_d',
 'RF_opt',
 'RF_large',
 'Platt',
 'ISO',
 'Beta',
 'VA',
 'CT',
 'PPA',
 'Rank']

In [None]:
# Per-metric tables (mean_and_rank=False, std=True) used for the zoomed
# calibration-set-size plots written below.
tables = cal.mean_and_ranking_table(calib_results_dict, 
                                    params["metrics"], 
                                    params["calib_methods"], 
                                    data_list, 
                                    mean_and_rank=False, 
                                    std=True)

# Output folder for the zoomed figures; exist_ok avoids the check-then-create race.
path = f"results/{params['exp_name']}/calib_size_zoom"
os.makedirs(path, exist_ok=True)

# Axis label per metric. Metrics without an entry fall back to their raw key
# instead of silently reusing the label of the previously plotted metric.
metric_labels = {
    "acc": "ACC",
    "brier": "Brier",
    "ece": "ECE",
    "logloss": "LogLoss",
    "tce_kl": "TCE_KL",
    "tce_mse": "TCE",
}

# Y-axis zoom window per metric as (min, max); None leaves that bound autoscaled.
# Keyed by metric name so the limits cannot drift out of step with the order or
# length of params["metrics"]. Previously the limits were zipped by position with
# only six entries, which silently dropped the seventh metric ("prob_ent").
y_limits = {
    "acc": (None, None),
    "brier": (None, 0.17),
    "ece": (None, 0.1),
    "logloss": (None, 2),
    "tce_kl": (0.02, 0.15),
    "tce_mse": (None, 0.035),
}

# To disable zooming entirely, replace y_limits with an empty dict.

for metric in params["metrics"]:
    min_v, max_v = y_limits.get(metric, (None, None))

    # DataFrame.plot creates a fresh figure and returns its Axes; work on that
    # Axes explicitly rather than through the pyplot "current figure" state.
    ax = tables[metric][plot_calib_methods].plot(color=params["calib_method_colors"])
    ax.set_xlabel("Calibration set size %")
    ax.set_ylabel(metric_labels.get(metric, metric))
    ax.set_ylim(min_v, max_v)

    ax.figure.savefig(f"{path}/cs_{metric}.pdf", format='pdf', transparent=True)
    # Close each figure after saving so figures do not accumulate in memory.
    plt.close(ax.figure)


In [None]:
# Rebuild `tables`, this time with mean_and_rank=True; the statistics and LaTeX
# export cells below read this version.
ranking_options = {"mean_and_rank": True, "std": True}
tables = cal.mean_and_ranking_table(
    calib_results_dict,
    params["metrics"],
    params["calib_methods"],
    data_list,
    **ranking_options,
)


In [None]:
# Run the statistics helper for every metric (its captured output reports a
# Friedman test per metric), using the experiment's results folder and the
# per-method colour map.
stats_output_dir = f"results/{params['exp_name']}"
ct.res_statistics(
    tables,
    params["metrics"],
    stats_output_dir,
    colors=params["calib_method_colors"],
)

metric acc
Friedman Test Statistic: 152.68899666565636
P-value: 2.444477395975267e-28
The differences between groups are significant.
metric brier
Friedman Test Statistic: 160.11272727272728
P-value: 7.036656859831195e-30
The differences between groups are significant.
metric ece
Friedman Test Statistic: 152.07272727272732
P-value: 3.2805150507547615e-28
The differences between groups are significant.
metric logloss
Friedman Test Statistic: 165.09818181818184
P-value: 6.468633066334181e-31
The differences between groups are significant.
metric tce_kl
Friedman Test Statistic: 161.44363636363641
P-value: 3.722087046954596e-30
The differences between groups are significant.
metric tce_mse
Friedman Test Statistic: 159.75272727272738
P-value: 8.359136966892309e-30
The differences between groups are significant.
metric prob_ent
Friedman Test Statistic: 178.07809640024416
P-value: 1.2762433665033538e-33
The differences between groups are significant.


In [None]:
# Export the metric tables for this experiment via the project's LaTeX helper.
latex_metrics = params["metrics"]
latex_exp_name = params['exp_name']
ct.save_metrics_to_latex(tables, latex_metrics, latex_exp_name)

  tables[metric].round(5).to_latex(f"{path}/{metric}.txt")
  tables[metric].round(5).to_latex(f"{path}/{metric}.txt")
  tables[metric].round(5).to_latex(f"{path}/{metric}.txt")
  tables[metric].round(5).to_latex(f"{path}/{metric}.txt")
  tables[metric].round(5).to_latex(f"{path}/{metric}.txt")
  tables[metric].round(5).to_latex(f"{path}/{metric}.txt")
  tables[metric].round(5).to_latex(f"{path}/{metric}.txt")
  tables[metric+ "_std"].round(5).to_latex(f"{path}/{metric}_std.txt")
  tables[metric+ "_std"].round(5).to_latex(f"{path}/{metric}_std.txt")
  tables[metric+ "_std"].round(5).to_latex(f"{path}/{metric}_std.txt")
  tables[metric+ "_std"].round(5).to_latex(f"{path}/{metric}_std.txt")
  tables[metric+ "_std"].round(5).to_latex(f"{path}/{metric}_std.txt")
  tables[metric+ "_std"].round(5).to_latex(f"{path}/{metric}_std.txt")
  tables[metric+ "_std"].round(5).to_latex(f"{path}/{metric}_std.txt")
