In [14]:
# How changing the dataset sample size effects the calibration methods
# Fix training dataset size and change the calib set samples - best method is one that gets max calib with least data

In [15]:
# imports
import sys
import pandas as pd
import numpy as np
sys.path.append('../../') # to access the files in higher directories
sys.path.append('../') # to access the files in higher directories
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import Data.data_provider as dp
import core as cal
from estimators.IR_RF_estimator import IR_RF

In [16]:
# params
calib_methods = ["RF", "Platt" , "ISO", "Rank", "CRF", "VA", "Beta", "Elkan", "tlr"]
metrics = ["acc", "auc", "brier", "ece", "logloss"]
data_list = ["spambase", "climate", "QSAR", "bank", "climate", "parkinsons", "vertebral", "ionosphere", "diabetes", "breast", "blod"]
# data_list = ["spambase", "climate"]

params = {
    "runs": 50,
    "n_estimators": 10,
    "oob": False,
    "test_split": 0.3,
    "calib_split": 0.5
}

In [17]:
calib_results_dict = {}

for data_name in data_list:

    # Data
    X, y = dp.load_data(data_name, "../../")
    
    data_dict = {} # results for each data set will be saved in here.
    for seed in range(params["runs"]): # running the same dataset multiple times
        # split the data
        data = cal.split_train_calib_test(data_name, X, y, params["test_split"], params["calib_split"], seed)

        # train model
        irrf = IR_RF(n_estimators=params["n_estimators"], oob_score=params["oob"], random_state=seed)
        irrf.fit(data["x_train"], data["y_train"])

        # calibration
        res = cal.calibration(irrf, data, calib_methods, metrics) # res is a dict with all the metrics results as well as RF probs and every calibration method decision for every test data point
        data_dict = cal.update_runs(data_dict, res) # calib results for every run for the same dataset is aggregated in data_dict (ex. acc of every run as an array)
    calib_results_dict.update(data_dict) # merge results of all datasets together
    
tables = cal.mean_and_ranking_table(calib_results_dict, metrics, calib_methods, data_list, mean_and_rank=False)

In [20]:
tables = cal.mean_and_ranking_table(calib_results_dict, metrics, calib_methods, data_list, mean_and_rank=True)

In [27]:
tables["brier"]

Unnamed: 0_level_0,RF,Platt,ISO,Rank,CRF,VA,Beta,Elkan,tlr
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
spambase,0.057521,0.050282,0.050778,0.053053,0.052633,0.050856,0.050744,0.057521,0.057811
climate,0.065902,0.065348,0.06704,0.089403,0.065331,0.065894,0.065269,0.065902,0.080928
QSAR,0.118648,0.117039,0.119126,0.123763,0.117443,0.119113,0.11698,0.118648,0.126369
bank,0.018635,0.01438,0.014587,0.029375,0.01539,0.015777,0.014548,0.018635,0.016045
climate,0.065902,0.065348,0.06704,0.089403,0.065331,0.065894,0.065269,0.065902,0.080928
parkinsons,0.115106,0.113709,0.116794,0.162676,0.115121,0.11939,0.113658,0.115106,0.142414
vertebral,0.125767,0.127597,0.133597,0.146293,0.125574,0.134638,0.127561,0.125774,0.132998
ionosphere,0.079163,0.067671,0.069951,0.113383,0.071535,0.072217,0.068081,0.079079,0.074572
diabetes,0.176628,0.17686,0.181247,0.177887,0.176833,0.181449,0.176869,0.176622,0.193212
breast,0.041661,0.040443,0.04307,0.063493,0.040895,0.044838,0.040796,0.041661,0.043156


In [19]:
# exp_dict = {} #pd.DataFrame()
# for metric in metrics:
#     exp_dict[metric] = pd.DataFrame()

# for exp_sample_size in params["calib_size"]:

#     calib_results_dict = {}
#     for data_name in data_list:
#         # Data
#         X, y = dp.load_data(data_name, "../../")
#         # X, y, tp = dp.make_classification_gaussian_with_true_prob(30, 4, 0)
#         # split to train calib test

#         data_dict = {}
#         for seed in range(params["runs"]):
#             # split the data
#             data = cal.split_train_calib_test(data_name, X, y, params["test_split"], params["calib_split"], seed)
#             # reset the calibration set size based on exp_sample_size percentage (for this experiment)
#             calib_size = int(exp_sample_size / 100 * len(data["x_calib"]))
#             for start_index in range(len(data["x_calib"]) - calib_size): # the for is to find a subset of calib data such that it contains all the class lables
#                 if len(np.unique(data["y_calib"][start_index : start_index+calib_size])) > 1: 
#                     data["x_calib"] = data["x_calib"][start_index : start_index+calib_size]
#                     data["y_calib"] = data["y_calib"][start_index : start_index+calib_size]
#                     break
#             # train model
#             irrf = IR_RF(n_estimators=params["n_estimators"], oob_score=params["oob"], random_state=seed)
#             irrf.fit(data["x_train"], data["y_train"])
#             # calibration
#             res = cal.calibration(irrf, data, calib_methods, metrics)
#             # print("run res\n", res)
#             data_dict = cal.update_runs(data_dict, res)

#         calib_results_dict.update(data_dict) # merge results of all datasets together

#     tables = cal.mean_and_ranking_table(calib_results_dict, metrics, calib_methods, data_list)
#     # print("tables", tables)
#     # exit()
#     exp_dict = cal.exp_mean_rank_through_time(exp_dict, tables, exp_sample_size, "rank", "Calibration sample size")

#     # calib_ranks = table.iloc[-1].to_dict()
#     # calib_ranks["Calibration sample size"] = exp_sample_size
#     # exp_dict = pd.concat([exp_dict, (pd.DataFrame([calib_ranks]))])
#     # print(f"exp_sample_size {exp_sample_size} done")
