In [1]:
import numpy as np
import pandas as pd
import joblib
from functools import partial
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn import preprocessing
from sklearn import impute

import optuna
from optuna.visualization import (
    plot_contour
    , plot_edf
    , plot_intermediate_values
    , plot_optimization_history
    , plot_parallel_coordinate
    , plot_param_importances
    , plot_slice
)

# custom modules
import sys
sys.path.append("../utils")

from preproc import load_data,scale_data
#from calibration1 import calibrate_probs,optimize_calibration
from calibration2 import calibrate_probs,optimize_calibration

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def balanced_logloss_(y_pred, y_true, eps=1e-7):
    """Balanced binary log-loss: the mean of the two per-class average log-losses.

    Parameters
    ----------
    y_pred : array of predicted probabilities for the positive class.
    y_true : numpy array of 0/1 labels (same length as ``y_pred``).
    eps : clipping bound for the probabilities; it is also added to each
        class count so that a class with no samples does not divide by zero.

    Returns
    -------
    float, ``(loss_class0 + loss_class1) / 2``.
    """
    # keep probabilities strictly inside (0, 1) so the logs stay finite
    prob_pos = np.clip(y_pred, eps, 1-eps)
    prob_neg = 1-prob_pos

    count_neg = np.sum(1-y_true)
    count_pos = np.sum(y_true)

    loss_neg = - np.sum((1-y_true) * np.log(prob_neg)) / (count_neg+eps)
    loss_pos = - np.sum(y_true * np.log(prob_pos)) / (count_pos+eps)
    return (loss_neg + loss_pos)/2


def compute_overall_metric(oof_dfs:list) -> float:
    """Average the balanced log-loss over a list of prediction dataframes.

    Each element of ``oof_dfs`` must expose a ``pred_proba`` column (predicted
    probability) and a ``Class`` column (true label). One score is computed
    per dataframe and the plain mean of those scores is returned.
    """
    per_frame_scores = []
    for oof_frame in oof_dfs:
        score = balanced_logloss_(oof_frame.pred_proba.values, oof_frame.Class.values)
        per_frame_scores.append(score)
    return np.mean(per_frame_scores)


def calibrate_oof(oof_dfs, calib_params):
    """Return calibrated copies of a list of prediction dataframes.

    Every dataframe is deep-copied, so the inputs are left untouched, and the
    copy's ``pred_proba`` column is replaced with the output of
    ``calibrate_probs(pred_proba, **calib_params)``.

    Parameters
    ----------
    oof_dfs : iterable of dataframes with a ``pred_proba`` column.
    calib_params : dict of keyword arguments forwarded to ``calibrate_probs``.

    Returns
    -------
    list of dataframes, in the same order as ``oof_dfs``.
    """
    def _calibrated_copy(frame):
        # work on a deep copy so the raw predictions stay available
        result = frame.copy(deep=True)
        result["pred_proba"] = calibrate_probs(
            result.pred_proba.values,
            **calib_params
        )
        return result

    return [_calibrated_copy(frame) for frame in oof_dfs]

In [3]:
# notebook parameter

CALIBRATE_FIRST = True

***
# load data and predictions

### 1. `lgbm-gbrt`

In [4]:
calib_params_m01 = joblib.load("../data/iarc-lgbm-gbrt-bagging-balanced/calib_params.pkl")
oof_dfs_m01 = joblib.load("../data/iarc-lgbm-gbrt-bagging-balanced/oof_dataframes.pkl")

In [5]:
compute_overall_metric(oof_dfs_m01)

0.24033928714260183

In [6]:
oof_dfs_calib_m01 = calibrate_oof(oof_dfs_m01, calib_params_m01)
compute_overall_metric(oof_dfs_calib_m01)

0.22139398049564257

### 2. `lgbm-linear`

In [7]:
calib_params_m02 = joblib.load("../data/iarc-lgbm-linear-bagging-balanced/calib_params.pkl")
oof_dfs_m02 = joblib.load("../data/iarc-lgbm-linear-bagging-balanced/oof_dataframes.pkl")

In [8]:
compute_overall_metric(oof_dfs_m02)

0.23776284612591142

In [9]:
oof_dfs_calib_m02 = calibrate_oof(oof_dfs_m02, calib_params_m02)
compute_overall_metric(oof_dfs_calib_m02)

0.21870748528950434

### 3. `catboost`

In [10]:
calib_params_m03 = joblib.load("../data/iarc-catboost-weight-balanced/calib_params.pkl")
oof_dfs_m03 = joblib.load("../data/iarc-catboost-weight-balanced/oof_dataframes.pkl")

In [11]:
compute_overall_metric(oof_dfs_m03)

0.2633244023340282

In [12]:
oof_dfs_calib_m03 = calibrate_oof(oof_dfs_m03, calib_params_m03)
compute_overall_metric(oof_dfs_calib_m03)

0.22835158860669114

### 4. `tabpfn`

In [13]:
calib_params_m04 = joblib.load("../data/iarc-tabpfn/calib_params.pkl")
oof_dfs_m04 = joblib.load("../data/iarc-tabpfn/oof_dataframes.pkl")

In [14]:
compute_overall_metric(oof_dfs_m04)

0.3900050155836857

In [15]:
oof_dfs_calib_m04 = calibrate_oof(oof_dfs_m04, calib_params_m04)
compute_overall_metric(oof_dfs_calib_m04)

0.28839025101520327

### 5. `multiout-mlp`

In [16]:
calib_params_m05 = joblib.load("../data/iarc-multiout-tf-mlp/calib_params.pkl")
oof_dfs_m05 = joblib.load("../data/iarc-multiout-tf-mlp/oof_dataframes.pkl")

In [17]:
compute_overall_metric(oof_dfs_m05)

0.30563847176445524

In [18]:
oof_dfs_calib_m05 = calibrate_oof(oof_dfs_m05, calib_params_m05)
compute_overall_metric(oof_dfs_calib_m05)

0.2904142458122567

### 6. `xgb-gblinear`

In [19]:
calib_params_m06 = joblib.load("../data/iarc-xgb-gblinear/calib_params.pkl")
oof_dfs_m06 = joblib.load("../data/iarc-xgb-gblinear/oof_dataframes.pkl")

In [20]:
compute_overall_metric(oof_dfs_m06)

0.37894304572238047

In [21]:
oof_dfs_calib_m06 = calibrate_oof(oof_dfs_m06, calib_params_m06)
compute_overall_metric(oof_dfs_calib_m06)

0.36502436920847003

### 7. `xgb-gbtree`

In [22]:
calib_params_m07 = joblib.load("../data/iarc-xgb-gbtree/calib_params.pkl")
oof_dfs_m07 = joblib.load("../data/iarc-xgb-gbtree/oof_dataframes.pkl")

In [23]:
compute_overall_metric(oof_dfs_m07)

0.27239995042577647

In [24]:
oof_dfs_calib_m07 = calibrate_oof(oof_dfs_m07, calib_params_m07)
compute_overall_metric(oof_dfs_calib_m07)

0.22564184381208174

***
## prepare data

In [25]:
# Select which out-of-fold predictions (one entry per base model) feed the
# stacker: the calibrated ones when CALIBRATE_FIRST is set, the raw ones otherwise.

if CALIBRATE_FIRST:
    print("using calibrated")
    oof_all = [
        oof_dfs_calib_m01,
        oof_dfs_calib_m02,
        oof_dfs_calib_m03,
        oof_dfs_calib_m04,
        oof_dfs_calib_m05,
        oof_dfs_calib_m06,
        oof_dfs_calib_m07,
    ]
else:
    print("using non calibrated")
    oof_all = [
        oof_dfs_m01,
        oof_dfs_m02,
        oof_dfs_m03,
        oof_dfs_m04,
        oof_dfs_m05,
        oof_dfs_m06,
        oof_dfs_m07,
    ]

# sanity check: overall metric of each selected base model, in list order
for _oof_dfs in oof_all:
    print(compute_overall_metric(_oof_dfs))

using calibrated
0.22139398049564257
0.21870748528950434
0.22835158860669114
0.28839025101520327
0.2904142458122567
0.36502436920847003
0.22564184381208174


In [26]:
# Load the raw competition files and build the preprocessed train/test frames.
input_path = "../data/raw"

train = pd.read_csv(f"{input_path}/train.csv")
test  = pd.read_csv(f"{input_path}/test.csv")
greeks = pd.read_csv(f"{input_path}/greeks.csv")

# strip stray whitespace from the column names
train.columns = [col.strip() for col in train.columns]
test.columns = [col.strip() for col in test.columns]

# available features: every train column except the first and the last one
# (presumably the id and the target -- verify against the csv header).
# Captured BEFORE the greeks dummies are appended, so those are not features.
input_cols = train.columns[1:-1]
categ_cols = ["EJ"]

# we extend train with dummies from greeks
# NOTE(review): the assignment aligns on the row index, so this assumes that
# greeks.csv and train.csv list the same samples in the same order -- confirm.
dummies = pd.get_dummies(greeks[["Alpha","Beta","Gamma","Delta"]])
train[dummies.columns] = dummies

# encode the categorical feature; the encoder is fitted on train only, so a
# category that appears only in test would make transform() fail
encoder = preprocessing.LabelEncoder().fit(train["EJ"])
train["EJ"] = encoder.transform(train["EJ"]).astype(int)
test["EJ"] = encoder.transform(test["EJ"]).astype(int)

# impute missing values with the per-column median of the full train set
imputer = impute.SimpleImputer(strategy="median")
imputer.fit(train[input_cols])
train[input_cols] = imputer.transform(train[input_cols])
test[input_cols] = imputer.transform(test[input_cols])

# scale data; the scaler is fitted on the full train set (before any CV split)
scaler = preprocessing.MaxAbsScaler()
scaler.fit(train[input_cols])
train[input_cols] = scaler.transform(train[input_cols])
test[input_cols] = scaler.transform(test[input_cols])

In [27]:
repeated_cv_split = joblib.load("../data/iarc-data-split/repeated_5fold_cv_split_4tuning.pkl")
print(len(repeated_cv_split))

# number of repetitions to use
REPETITIONS = 10

10


In [28]:
# Class-imbalance constants used later when building the LightGBM parameters.

# ratio of class-0 share to class-1 share (looked up by label 0 / 1)
pct = train.Class.value_counts(normalize=True)
scale_pos_weight = pct[0]/pct[1]
print("scale_pos_weight:", scale_pos_weight)

# ratio of class-1 count to class-0 count: the fraction of class-0 rows that
# matches the number of class-1 rows
cnt = train.Class.value_counts(normalize=False)
neg_bagging_fraction = cnt[1]/cnt[0]
print("neg_bagging_fraction:", neg_bagging_fraction)

scale_pos_weight: 4.712962962962963
neg_bagging_fraction: 0.21218074656188604


***
## training

In [29]:
def include_oof_preds(train, input_cols, oof_all, repeat_nbr):
    """Append the base models' out-of-fold predictions to ``train`` as features.

    For the i-th model (1-based) the ``pred_proba`` column of
    ``oof_all[i-1][repeat_nbr]`` is joined to ``train`` on ``Id`` under the
    name ``pm{i}``, and that name is appended to the feature list.

    Parameters
    ----------
    train : dataframe with an ``Id`` column. It is copied, never modified.
    input_cols : iterable of feature names (list, tuple, ``pd.Index``, ...).
    oof_all : sequence with one entry per base model; each entry is indexable
        by ``repeat_nbr`` and yields a dataframe with ``Id`` and ``pred_proba``.
    repeat_nbr : index/key of the CV repetition whose predictions to use.

    Returns
    -------
    (dataframe, list) : the extended copy of ``train`` (same rows, same order,
    fresh 0..n-1 index as produced by ``pd.merge``) and the extended feature list.

    Raises
    ------
    ValueError
        If a prediction frame does not cover every ``Id`` of ``train`` exactly
        once (``pandas.errors.MergeError``, a ``ValueError`` subclass, is raised
        for duplicated ids). Without this check rows would be silently dropped
        or duplicated and positional fold indices would no longer line up.
    """
    merged = train.copy()
    # list() accepts any iterable; the former .tolist() required a pandas/numpy object
    feature_cols = list(input_cols)
    n_rows = len(merged)

    for model_nbr, model_oofs in enumerate(oof_all, start=1):
        pred_col = f"pm{model_nbr}"
        oof_preds = (
            model_oofs[repeat_nbr][["Id", "pred_proba"]]
            .rename(columns={"pred_proba": pred_col})
        )
        # explicit key + cardinality check instead of relying on the implicit
        # "all common columns" join; an inner join keeps the left row order
        merged = pd.merge(merged, oof_preds, on="Id", how="inner", validate="one_to_one")
        if len(merged) != n_rows:
            raise ValueError(
                f"OOF predictions of model {model_nbr} (repeat {repeat_nbr}) cover "
                f"{len(merged)} of {n_rows} train rows"
            )
        feature_cols.append(pred_col)

    return merged, feature_cols

In [30]:
def balanced_logloss_(y_pred, y_true, eps=1e-15):
    """Balanced binary log-loss: the mean of the two per-class average log-losses.

    Parameters
    ----------
    y_pred : array-like of predicted probabilities for the positive class.
    y_true : array-like of 0/1 labels (same length as ``y_pred``).
    eps : clipping bound applied to the probabilities so the logs stay finite.
        The default keeps the previously hard-coded value.

    Returns
    -------
    float, ``(loss_class0 + loss_class1) / 2``. A class with no samples
    contributes a loss of 0 instead of turning the result into nan (0/0),
    which matches the eps-guarded variant defined earlier in this notebook.
    """
    y_true = np.asarray(y_true)
    n0 = np.sum(1-y_true)
    n1 = np.sum(y_true)
    p1 = np.clip(y_pred, eps, 1-eps)
    p0 = 1-p1

    # guard the divisions: an absent class has a zero numerator as well
    log_loss0 = 0.0
    if n0 > 0:
        log_loss0 = - np.sum((1-y_true) * np.log(p0)) / n0
    log_loss1 = 0.0
    if n1 > 0:
        log_loss1 = - np.sum(y_true * np.log(p1)) / n1
    return (log_loss0 + log_loss1)/2

def balanced_logloss(y_pred: np.ndarray, data: lgb.Dataset):
    """Wrap ``balanced_logloss_`` as a LightGBM-style custom evaluation function.

    Reads the labels from ``data`` and returns the triple
    ``(metric name, metric value, False)``; the trailing ``False`` is the
    "higher is better" flag of LightGBM's feval convention, i.e. the metric
    is to be minimised.
    """
    labels = data.get_label()
    score = balanced_logloss_(y_pred, labels)
    return 'balanced_logloss', score, False

In [31]:
# How the class imbalance is handled:
#   "weight"  -> up-weight the positive class through scale_pos_weight
#   "bagging" -> keep every positive row and subsample the negative rows
balance_method = "bagging"

# Fixed LightGBM parameters shared by every trial; the tuned ones are merged on top.
DEFAULT_PARAMS = {
    'objective': 'binary',
    'metric': 'None',
    'learning_rate': 0.005,
    'bagging_freq': 1,
    'seed': 2112,
    'first_metric_only': False,
    'feature_pre_filter': False,
    'verbosity': -1,
    'linear_tree':True,
    'n_jobs':8,
}

if balance_method == "weight":
    DEFAULT_PARAMS["scale_pos_weight"] = scale_pos_weight
elif balance_method == "bagging":
    DEFAULT_PARAMS["bagging_freq"] = 1
    DEFAULT_PARAMS["pos_bagging_fraction"] = 1
    DEFAULT_PARAMS["neg_bagging_fraction"] = neg_bagging_fraction
else:
    # fail fast: continuing would silently tune a model with no imbalance handling
    raise ValueError(
        f"Unknown balance_method: {balance_method!r} (expected 'weight' or 'bagging')"
    )

display(DEFAULT_PARAMS)

{'objective': 'binary',
 'metric': 'None',
 'learning_rate': 0.005,
 'bagging_freq': 1,
 'seed': 2112,
 'first_metric_only': False,
 'feature_pre_filter': False,
 'verbosity': -1,
 'linear_tree': True,
 'n_jobs': 8,
 'pos_bagging_fraction': 1,
 'neg_bagging_fraction': 0.21218074656188604}

In [32]:
def train_validate(
        dataframe,
        input_cols, 
        model_params,
        repeated_cv_split,
        n_repetitions=REPETITIONS,
        verbose=False,
        oof_dfs=None,
    ):
    """Repeated-CV evaluation of the LightGBM stacker.

    For every repetition the base models' out-of-fold predictions of that
    repetition are appended to ``dataframe`` as extra features; then one model
    is trained per fold and scored on the fold's validation rows with
    ``balanced_logloss_``.

    Parameters
    ----------
    dataframe : training frame holding ``Id``, ``Class`` and the ``input_cols``.
        (Previously this argument was ignored in favour of the global ``train``.)
    input_cols : iterable of base feature names.
    model_params : dict of LightGBM parameters; it is copied, not modified.
        An optional ``"num_iterations"`` entry is taken out and used as the
        number of boosting rounds (1000 when absent).
    repeated_cv_split : mapping ``"repeat_{k}"`` -> list of dicts with the keys
        ``"fold"``, ``"train_idx"`` and ``"valid_idx"``.
    n_repetitions : number of repetitions to run (``repeat_0`` .. ``repeat_{n-1}``).
    verbose : print progress per repetition and fold.
    oof_dfs : per-model out-of-fold predictions as expected by
        ``include_oof_preds``; defaults to the notebook-level ``oof_all``.

    Returns
    -------
    (float, float) : mean and standard deviation of the per-fold metric over
    all repetitions and folds.
    """
    if oof_dfs is None:
        # keep the historical behaviour for callers that do not pass predictions
        oof_dfs = oof_all

    metrics = list()
    model_params = dict(model_params)
    num_iterations = model_params.pop("num_iterations", 1000)

    for repeat in range(n_repetitions):
        if verbose:
            print(f"REPEAT NUMBER: {repeat+1}/{n_repetitions}")
        cv_split = repeated_cv_split[f"repeat_{repeat}"]
        n_folds = len(cv_split)

        # the stacked frame depends only on the repetition, so build it once
        # per repeat instead of once per fold
        stacked_df, stacked_cols = include_oof_preds(dataframe, input_cols, oof_dfs, repeat)

        for split in cv_split:
            fold = split["fold"]
            train_idx = split["train_idx"]
            valid_idx = split["valid_idx"]
            if verbose:
                print(f"training model for fold: {fold+1}/{n_folds}")

            # label-based selection on the merged frame's index
            # NOTE(review): assumes the stored indices are row positions of the
            # original training frame -- confirm against the split file
            train_df = stacked_df.loc[train_idx,:].reset_index(drop=True)
            valid_df = stacked_df.loc[valid_idx,:].reset_index(drop=True)

            train_dset = lgb.Dataset(
                data=train_df.loc[:,stacked_cols],
                label=train_df.loc[:,"Class"].values,
                free_raw_data=False
            )
            model = lgb.train(
                params=model_params,
                train_set=train_dset,
                num_boost_round=num_iterations,
            )

            y_pred = model.predict(valid_df.loc[:,stacked_cols])
            metrics.append( balanced_logloss_(y_pred, valid_df.loc[:,"Class"].values) )

    return np.mean(metrics), np.std(metrics)


def objective(trial):
    """Optuna objective: sample LightGBM hyper-parameters and return the CV score.

    The sampled values are merged on top of ``DEFAULT_PARAMS`` and evaluated
    with ``train_validate`` on the notebook-level ``train`` / ``input_cols`` /
    ``repeated_cv_split``. Only the mean of the metric is returned.
    """
    # general booster config; both sizes are sampled as exponents, i.e. 2**k - 1
    max_bin_exp = trial.suggest_int("max_bin_exp", 3, 8)
    num_leaves_exp = trial.suggest_int("num_leaves_exp", 2, 7)

    sampled_params = {
        "max_bin": 2**max_bin_exp - 1,
        "num_leaves": 2**num_leaves_exp - 1,
        "num_iterations": trial.suggest_int("num_iterations", 100, 2000),
        # regularization
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0, step=0.05),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 100),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-10, 1e2, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-10, 1e3, log=True),
        "path_smooth": trial.suggest_float("path_smooth", 1e-10, 1e2, log=True),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 1e-10, 1e1, log=True),
        # linear tree regularization parameter
        "linear_lambda": trial.suggest_float("linear_lambda", 1e-10, 1e2, log=True),
    }

    # sampled values take precedence over the defaults
    model_params = dict(DEFAULT_PARAMS)
    model_params.update(sampled_params)

    cv_mean, _cv_std = train_validate(
        dataframe = train,
        input_cols = input_cols,
        model_params = model_params,
        repeated_cv_split = repeated_cv_split,
        n_repetitions = REPETITIONS,
        verbose = False,
    )
    return cv_mean

In [33]:
%%time

train_validate(
    dataframe = train,
    input_cols = input_cols,
    model_params = DEFAULT_PARAMS,
    repeated_cv_split = repeated_cv_split,
    n_repetitions = REPETITIONS,
    verbose = False
    
)

CPU times: user 3min 7s, sys: 2.87 s, total: 3min 10s
Wall time: 26.3 s


(0.24797798211148084, 0.08358666127778804)

In [34]:
# Set to False to only open the stored study (e.g. to inspect results below)
# without launching new trials.
do_optimize = True

# The study is backed by a local sqlite file; with load_if_exists=True an
# existing study of the same name is presumably reopened rather than
# recreated -- see the Optuna create_study documentation.
study = optuna.create_study(
    study_name="iarc-stacking-lgbm-linear-v2",
    direction='minimize',
    storage='sqlite:///iarc-stacking-lgbm-linear-v2.db',
    load_if_exists=True,
)

if do_optimize:
    # runs until either the trial budget or the time budget is exhausted
    study.optimize(
        objective, 
        n_trials=10_000, 
        timeout=32400, # 9 hours
        n_jobs=1, 
        gc_after_trial=True,
    ) 

In [35]:
study.trials_dataframe().sort_values("value", ascending=True).head(20)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_feature_fraction,params_lambda_l1,params_lambda_l2,params_linear_lambda,params_max_bin_exp,params_min_data_in_leaf,params_min_gain_to_split,params_num_iterations,params_num_leaves_exp,params_path_smooth,state
2774,2774,0.208556,2023-08-10 12:51:12.172949,2023-08-10 12:51:29.595039,0 days 00:00:17.422090,0.2,5e-06,5.326677,0.006258,3,34,2.793668e-05,1147,6,3.93023e-06,COMPLETE
2197,2197,0.208681,2023-08-10 09:55:35.397783,2023-08-10 09:55:53.312799,0 days 00:00:17.915016,0.2,0.005335,4.971394,0.00766,3,35,4.64089e-06,1188,6,1.01354e-05,COMPLETE
2030,2030,0.208691,2023-08-10 09:01:06.633008,2023-08-10 09:01:24.468025,0 days 00:00:17.835017,0.2,0.004254,5.125108,0.003127,3,34,1.214788e-06,1167,6,5.91147e-06,COMPLETE
2200,2200,0.208727,2023-08-10 09:56:30.083264,2023-08-10 09:56:47.549724,0 days 00:00:17.466460,0.2,0.000106,7.491208,0.016119,3,35,7.92166e-06,1171,6,8.209018e-06,COMPLETE
2439,2439,0.208742,2023-08-10 11:11:06.352734,2023-08-10 11:11:24.011973,0 days 00:00:17.659239,0.2,0.000287,3.926657,0.006019,3,36,2.457061e-10,1162,6,0.0008565883,COMPLETE
2488,2488,0.208746,2023-08-10 11:25:42.448562,2023-08-10 11:26:00.757363,0 days 00:00:18.308801,0.2,0.00069,5.252768,0.071341,3,36,4.4422e-10,1229,6,0.129233,COMPLETE
2765,2765,0.208765,2023-08-10 12:48:33.080814,2023-08-10 12:48:50.889132,0 days 00:00:17.808318,0.2,0.001158,5.369788,0.01121,3,34,1.773856e-05,1109,6,8.477793e-06,COMPLETE
2844,2844,0.208784,2023-08-10 13:12:25.159746,2023-08-10 13:12:42.928444,0 days 00:00:17.768698,0.2,0.002199,3.736916,0.002604,3,35,2.514756e-10,1172,6,1.827401e-05,COMPLETE
2719,2719,0.208794,2023-08-10 12:35:17.419480,2023-08-10 12:35:34.608407,0 days 00:00:17.188927,0.2,0.000662,8.197599,0.005121,3,36,1.697509e-10,1155,6,1.555252e-05,COMPLETE
2735,2735,0.208797,2023-08-10 12:39:50.503309,2023-08-10 12:40:07.969698,0 days 00:00:17.466389,0.2,0.000468,8.639307,0.010774,3,34,3.863148e-10,1124,6,1.235145e-05,COMPLETE


In [36]:
plot_optimization_history(study)

In [37]:
plot_param_importances(study)

In [38]:
plot_slice(study)

In [39]:
# Rebuild the full LightGBM parameter set of the best trial: the two
# exponent-encoded parameters are converted back to 2**k - 1 and the result
# is layered on top of DEFAULT_PARAMS.
best_params = {
    name: value
    for name, value in study.best_params.items()
    if name not in ("max_bin_exp", "num_leaves_exp")
}
best_params["max_bin"] = 2**study.best_params["max_bin_exp"] - 1
best_params["num_leaves"] = 2**study.best_params["num_leaves_exp"] - 1
best_params = {**DEFAULT_PARAMS, **best_params}
best_params

{'objective': 'binary',
 'metric': 'None',
 'learning_rate': 0.005,
 'bagging_freq': 1,
 'seed': 2112,
 'first_metric_only': False,
 'feature_pre_filter': False,
 'verbosity': -1,
 'linear_tree': True,
 'n_jobs': 8,
 'pos_bagging_fraction': 1,
 'neg_bagging_fraction': 0.21218074656188604,
 'feature_fraction': 0.2,
 'lambda_l1': 4.529675783097431e-06,
 'lambda_l2': 5.326676788635167,
 'linear_lambda': 0.006258295192882317,
 'min_data_in_leaf': 34,
 'min_gain_to_split': 2.793667782173515e-05,
 'num_iterations': 1147,
 'path_smooth': 3.930230402046273e-06,
 'max_bin': 7,
 'num_leaves': 63}

***