In [None]:
import numpy as np
import pandas as pd
import joblib
from functools import partial
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn import preprocessing
from sklearn import impute

import optuna
from optuna.visualization import (
    plot_contour
    , plot_edf
    , plot_intermediate_values
    , plot_optimization_history
    , plot_parallel_coordinate
    , plot_param_importances
    , plot_slice
)

# custom modules
import sys
sys.path.append("../utils")

from preproc import load_data,scale_data
#from calibration1 import calibrate_probs,optimize_calibration
from calibration2 import calibrate_probs,optimize_calibration

In [None]:
def balanced_logloss_(y_pred, y_true, eps=1e-7):
    """Class-balanced binary log loss.

    The log loss is computed separately over the negative (label 0) and the
    positive (label 1) samples, each normalised by its own sample count, and
    the two per-class losses are averaged.

    Parameters
    ----------
    y_pred : array-like of predicted probabilities for the positive class.
    y_true : numpy array of 0/1 labels (must support ``1 - y_true``).
    eps : clipping bound applied to the probabilities; it is also added to
        each class count so an absent class contributes 0 instead of
        dividing by zero.

    Returns
    -------
    The mean of the two per-class log losses.
    """
    prob_pos = np.clip(y_pred, eps, 1 - eps)
    prob_neg = 1 - prob_pos

    is_negative = 1 - y_true
    neg_count = np.sum(is_negative)
    pos_count = np.sum(y_true)

    neg_loss = -np.sum(is_negative * np.log(prob_neg)) / (neg_count + eps)
    pos_loss = -np.sum(y_true * np.log(prob_pos)) / (pos_count + eps)
    return (neg_loss + pos_loss) / 2


def compute_overall_metric(oof_dfs:list) -> float:
    """Average the balanced log loss over a list of out-of-fold dataframes.

    Each dataframe must expose a ``pred_proba`` column (predicted
    probabilities) and a ``Class`` column (true labels). The loss is computed
    per dataframe with ``balanced_logloss_`` and the plain mean is returned.
    """
    per_frame_losses = []
    for frame in oof_dfs:
        frame_loss = balanced_logloss_(
            frame["pred_proba"].values,
            frame["Class"].values,
        )
        per_frame_losses.append(frame_loss)
    return np.mean(per_frame_losses)


def calibrate_oof(oof_dfs, calib_params):
    """Return calibrated deep copies of the out-of-fold dataframes.

    For every dataframe in ``oof_dfs`` a deep copy is made, its
    ``pred_proba`` values are passed to ``calibrate_probs`` together with
    ``calib_params`` (expanded as keyword arguments), and the result
    overwrites the copy's ``pred_proba`` column. The input dataframes are
    not reassigned.
    """
    def _calibrated_copy(frame):
        # work on the copy so the caller's dataframe keeps its raw predictions
        duplicate = frame.copy(deep=True)
        duplicate["pred_proba"] = calibrate_probs(
            duplicate.pred_proba.values,
            **calib_params
        )
        return duplicate

    return [_calibrated_copy(frame) for frame in oof_dfs]

In [None]:
# notebook parameter

# When True, the stacking inputs selected further down are the calibrated
# OOF predictions; when False, the raw (uncalibrated) ones are used.
CALIBRATE_FIRST = True

***
# load data and predictions

### 1. `lgbm-gbrt`

In [None]:
# Model 1 (lgbm-gbrt): stored calibration parameters and out-of-fold dataframes.
# NOTE: joblib.load unpickles its input -- only load artifacts from a trusted source.
calib_params_m01 = joblib.load("../data/iarc-lgbm-gbrt-bagging-balanced/calib_params.pkl")
oof_dfs_m01 = joblib.load("../data/iarc-lgbm-gbrt-bagging-balanced/oof_dataframes.pkl")

In [None]:
# Mean balanced log loss of model 1's OOF predictions before calibration.
compute_overall_metric(oof_dfs_m01)

In [None]:
# Apply the stored calibration to model 1's OOF predictions and re-score them.
oof_dfs_calib_m01 = calibrate_oof(oof_dfs_m01, calib_params_m01)
compute_overall_metric(oof_dfs_calib_m01)

### 2. `lgbm-linear`

In [None]:
# Model 2 (lgbm-linear): stored calibration parameters and out-of-fold dataframes.
# NOTE: joblib.load unpickles its input -- only load artifacts from a trusted source.
calib_params_m02 = joblib.load("../data/iarc-lgbm-linear-bagging-balanced/calib_params.pkl")
oof_dfs_m02 = joblib.load("../data/iarc-lgbm-linear-bagging-balanced/oof_dataframes.pkl")

In [None]:
# Mean balanced log loss of model 2's OOF predictions before calibration.
compute_overall_metric(oof_dfs_m02)

In [None]:
# Apply the stored calibration to model 2's OOF predictions and re-score them.
oof_dfs_calib_m02 = calibrate_oof(oof_dfs_m02, calib_params_m02)
compute_overall_metric(oof_dfs_calib_m02)

### 3. `catboost`

In [None]:
# Model 3 (catboost): stored calibration parameters and out-of-fold dataframes.
# NOTE: joblib.load unpickles its input -- only load artifacts from a trusted source.
calib_params_m03 = joblib.load("../data/iarc-catboost-weight-balanced/calib_params.pkl")
oof_dfs_m03 = joblib.load("../data/iarc-catboost-weight-balanced/oof_dataframes.pkl")

In [None]:
# Mean balanced log loss of model 3's OOF predictions before calibration.
compute_overall_metric(oof_dfs_m03)

In [None]:
# Apply the stored calibration to model 3's OOF predictions and re-score them.
oof_dfs_calib_m03 = calibrate_oof(oof_dfs_m03, calib_params_m03)
compute_overall_metric(oof_dfs_calib_m03)

### 4. `tabpfn`

In [None]:
# Model 4 (tabpfn): stored calibration parameters and out-of-fold dataframes.
# NOTE: joblib.load unpickles its input -- only load artifacts from a trusted source.
calib_params_m04 = joblib.load("../data/iarc-tabpfn/calib_params.pkl")
oof_dfs_m04 = joblib.load("../data/iarc-tabpfn/oof_dataframes.pkl")

In [None]:
# Mean balanced log loss of model 4's OOF predictions before calibration.
compute_overall_metric(oof_dfs_m04)

In [None]:
# Apply the stored calibration to model 4's OOF predictions and re-score them.
oof_dfs_calib_m04 = calibrate_oof(oof_dfs_m04, calib_params_m04)
compute_overall_metric(oof_dfs_calib_m04)

### 5. `multiout-mlp`

In [None]:
# Model 5 (multiout-mlp): stored calibration parameters and out-of-fold dataframes.
# NOTE: joblib.load unpickles its input -- only load artifacts from a trusted source.
calib_params_m05 = joblib.load("../data/iarc-multiout-tf-mlp/calib_params.pkl")
oof_dfs_m05 = joblib.load("../data/iarc-multiout-tf-mlp/oof_dataframes.pkl")

In [None]:
# Mean balanced log loss of model 5's OOF predictions before calibration.
compute_overall_metric(oof_dfs_m05)

In [None]:
# Apply the stored calibration to model 5's OOF predictions and re-score them.
oof_dfs_calib_m05 = calibrate_oof(oof_dfs_m05, calib_params_m05)
compute_overall_metric(oof_dfs_calib_m05)

### 6. `xgb-gblinear`

In [None]:
# Model 6 (xgb-gblinear): stored calibration parameters and out-of-fold dataframes.
# NOTE: joblib.load unpickles its input -- only load artifacts from a trusted source.
calib_params_m06 = joblib.load("../data/iarc-xgb-gblinear/calib_params.pkl")
oof_dfs_m06 = joblib.load("../data/iarc-xgb-gblinear/oof_dataframes.pkl")

In [None]:
# Mean balanced log loss of model 6's OOF predictions before calibration.
compute_overall_metric(oof_dfs_m06)

In [None]:
# Apply the stored calibration to model 6's OOF predictions and re-score them.
oof_dfs_calib_m06 = calibrate_oof(oof_dfs_m06, calib_params_m06)
compute_overall_metric(oof_dfs_calib_m06)

### 7. `xgb-gbtree`

In [None]:
# Model 7 (xgb-gbtree): stored calibration parameters and out-of-fold dataframes.
# NOTE: joblib.load unpickles its input -- only load artifacts from a trusted source.
calib_params_m07 = joblib.load("../data/iarc-xgb-gbtree/calib_params.pkl")
oof_dfs_m07 = joblib.load("../data/iarc-xgb-gbtree/oof_dataframes.pkl")

In [None]:
# Mean balanced log loss of model 7's OOF predictions before calibration.
compute_overall_metric(oof_dfs_m07)

In [None]:
# Apply the stored calibration to model 7's OOF predictions and re-score them.
oof_dfs_calib_m07 = calibrate_oof(oof_dfs_m07, calib_params_m07)
compute_overall_metric(oof_dfs_calib_m07)

***
## prepare data

In [None]:
# Base models whose OOF predictions feed the stacker. CALIBRATE_FIRST picks
# between the calibrated and the raw variant of the same seven models.

if CALIBRATE_FIRST:
    print("using calibrated")
    oof_all = [
        oof_dfs_calib_m01,
        oof_dfs_calib_m02,
        oof_dfs_calib_m03,
        oof_dfs_calib_m04,
        oof_dfs_calib_m05,
        oof_dfs_calib_m06,
        oof_dfs_calib_m07,
    ]
else:
    print("using non calibrated")
    oof_all = [
        oof_dfs_m01,
        oof_dfs_m02,
        oof_dfs_m03,
        oof_dfs_m04,
        oof_dfs_m05,
        oof_dfs_m06,
        oof_dfs_m07,
    ]

# sanity check: one overall metric per selected base model
for _oof_dfs in oof_all:
    print(compute_overall_metric(_oof_dfs))

In [None]:
# Load the raw competition files and build the preprocessed `train` / `test`
# frames used by the stacker. Every fit below uses `train` only; `test` is
# transformed with the fitted objects.
input_path = "../data/raw"

train = pd.read_csv(f"{input_path}/train.csv")
test  = pd.read_csv(f"{input_path}/test.csv")
greeks = pd.read_csv(f"{input_path}/greeks.csv")

# strip stray whitespace from the column names
train.columns = [col.strip() for col in train.columns]
test.columns = [col.strip() for col in test.columns]

# available features
# every column except the first and the last one
# (presumably the Id and the Class target -- TODO confirm against the CSV)
input_cols = train.columns[1:-1]
categ_cols = ["EJ"]  # not referenced again in the visible part of the notebook

# we extend train with dummies from greeks
# NOTE(review): the assignment aligns on the row index, not on an Id column;
# this assumes greeks.csv lists rows in the same order as train.csv -- confirm.
dummies = pd.get_dummies(greeks[["Alpha","Beta","Gamma","Delta"]])
train[dummies.columns] = dummies

# encode categorical features
encoder = preprocessing.LabelEncoder().fit(train["EJ"])
train["EJ"] = encoder.transform(train["EJ"]).astype(int)
test["EJ"] = encoder.transform(test["EJ"]).astype(int)

# impute missing values
# (median strategy, fitted on all input columns including the encoded EJ)
imputer = impute.SimpleImputer(strategy="median")
imputer.fit(train[input_cols])
train[input_cols] = imputer.transform(train[input_cols])
test[input_cols] = imputer.transform(test[input_cols])

# scale data
scaler = preprocessing.MaxAbsScaler()
scaler.fit(train[input_cols])
train[input_cols] = scaler.transform(train[input_cols])
test[input_cols] = scaler.transform(test[input_cols])

In [None]:
# Pre-computed repeated CV split. Later code indexes it with "repeat_<k>" keys
# and reads "fold" / "train_idx" / "valid_idx" from every entry.
# NOTE: joblib.load unpickles its input -- only load artifacts from a trusted source.
repeated_cv_split = joblib.load("../data/iarc-data-split/repeated_5fold_cv_split_4validation.pkl")
print(len(repeated_cv_split))

# number of repetitions to use
# (must not exceed the number of "repeat_<k>" entries available in the split)
REPETITIONS = 10

In [None]:
# Class-imbalance constants derived from the label distribution of `train`.

# ratio of negative to positive share -> weight applied to the positive class
pct = train["Class"].value_counts(normalize=True)
scale_pos_weight = pct.loc[0] / pct.loc[1]
print("scale_pos_weight:", scale_pos_weight)

# ratio of positive to negative count -> fraction of negatives kept per bag
cnt = train["Class"].value_counts()
neg_bagging_fraction = cnt.loc[1] / cnt.loc[0]
print("neg_bagging_fraction:", neg_bagging_fraction)

***
## training

In [None]:
def include_oof_preds(train, input_cols, oof_all, repeat_nbr):
    """Append the base models' out-of-fold predictions as stacking features.

    Parameters
    ----------
    train : pd.DataFrame
        Training frame; must contain an ``Id`` column. It is not modified.
    input_cols : iterable of str
        Names of the base feature columns. Any iterable is accepted
        (``pd.Index``, ndarray, list, tuple, ...); it is not modified.
    oof_all : sequence
        One entry per base model. ``oof_all[i][repeat_nbr]`` must be a
        dataframe with at least the columns ``Id`` and ``pred_proba``.
    repeat_nbr
        Selector of the CV repetition inside every model's OOF collection.

    Returns
    -------
    (pd.DataFrame, list)
        A copy of ``train`` with one extra column ``pm<k>`` per base model
        (k starting at 1), and the feature list extended with those names.
        Rows are matched on ``Id`` with an inner join, so an ``Id`` missing
        from any OOF frame drops that row from the result.
    """
    merged = train.copy()
    # .tolist() keeps the element types produced for Index/ndarray inputs;
    # plain lists and tuples (which have no .tolist) are simply copied.
    if hasattr(input_cols, "tolist"):
        feature_cols = list(input_cols.tolist())
    else:
        feature_cols = list(input_cols)

    for model_nbr, model_oofs in enumerate(oof_all, start=1):
        pred_col = f"pm{model_nbr}"
        preds = (
            model_oofs[repeat_nbr][["Id", "pred_proba"]]
            .rename(columns={"pred_proba": pred_col})
        )
        # explicit key: never fall back to "all common columns"
        merged = pd.merge(merged, preds, on="Id")
        feature_cols.append(pred_col)

    return merged, feature_cols

In [None]:
def balanced_logloss_(y_pred, y_true, eps=1e-15):
    """Class-balanced binary log loss.

    The log loss is computed separately over the negative (label 0) and the
    positive (label 1) samples, each divided by its own sample count, and the
    two per-class losses are averaged.

    Parameters
    ----------
    y_pred : array-like of predicted probabilities for the positive class.
    y_true : numpy array of 0/1 labels (must support ``1 - y_true``).
    eps : clipping bound applied to the probabilities. The keyword keeps this
        definition call-compatible with the ``balanced_logloss_(..., eps=...)``
        signature defined earlier in the notebook, which this one replaces.

    Returns
    -------
    The mean of the two per-class log losses. A class with no samples
    contributes 0 instead of producing NaN through a 0/0 division.
    """
    n0 = np.sum(1-y_true)
    n1 = np.sum(y_true)
    p1 = np.clip(y_pred, eps, 1-eps)
    p0 = 1-p1
    # max(n, 1) only changes the degenerate n == 0 case, where the numerator is 0
    log_loss0 = - np.sum((1-y_true) * np.log(p0)) / max(n0, 1)
    log_loss1 = - np.sum(y_true * np.log(p1)) / max(n1, 1)
    return (log_loss0 + log_loss1)/2

def balanced_logloss(y_pred: np.ndarray, data: lgb.Dataset):
    """Balanced log loss packaged as a LightGBM custom evaluation function.

    Reads the true labels from ``data`` and returns the tuple
    ``(metric name, metric value, is_higher_better)``; the last element is
    ``False`` because a lower loss is better.
    """
    labels = data.get_label()
    score = balanced_logloss_(y_pred, labels)
    higher_is_better = False
    return 'balanced_logloss', score, higher_is_better

In [None]:
# How class imbalance is handled: "weight" re-weights the positive class,
# "bagging" subsamples the negative class on every boosting iteration.
balance_method = "bagging"

# Fixed LightGBM parameters shared by every trial; the Optuna search only
# adds or overrides keys on top of this dict.
DEFAULT_PARAMS = {
    'objective': 'binary',
    'metric': 'None',
    'learning_rate': 0.005,
    'bagging_freq': 1,
    'seed': 2112,
    'first_metric_only': False,
    'feature_pre_filter': False,
    'verbosity': -1,
    'linear_tree':True,
    'n_jobs':8,
}

if balance_method == "weight":
    DEFAULT_PARAMS["scale_pos_weight"] = scale_pos_weight
elif balance_method == "bagging":
    # 'bagging_freq': 1 is already part of the base dict above
    DEFAULT_PARAMS["pos_bagging_fraction"] = 1
    DEFAULT_PARAMS["neg_bagging_fraction"] = neg_bagging_fraction
else:
    # fail loudly: continuing would silently train without any class balancing
    raise ValueError(
        f"Unknown balance_method: {balance_method!r} (expected 'weight' or 'bagging')"
    )

display(DEFAULT_PARAMS)

In [None]:
def train_validate(
        dataframe,
        input_cols,
        model_params,
        repeated_cv_split,
        n_repetitions=REPETITIONS,
        verbose=False,
    ):
    """Train and score the stacking model over a repeated CV split.

    For every repetition the base models' OOF predictions of that repetition
    are merged into ``dataframe`` as extra features (``include_oof_preds``),
    then one LightGBM model is trained per fold and scored on the fold's
    validation rows with ``balanced_logloss_``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Training data with the ``input_cols`` features, a ``Class`` label
        column and the ``Id`` column needed to merge the OOF predictions.
    input_cols : iterable of str
        Base feature columns (the OOF prediction columns are added on top).
    model_params : dict
        LightGBM parameters. An optional ``num_iterations`` entry is used as
        the number of boosting rounds (default 1000). The dict is not modified.
    repeated_cv_split : mapping
        ``repeated_cv_split[f"repeat_{k}"]`` is a sequence of dicts with the
        keys ``fold``, ``train_idx`` and ``valid_idx``.
    n_repetitions : int
        Number of repetitions to evaluate, starting at ``repeat_0``.
    verbose : bool
        Print progress per repetition and fold.

    Returns
    -------
    (float, float)
        Mean and standard deviation of the per-fold metric over all
        repetitions and folds.

    Notes
    -----
    The base models' OOF predictions are read from the notebook-level
    ``oof_all``. ``train_idx`` / ``valid_idx`` are used as ``.loc`` labels on
    the merged frame, which carries a fresh 0..n-1 index; this presumably
    relies on the merge keeping every row of ``dataframe`` in its original
    order -- verify against how the split was generated.
    """
    metrics = list()
    model_params = dict(model_params)  # private copy: the pop below must not leak
    num_iterations = model_params.pop("num_iterations", 1000)

    for repeat in range(n_repetitions):
        if verbose:
            print(f"REPEAT NUMBER: {repeat+1}/{n_repetitions}")
        cv_split = repeated_cv_split[f"repeat_{repeat}"]
        n_folds = len(cv_split)

        # The stacked frame depends on the repetition only, so it is built
        # once per repetition rather than once per fold -- and from the
        # `dataframe` argument, not from a notebook global.
        stacked_df, stacked_cols = include_oof_preds(dataframe, input_cols, oof_all, repeat)

        for split in cv_split:
            fold = split["fold"]
            train_idx = split["train_idx"]
            valid_idx = split["valid_idx"]
            if verbose:
                print(f"training model for fold: {fold+1}/{n_folds}")

            train_df = stacked_df.loc[train_idx,:].reset_index(drop=True)
            valid_df = stacked_df.loc[valid_idx,:].reset_index(drop=True)

            train_dset = lgb.Dataset(
                data=train_df.loc[:,stacked_cols],
                label=train_df.loc[:,"Class"].values,
                free_raw_data=False
            )
            model = lgb.train(
                params=model_params,
                train_set=train_dset,
                num_boost_round=num_iterations,
            )

            y_pred = model.predict(valid_df.loc[:,stacked_cols])
            metrics.append( balanced_logloss_(y_pred, valid_df.loc[:,"Class"].values) )

    return np.mean(metrics), np.std(metrics)


def objective(trial):
    """Optuna objective: mean repeated-CV metric for one sampled configuration.

    Samples the LightGBM hyper-parameters from ``trial``, layers them over
    ``DEFAULT_PARAMS`` and returns the mean metric reported by
    ``train_validate`` on the notebook-level ``train`` / ``input_cols`` /
    ``repeated_cv_split``. ``max_bin`` and ``num_leaves`` are searched through
    their exponents and mapped to ``2**k - 1``.
    """
    # general booster config
    max_bin_exp = trial.suggest_int("max_bin_exp", 3, 8)
    num_leaves_exp = trial.suggest_int("num_leaves_exp", 2, 7)

    sampled_params = {
        "max_bin": 2**max_bin_exp - 1,
        "num_leaves": 2**num_leaves_exp - 1,
        "num_iterations": trial.suggest_int("num_iterations", 100, 2000),
        # regularization
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0, step=0.05),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 100),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-10, 1e2, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-10, 1e3, log=True),
        "path_smooth": trial.suggest_float("path_smooth", 1e-10, 1e2, log=True),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 1e-10, 1e1, log=True),
        # linear tree regularization parameter
        "linear_lambda": trial.suggest_float("linear_lambda", 1e-10, 1e2, log=True),
    }

    # sampled values take precedence over the fixed defaults
    model_params = {**DEFAULT_PARAMS, **sampled_params}

    # only the mean is optimised; the spread across folds is discarded
    metric_mean, _ = train_validate(
        dataframe=train,
        input_cols=input_cols,
        model_params=model_params,
        repeated_cv_split=repeated_cv_split,
        n_repetitions=REPETITIONS,
        verbose=False,
    )
    return metric_mean

In [None]:
%%time

# Baseline run with the untuned DEFAULT_PARAMS: displays the (mean, std) of
# the CV metric and, via %%time, how long one full evaluation takes.
train_validate(
    dataframe = train,
    input_cols = input_cols,
    model_params = DEFAULT_PARAMS,
    repeated_cv_split = repeated_cv_split,
    n_repetitions = REPETITIONS,
    verbose = False
    
)

In [None]:
# Set to False to only open the stored study without running new trials.
do_optimize = True

# The study is persisted in a local SQLite file; with load_if_exists=True a
# study of the same name already present in that storage is reused instead
# of raising an error.
study = optuna.create_study(
    study_name="iarc-stacking-lgbm-linear-v2-try2",
    direction='minimize',
    storage='sqlite:///iarc-stacking-lgbm-linear-v2-try2.db',
    load_if_exists=True,
)

# The search is bounded by both a trial budget and a wall-clock timeout.
if do_optimize:
    study.optimize(
        objective, 
        n_trials=10_000, 
        timeout=21600, # 6 hours
        n_jobs=1, 
        gc_after_trial=True,
    ) 

In [None]:
# The 20 trials with the lowest objective value.
study.trials_dataframe().sort_values("value", ascending=True).head(20)

In [None]:
# Objective value across trials.
plot_optimization_history(study)

In [None]:
# Estimated importance of each searched hyper-parameter.
plot_param_importances(study)

In [None]:
# Objective value against each searched hyper-parameter individually.
plot_slice(study)

In [None]:
# Rebuild the full LightGBM parameter dict of the best trial. The study stores
# the exponents ("max_bin_exp", "num_leaves_exp"), so they are mapped back to
# 2**k - 1 exactly as in `objective`, then layered over DEFAULT_PARAMS.
best_params = dict(study.best_params)
best_params["max_bin"] = 2**best_params.pop("max_bin_exp")-1
best_params["num_leaves"] = 2**best_params.pop("num_leaves_exp")-1
best_params = {**DEFAULT_PARAMS, **best_params}
best_params

***