In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import gc

from tabpfn import TabPFNClassifier
from sklearn import preprocessing
from sklearn import impute

import optuna
from optuna.visualization import (
    plot_contour
    , plot_edf
    , plot_intermediate_values
    , plot_optimization_history
    , plot_parallel_coordinate
    , plot_param_importances
    , plot_slice
)

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)


import plotly.express as px
import plotly.io as pio
# Use the static "png" renderer as plotly's default renderer.
pio.renderers.default = "png"

***
## loading data

In [None]:
input_path = "../data/raw"

train = pd.read_csv(f"{input_path}/train.csv")
test = pd.read_csv(f"{input_path}/test.csv")
greeks = pd.read_csv(f"{input_path}/greeks.csv")

# Header names may carry stray surrounding whitespace; normalise them.
train.columns = train.columns.str.strip()
test.columns = test.columns.str.strip()

# Available features: every train column except the first and the last one.
# This is computed BEFORE the greeks dummies are appended below, so the
# dummy columns never end up in the model inputs.
input_cols = train.columns[1:-1]
categ_cols = ["EJ"]

# Extend train with one-hot columns built from the greeks table.
# The assignment aligns on the row index of both frames.
dummies = pd.get_dummies(greeks[["Alpha", "Beta", "Gamma", "Delta"]])
train[dummies.columns] = dummies

# Encode the categorical feature as integers; the encoder is fitted on
# train only and then applied to both frames.
encoder = preprocessing.LabelEncoder()
encoder.fit(train["EJ"])
train["EJ"] = encoder.transform(train["EJ"]).astype(int)
test["EJ"] = encoder.transform(test["EJ"]).astype(int)

display(train)

In [None]:
# Fill missing feature values with the per-column median learned on train,
# then reuse the same fitted medians for test.
imputer = impute.SimpleImputer(strategy="median")
train[input_cols] = imputer.fit_transform(train[input_cols])
test[input_cols] = imputer.transform(test[input_cols])

#scaler = preprocessing.MaxAbsScaler()
#scaler.fit(train[input_cols])
#train[input_cols] = scaler.transform(train[input_cols])
#test[input_cols] = scaler.transform(test[input_cols])

In [None]:
# Pre-computed repeated cross-validation splits. train_validate() below reads
# this object via keys "repeat_0", "repeat_1", ...; each entry is iterated as a
# sequence of dicts with "fold", "train_idx" and "valid_idx".
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load split files from a trusted source.
repeated_cv_split = joblib.load("../data/iarc-data-split/repeated_5fold_cv_split_4tuning.pkl")
# Number of entries stored in the split object.
print(len(repeated_cv_split))

# number of repetitions to use
# NOTE(review): train_validate() looks up "repeat_0" .. "repeat_{REPETITIONS-1}",
# so this presumably must not exceed the count printed above — confirm.
REPETITIONS = 10

***
## training

In [None]:
def balanced_logloss_(y_pred, y_true):
    """Balanced binary log loss: the mean of the per-class average log losses.

    Parameters
    ----------
    y_pred : array-like of float
        Predicted probability of the positive class (label 1), one per sample.
    y_true : array-like of {0, 1}
        Ground-truth binary labels, same length as ``y_pred``.

    Returns
    -------
    float
        ``(loss_0 + loss_1) / 2`` where ``loss_k`` is the mean negative
        log-likelihood over the samples whose true label is ``k``.
        If only one class occurs in ``y_true`` the loss of that class alone
        is returned (instead of NaN from a 0/0 division). If ``y_true`` is
        empty the result is NaN.
    """
    # Accept lists / pandas Series as well as ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # Clip so that log() never receives exactly 0 or 1.
    p1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p0 = 1 - p1

    n0 = np.sum(1 - y_true)
    n1 = np.sum(y_true)

    # Only classes that actually occur contribute; this avoids dividing by a
    # zero class count when a validation fold contains a single class.
    class_losses = []
    if n0 > 0:
        class_losses.append(-np.sum((1 - y_true) * np.log(p0)) / n0)
    if n1 > 0:
        class_losses.append(-np.sum(y_true * np.log(p1)) / n1)

    if not class_losses:
        return float("nan")
    return sum(class_losses) / len(class_losses)

#def balanced_logloss(y_pred: np.ndarray, data: lgb.Dataset):
#    y_true = data.get_label()
#    return 'balanced_logloss', balanced_logloss_(y_pred, y_true), False 

In [None]:
# Relative frequency of each Class value in train.
pct = train.Class.value_counts(normalize=True)
# Share of Class 0 divided by share of Class 1.
scale_pos_weight = pct[0]/pct[1]
print("scale_pos_weight:", scale_pos_weight)

# Absolute count of each Class value in train.
cnt = train.Class.value_counts(normalize=False)
# Count of Class 1 divided by count of Class 0.
neg_bagging_fraction = cnt[1]/cnt[0]
print("neg_bagging_fraction:", neg_bagging_fraction)
# NOTE(review): in the cells visible here these two values are only printed and
# never passed to the TabPFN model — confirm whether they are still needed.

In [None]:
def train_validate(
        dataframe,
        input_cols,
        model_params,
        repeated_cv_split,
        n_repetitions=REPETITIONS,
        verbose=False,
        scale_probs=False,
    ):
    """Fit and score a TabPFN classifier over repeated cross-validation.

    For each of the first ``n_repetitions`` repeats in ``repeated_cv_split``
    (looked up under the keys ``"repeat_0"``, ``"repeat_1"``, ...) and for
    every fold of that repeat, a fresh ``TabPFNClassifier(**model_params)`` is
    fitted on the fold's training rows and evaluated with
    ``balanced_logloss_`` on the fold's validation rows.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data holding the ``input_cols`` features and a ``"Class"`` target.
    input_cols : sequence of column labels
        Feature columns fed to the classifier.
    model_params : mapping
        Keyword arguments for ``TabPFNClassifier``; copied, never mutated.
    repeated_cv_split : mapping
        ``"repeat_<i>"`` -> iterable of dicts with keys ``"fold"``,
        ``"train_idx"`` and ``"valid_idx"`` (row labels for ``.loc``).
    n_repetitions : int
        How many repeats to evaluate.
    verbose : bool
        Print progress per repeat and per fold.
    scale_probs : bool
        If true, divide each class column of the predicted probabilities by
        its column total over the validation fold, then renormalise every row
        to sum to one, before scoring.

    Returns
    -------
    tuple
        ``(mean, std)`` of the balanced log loss over all repeats and folds.
    """
    clf_kwargs = dict(model_params)  # private copy of the caller's mapping
    fold_losses = []

    for rep in range(n_repetitions):
        if verbose:
            print(f"REPEAT NUMBER: {rep+1}/{n_repetitions}")
        folds = repeated_cv_split[f"repeat_{rep}"]
        n_folds = len(folds)

        for spec in folds:
            fold, train_idx, valid_idx = spec["fold"], spec["train_idx"], spec["valid_idx"]
            if verbose:
                print(f"training model for fold: {fold+1}/{n_folds}")

            fit_df = dataframe.loc[train_idx, :].reset_index(drop=True)
            val_df = dataframe.loc[valid_idx, :].reset_index(drop=True)
            X_fit, y_fit = fit_df[input_cols].values, fit_df["Class"].values
            X_val, y_val = val_df[input_cols].values, val_df["Class"].values

            model = TabPFNClassifier(**clf_kwargs)
            model.fit(X_fit, y_fit, overwrite_warning=True)
            proba = model.predict_proba(X_val)

            if scale_probs:
                # Rebalance by per-class totals, then make rows sum to one again.
                column_totals = np.sum(proba, axis=0)
                proba = proba / column_totals
                row_totals = np.sum(proba, axis=1, keepdims=True)
                proba = proba / row_totals

            fold_losses.append(balanced_logloss_(proba[:, 1], y_val))

    return np.mean(fold_losses), np.std(fold_losses)


def objective(trial):
    """Optuna objective: mean balanced log loss of one TabPFN configuration.

    Draws the TabPFN settings (ensemble size as a power of two, preprocessing
    mode, multiclass decoder, feature-shift decoder) and the ``scale_probs``
    post-processing flag from ``trial``, evaluates them with
    ``train_validate`` on the notebook-level ``train`` frame, and returns the
    mean metric. The standard deviation is computed but not returned.
    """
    # Suggestions are drawn in a fixed order; scale_probs is not a classifier
    # argument, so it is kept out of the model parameter dict.
    ensemble_exp = trial.suggest_int("N_ensemble_configurations_exp", 2, 7)
    no_preprocess = trial.suggest_categorical("no_preprocess_mode", [True, False])
    decoder = trial.suggest_categorical("multiclass_decoder", ["permutation", ""])
    feature_shift = trial.suggest_categorical("feature_shift_decoder", [True, False])
    scale_probs = trial.suggest_categorical("scale_probs", [True, False])

    model_params = {
        "N_ensemble_configurations": 2 ** ensemble_exp,
        "no_preprocess_mode": no_preprocess,
        "multiclass_decoder": decoder,
        "feature_shift_decoder": feature_shift,
    }

    metric_mean, _metric_std = train_validate(
        dataframe=train,
        input_cols=input_cols,
        model_params=model_params,
        repeated_cv_split=repeated_cv_split,
        n_repetitions=REPETITIONS,
        verbose=False,
        scale_probs=scale_probs,
    )
    return metric_mean

In [None]:
%%time

# Single timed baseline evaluation before the search: 4 ensemble
# configurations with probability rescaling switched on. The cell output is
# the (mean, std) tuple returned by train_validate().
train_validate(
    dataframe = train,
    input_cols = input_cols,
    model_params = dict(N_ensemble_configurations=4, ),
    repeated_cv_split = repeated_cv_split,
    n_repetitions = REPETITIONS,
    verbose = False,
    scale_probs = True,
)

In [None]:
# Set to False to only load the stored study without running new trials.
do_optimize = True

# Grid handed to GridSampler. The keys and value sets mirror the suggest_*
# calls in objective(); 6 * 2 * 2 * 2 * 2 = 96 combinations in total.
search_space = {
    'N_ensemble_configurations_exp': [2,3,4,5,6,7],
    'no_preprocess_mode': [True, False],
    "multiclass_decoder": ["permutation", ""],
    "feature_shift_decoder": [True, False],
    "scale_probs": [True, False],
}
# The study is persisted in a local SQLite file; with load_if_exists=True an
# existing study of the same name is reused instead of raising.
study = optuna.create_study(
    study_name="iarc-tabpfn",
    direction='minimize',
    storage='sqlite:///iarc-tabpfn.db',
    load_if_exists=True,
    sampler=optuna.samplers.GridSampler(search_space),
)

# NOTE(review): n_trials and timeout are upper bounds here; GridSampler is
# expected to stop the study once the grid is exhausted — verify against the
# installed Optuna version.
if do_optimize:
    study.optimize(
        objective, 
        n_trials=10_000, 
        timeout=43200, # 12 hours
        n_jobs=1, 
        gc_after_trial=True,
    ) 

In [None]:
# The 20 trials with the lowest objective value (the study minimises it).
study.trials_dataframe().sort_values("value", ascending=True).head(20)

In [None]:
# Objective value per trial over the course of the study.
plot_optimization_history(study)

In [None]:
# Estimated importance of each searched parameter for the objective value.
plot_param_importances(study)

In [None]:
# Objective value against each searched parameter, one panel per parameter.
plot_slice(study)

In [None]:
# Empirical distribution function of the objective values across trials.
plot_edf(study)

In [None]:
# Parallel-coordinate view of parameter combinations and their objective value.
plot_parallel_coordinate(study)

In [None]:
# Copy the best trial's parameters and convert the searched exponent back into
# the actual ensemble size, matching what objective() passed to the model.
best_params = dict(study.best_params)
best_params["N_ensemble_configurations"] = 2**best_params.pop("N_ensemble_configurations_exp")
# NOTE(review): the dict still contains "scale_probs", which objective() pops
# before building the classifier — remove it before passing these as
# TabPFNClassifier keyword arguments.
best_params

***