In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import gc

from sklearn import preprocessing
from sklearn import impute
from sklearn import pipeline

import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.models import Model

import optuna
from optuna.visualization import (
    plot_contour
    , plot_edf
    , plot_intermediate_values
    , plot_optimization_history
    , plot_parallel_coordinate
    , plot_param_importances
    , plot_slice
)

pd.set_option('display.max_columns', None)

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "png"

In [None]:
# One seed value feeds both the NumPy and the TensorFlow global generators.
SEED = 2112
np.random.seed(SEED)
tf.random.set_seed(SEED)

***
## loading data

In [None]:
input_path = "../data/raw"

train = pd.read_csv(f"{input_path}/train.csv")
test  = pd.read_csv(f"{input_path}/test.csv")
greeks = pd.read_csv(f"{input_path}/greeks.csv")

# headers may carry stray whitespace; strip it so column lookups are stable
train.columns = [col.strip() for col in train.columns]
test.columns = [col.strip() for col in test.columns]

# available features: every column except the first and the last one
# (presumably the row id and the `Class` target -- verify against train.csv)
input_cols = train.columns[1:-1]
categ_cols = ["EJ"]

# extend train with one-hot targets built from the greeks metadata.
# The column assignment below aligns on the row index, so both frames must
# share it; otherwise the new columns would silently be filled with NaN.
assert greeks.index.equals(train.index), "greeks and train rows are not aligned"
# dtype=float keeps the targets numeric on every pandas version
# (pandas >= 2.0 would otherwise return bool columns).
dummies = pd.get_dummies(greeks[["Alpha","Beta","Gamma","Delta"]], dtype=float)
train[dummies.columns] = dummies

# encode the categorical feature as integer codes (fitted on train only)
encoder = preprocessing.LabelEncoder().fit(train["EJ"])
train["EJ"] = encoder.transform(train["EJ"]).astype(int)
test["EJ"] = encoder.transform(test["EJ"]).astype(int)

display(train)

In [None]:
# Median imputation followed by max-abs scaling, fitted on the training features.
preproc_steps = [
    ("imputer", impute.SimpleImputer(strategy="median")),
    ("scaler", preprocessing.MaxAbsScaler()),
]
preproc_pipe = pipeline.Pipeline(preproc_steps).fit(train[input_cols])
display(preproc_pipe)

# apply the fitted transformation to both frames, overwriting the raw features
for frame in (train, test):
    frame[input_cols] = preproc_pipe.transform(frame[input_cols])

In [None]:
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load split files that were produced by a trusted source.
repeated_cv_split = joblib.load("../data/iarc-data-split/repeated_5fold_cv_split_4tuning.pkl")
print(len(repeated_cv_split))

# number of repetitions to use
REPETITIONS = 5

# Fail fast: train_validate looks up "repeat_<i>" for every i < REPETITIONS,
# so a missing key would otherwise only surface in the middle of a long run.
missing_repeats = [
    f"repeat_{i}" for i in range(REPETITIONS) if f"repeat_{i}" not in repeated_cv_split
]
assert not missing_repeats, f"CV split file lacks repetitions: {missing_repeats}"

In [None]:
# One-hot target column names, one list per greek; each list feeds one model head.
alpha_labels = "Alpha_A Alpha_B Alpha_D Alpha_G".split()
beta_labels = "Beta_A Beta_B Beta_C".split()
gamma_labels = "Gamma_A Gamma_B Gamma_E Gamma_F Gamma_G Gamma_H Gamma_M Gamma_N".split()
delta_labels = "Delta_A Delta_B Delta_C Delta_D".split()

In [None]:
# Relative class frequencies of the binary target -> weight for the positive class.
pct = train["Class"].value_counts(normalize=True)
scale_pos_weight = pct.loc[0] / pct.loc[1]
print("scale_pos_weight:", scale_pos_weight)

# Absolute class counts -> ratio of positives to negatives.
cnt = train["Class"].value_counts()
neg_bagging_fraction = cnt.loc[1] / cnt.loc[0]
print("neg_bagging_fraction:", neg_bagging_fraction)

In [None]:
# per-row sample weight: positives get scale_pos_weight, every other row gets 1
is_positive = train["Class"] == 1
idx = train.index[is_positive]
train["weight"] = np.where(is_positive, scale_pos_weight, 1.)

***
## training

In [None]:
def create_model_instance(
        hidden_size=64, 
        dropout1=0.1, 
        dropout2=0.05,
        l2_lambda=1e-3,
        label_smoothing=0.01,
        activation="relu",
        input_dim=56,
        head_sizes=(4, 3, 8, 4),
        seed=2112,
    ):
    """Build and compile a one-hidden-layer MLP with several softmax heads.

    Architecture: input -> Dropout(dropout1) -> Dense(hidden_size) ->
    Dropout(dropout2) -> one softmax Dense head per entry of ``head_sizes``.
    Every Dense layer uses a Glorot-uniform initializer and an L2 kernel
    penalty. The model is compiled with Adam and one label-smoothed categorical
    cross-entropy per head, all heads weighted equally.

    Parameters
    ----------
    hidden_size : int
        Number of units in the shared hidden layer.
    dropout1, dropout2 : float
        Dropout rates applied to the input and to the hidden activations.
    l2_lambda : float
        L2 regularisation factor used by every Dense kernel.
    label_smoothing : float
        Label smoothing applied by each head's loss.
    activation : str
        Activation of the hidden layer.
    input_dim : int
        Number of input features (previously hard-coded to 56).
    head_sizes : sequence of int
        Number of classes of each output head, in output order. The default
        reproduces the original four heads of width 4, 3, 8 and 4.
    seed : int
        Seed given to every kernel initializer.

    Returns
    -------
    tf.keras.Model
        The compiled model; its outputs follow the order of ``head_sizes``.
    """
    def _dense(units, layer_activation):
        # NOTE(review): every layer gets an initializer with the same seed, so
        # heads of equal width may start from identical weights -- confirm
        # that this is intended.
        return tf.keras.layers.Dense(
            units,
            activation=layer_activation,
            kernel_initializer=tf.keras.initializers.GlorotUniform(seed=seed),
            kernel_regularizer=tf.keras.regularizers.l2(l2_lambda),
        )

    # `inputs` rather than `input`, to avoid shadowing the Python builtin
    inputs = tf.keras.Input(shape=(input_dim,))
    x = tf.keras.layers.Dropout(dropout1)(inputs)
    x = _dense(hidden_size, activation)(x)
    x = tf.keras.layers.Dropout(dropout2)(x)
    outputs = [_dense(n_classes, 'softmax')(x) for n_classes in head_sizes]

    model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer='adam',
        loss=[
            tf.keras.losses.CategoricalCrossentropy(label_smoothing=label_smoothing)
            for _ in outputs
        ],
        loss_weights=[1.] * len(outputs),
    )
    return model

In [None]:
def balanced_logloss_(y_pred, y_true, eps=1e-7):
    """Balanced binary log loss: the mean of the per-class average log losses.

    Note the argument order: predictions come first, labels second.

    Parameters
    ----------
    y_pred : array-like
        Predicted probability of the positive class, one value per sample.
    y_true : array-like
        Binary labels (0 or 1) with the same shape as ``y_pred``.
    eps : float
        Probabilities are clipped to ``[eps, 1 - eps]``; ``eps`` is also added
        to the class counts so that an absent class contributes 0 instead of
        dividing by zero.

    Returns
    -------
    float
        ``(loss_on_negatives + loss_on_positives) / 2``.

    Raises
    ------
    ValueError
        If ``y_pred`` and ``y_true`` differ in shape; without this check NumPy
        broadcasting (e.g. ``(n, 1)`` against ``(n,)``) would silently return
        a meaningless value.
    """
    # accept lists / Series as well as arrays; existing array inputs pass through unchanged
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    if y_pred.shape != y_true.shape:
        raise ValueError(
            f"shape mismatch: y_pred {y_pred.shape} vs y_true {y_true.shape}"
        )

    n0 = np.sum(1-y_true)
    n1 = np.sum(y_true)
    p1 = np.clip(y_pred, eps, 1-eps)
    p0 = 1-p1
    log_loss0 = - np.sum((1-y_true) * np.log(p0)) / (n0+eps)
    log_loss1 = - np.sum(y_true * np.log(p1)) / (n1+eps)
    return (log_loss0 + log_loss1)/2

In [None]:
def train_validate(
        dataframe,
        input_cols, 
        instance_params,
        fit_params,
        repeated_cv_split,
        n_repetitions=REPETITIONS,
        verbose=False,
    ):
    """Run repeated cross-validation and summarise the balanced log loss.

    For each of the first ``n_repetitions`` repetitions and each fold in it, a
    fresh model is built with ``create_model_instance(**instance_params)``,
    fitted on the training rows against the four one-hot targets named by the
    module-level ``alpha_labels``, ``beta_labels``, ``gamma_labels`` and
    ``delta_labels``, and scored on the validation rows with
    ``balanced_logloss_``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``input_cols``, the four groups of label columns, a
        ``weight`` column (sample weights) and a ``Class`` column (target
        used for scoring).
    input_cols : sequence of str
        Feature columns fed to the model.
    instance_params : dict
        Keyword arguments for ``create_model_instance``.
    fit_params : dict
        Extra keyword arguments for ``model.fit`` (e.g. epochs, batch_size).
    repeated_cv_split : mapping
        Maps ``"repeat_<i>"`` to a list of dicts with the keys ``fold``,
        ``train_idx`` and ``valid_idx``; the indices are used with
        ``dataframe.loc``.
    n_repetitions : int
        How many repetitions to evaluate.
    verbose : bool
        Print progress for every repetition and fold.

    Returns
    -------
    tuple of float
        Mean and standard deviation of the metric over all evaluated folds.
    """

    metrics = list()

    for repeat in range(n_repetitions):
        if verbose:
            print(f"REPEAT NUMBER: {repeat+1}/{n_repetitions}")
        cv_split = repeated_cv_split[f"repeat_{repeat}"]
        n_folds = len(cv_split)
        
        for split in cv_split:
            fold = split["fold"]
            train_idx = split["train_idx"]
            valid_idx = split["valid_idx"]
            if verbose:
                print(f"training model for fold: {fold+1}/{n_folds}")
        
            train_df = dataframe.loc[train_idx,:].reset_index(drop=True)
            valid_df = dataframe.loc[valid_idx,:].reset_index(drop=True)
            
            # Keras keeps global state for every model ever built; when many
            # models are created in a loop that state keeps growing, so it is
            # reset before each new model.
            tf.keras.backend.clear_session()

            model = create_model_instance(**instance_params)
            model.fit(
                x = train_df[input_cols].values, 
                y = [
                    train_df[alpha_labels].values, 
                    train_df[beta_labels].values, 
                    train_df[gamma_labels].values,
                    train_df[delta_labels].values,
                ], 
                sample_weight = train_df["weight"].values,
                shuffle = True,
                verbose = 0,
                **fit_params
            )
            
            # Only the first head is used for scoring: the positive-class
            # probability is the total mass of all but its first column
            # (presumably the first Alpha category is the negative class -- confirm).
            out1,_,_,_ = model.predict(valid_df[input_cols].values, verbose=0)
            y_pred = out1[:,1:].sum(axis=1)

            metrics.append( balanced_logloss_(y_pred, valid_df["Class"].values) )

            # release the fitted model before the next fold is built
            del model
    
    return np.mean(metrics), np.std(metrics)


def objective(trial):
    """Optuna objective: mean balanced log loss over the repeated CV.

    Samples the model hyper-parameters (hidden size, both dropout rates, L2
    factor on a log scale, label smoothing) and the fit hyper-parameters
    (epochs, batch size) from ``trial``, evaluates them with
    ``train_validate`` on the module-level ``train`` frame, and returns the
    mean metric. The standard deviation across folds is stored on the trial
    as the user attribute ``metric_std``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean validation metric (lower is better).
    """
    
    # `step` is passed by keyword: newer Optuna releases declare it
    # keyword-only and deprecate the positional form.
    instance_params = dict(
        hidden_size = trial.suggest_int("hidden_size", 32, 512, step=16),
        dropout1 = trial.suggest_float("dropout1", 0.0, 0.2),
        dropout2 = trial.suggest_float("dropout2", 0.0, 0.2),
        l2_lambda = trial.suggest_float("l2_lambda", 1e-10, 1e-1, log=True),
        label_smoothing = trial.suggest_float("label_smoothing", 0.0, 0.1),
        #activation = trial.suggest_categorical("activation", ["relu","sigmoid","tanh"]), 
    ) 
    fit_params = dict(
        epochs = trial.suggest_int("epochs", 100, 400, step=10), 
        batch_size = trial.suggest_int("batch_size", 16, 256, step=16),
    )
    
    metric_mean, metric_std = train_validate(
        dataframe = train,
        input_cols = input_cols,
        instance_params = instance_params,
        fit_params = fit_params,
        repeated_cv_split = repeated_cv_split,
        n_repetitions = REPETITIONS,
        verbose = False,
    )

    # keep the fold-to-fold spread next to the trial; a plain float is stored
    # so the value stays serialisable by the study storage
    trial.set_user_attr("metric_std", float(metric_std))
    
    return metric_mean

In [None]:
%%time

# Hand-picked baseline configuration, evaluated once with the repeated CV.
instance_params = dict(
    hidden_size=100,
    dropout1=0.05,
    dropout2=0.05,
    l2_lambda=1e-4,
    label_smoothing=0.01,
    activation="relu",
)
fit_params = dict(epochs=100, batch_size=32)

# the returned (mean, std) pair is the cell's displayed output
train_validate(
    dataframe=train,
    input_cols=input_cols,
    instance_params=instance_params,
    fit_params=fit_params,
    repeated_cv_split=repeated_cv_split,
    n_repetitions=REPETITIONS,
    verbose=False,
)

In [None]:
# Switch: set to False to only load the stored study without running new trials.
do_optimize = True

# search budget: stop after this many trials or this much wall time
N_TRIALS = 1000
TIMEOUT_SECONDS = 12 * 60 * 60  # 12 hours

# the study is persisted in a local SQLite file and reused when it already exists
study = optuna.create_study(
    direction='minimize',
    study_name="iarc-mlp",
    storage='sqlite:///iarc-mlp.db',
    load_if_exists=True,
)

if do_optimize:
    study.optimize(
        objective,
        n_trials=N_TRIALS,
        timeout=TIMEOUT_SECONDS,
        n_jobs=1,
        gc_after_trial=True,
    )

In [None]:
# the 20 trials with the lowest objective value, best first
trials_df = study.trials_dataframe()
trials_df.sort_values(by="value").head(20)

In [None]:
# objective value of the study's trials over the course of the optimization
plot_optimization_history(study)

In [None]:
# estimated importance of each hyper-parameter for the objective value
plot_param_importances(study)

In [None]:
# per-parameter slice plots of the study's trials
plot_slice(study)

In [None]:
# empirical distribution function (EDF) plot for the study
plot_edf(study)

In [None]:
# parallel-coordinate view of the sampled hyper-parameter combinations
plot_parallel_coordinate(study)

In [None]:
# Shallow copy of the best trial's hyper-parameters, so that later edits to
# `best_params` work on this copy; the bare name on the last line displays it.
best_params = dict(study.best_params)
best_params

***