In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import gc

import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn import preprocessing

import optuna
from optuna.visualization import (
    plot_contour
    , plot_edf
    , plot_intermediate_values
    , plot_optimization_history
    , plot_parallel_coordinate
    , plot_param_importances
    , plot_slice
)

from lofo import LOFOImportance, Dataset, plot_importance
from iterstrat.ml_stratifiers import (
    RepeatedMultilabelStratifiedKFold,
    MultilabelStratifiedShuffleSplit
)
from sklearn.metrics import make_scorer

pd.set_option('display.max_columns', None)

import plotly.express as px
import plotly.io as pio
pio.renderers.default = "png"

***
## loading data

In [None]:
input_path = "../data/raw"

# read the three raw tables that live side by side in the input folder
train, test, greeks = (
    pd.read_csv(f"{input_path}/{name}.csv") for name in ("train", "test", "greeks")
)

# column headers may carry surrounding whitespace: normalise them
train.columns = list(map(str.strip, train.columns))
test.columns = list(map(str.strip, test.columns))

# available features: every train column except the first and the last one
# (taken BEFORE the greeks dummies are appended below, so they are not features)
input_cols = train.columns[1:-1]
categ_cols = ["EJ"]

# one-hot encode the greeks metadata and attach it to train (aligned on the row index)
greek_cols = ["Alpha","Beta","Gamma","Delta"]
dummies = pd.get_dummies(greeks[greek_cols])
train[dummies.columns] = dummies

# integer-encode the categorical feature; the encoder is fitted on train only
encoder = preprocessing.LabelEncoder()
encoder.fit(train["EJ"])
for frame in (train, test):
    frame["EJ"] = encoder.transform(frame["EJ"]).astype(int)

display(train)

***
## correlation between features

In [None]:
# pairwise feature correlations: linear (pearson) and rank-based (spearman)
feature_frame = train[input_cols]
cols_pearson = feature_frame.corr(method="pearson")
cols_spearman = feature_frame.corr(method="spearman")

In [None]:
# Histogram of the pairwise spearman correlations between features.
corrs = cols_spearman.values
# k=1 keeps only the strict upper triangle: each pair is counted once and the
# diagonal (self-correlation, always 1.0) no longer pollutes the histogram.
corrs = corrs[np.triu_indices(corrs.shape[0], k=1)]

fig, ax = plt.subplots(figsize=(12,5))
ax.hist(corrs, bins=200)
ax.grid()
# 41 evenly spaced ticks from -1 to 1 (step 0.05), including the right end
ax.set_xticks(np.linspace(-1, 1, 41))
ax.tick_params(axis="x", rotation=90)
ax.set_title("spearman corr hist")
plt.show()

In [None]:
# Histogram of the pairwise pearson correlations between features.
corrs = cols_pearson.values
# k=1 keeps only the strict upper triangle: each pair is counted once and the
# diagonal (self-correlation, always 1.0) no longer pollutes the histogram.
corrs = corrs[np.triu_indices(corrs.shape[0], k=1)]

fig, ax = plt.subplots(figsize=(12,5))
ax.hist(corrs, bins=200)
ax.grid()
# 41 evenly spaced ticks from -1 to 1 (step 0.05), including the right end
ax.set_xticks(np.linspace(-1, 1, 41))
ax.tick_params(axis="x", rotation=90)
ax.set_title("pearson corr hist")
plt.show()

***
## training

In [None]:
def balanced_logloss_(y_true, y_pred):
    """Balanced binary log loss: the mean of the per-class average log losses.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_pred : array-like of predicted probabilities for class 1
        (clipped to [1e-15, 1 - 1e-15] before taking logs).

    Returns
    -------
    float
        (loss on class 0 + loss on class 1) / 2 when both classes are present.
        If one class has no samples, the loss of the class that is present is
        returned instead of a 0/0 NaN. NaN is returned for empty input.
    """
    # coerce so that lists / pandas Series / integer arrays all behave the same
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    p1 = np.clip(y_pred, 1e-15, 1-1e-15)
    p0 = 1-p1
    n0 = np.sum(1-y_true)
    n1 = np.sum(y_true)

    # only average over classes that actually occur, to avoid dividing by zero
    class_losses = []
    if n0 > 0:
        class_losses.append(- np.sum((1-y_true) * np.log(p0)) / n0)
    if n1 > 0:
        class_losses.append(- np.sum(y_true * np.log(p1)) / n1)

    if not class_losses:
        return float("nan")
    return sum(class_losses) / len(class_losses)

def balanced_logloss(y_pred: np.ndarray, data: lgb.Dataset):
    """Evaluation callback wrapping ``balanced_logloss_`` for a LightGBM dataset.

    Reads the true labels from ``data`` and returns the tuple
    ``(metric name, metric value, is_higher_better)``; the last item is
    ``False`` because a lower loss is better.
    """
    score = balanced_logloss_(data.get_label(), y_pred)
    return 'balanced_logloss', score, False

import inspect

# By default make_scorer feeds the metric with estimator.predict(), i.e. hard
# 0/1 labels, which makes a log loss meaningless. Ask for the predicted
# probabilities instead. The keyword for that depends on the scikit-learn
# version: `response_method` in recent releases, `needs_proba` in older ones.
if "response_method" in inspect.signature(make_scorer).parameters:
    _proba_kwargs = {"response_method": "predict_proba"}
else:
    _proba_kwargs = {"needs_proba": True}

# greater_is_better=False: the scorer returns the negated loss
balanced_logloss_scorer = make_scorer(
    balanced_logloss_,
    greater_is_better=False,
    **_proba_kwargs,
)

In [None]:
# ratio of negative to positive class frequencies
pct = train["Class"].value_counts(normalize=True)
scale_pos_weight = pct.loc[0] / pct.loc[1]
print("scale_pos_weight:", scale_pos_weight)

# fraction of negatives to keep so that they match the number of positives
cnt = train["Class"].value_counts(normalize=False)
neg_bagging_fraction = cnt.loc[1] / cnt.loc[0]
print("neg_bagging_fraction:", neg_bagging_fraction)

In [None]:
# How to compensate for class imbalance:
#   "weight"  -> up-weight the positive class with scale_pos_weight
#   "bagging" -> keep every positive and sub-sample the negatives at each iteration
#balance_method = "weight"
balance_method = "bagging"

# LightGBM hyper-parameters for the base model
model_params = {
    'objective': 'binary',
    'metric': 'None',
    'learning_rate': 0.005,
    'max_bin': 63,
    'num_leaves': 7,
    'seed': 2112,
    'first_metric_only': False,
    'feature_pre_filter': False,
    'verbosity': -1,
    'feature_fraction': 0.75,
    'lambda_l1': 0.00020196676966216634,
    'lambda_l2': 2.3627262174517976e-09,
    'min_data_in_leaf': 13,
    'min_gain_to_split': 0.0007626326850799573,
    'num_iterations': 1591,
    'path_smooth': 2.652521741711401e-09
}

if balance_method == "weight":
    model_params["scale_pos_weight"] = scale_pos_weight
elif balance_method == "bagging":
    model_params["bagging_freq"] = 1
    model_params["pos_bagging_fraction"] = 1
    model_params["neg_bagging_fraction"] = neg_bagging_fraction
else:
    # fail loudly: continuing would silently train a model without any balancing
    raise ValueError(
        f"Unknown balance_method: {balance_method!r} (expected 'weight' or 'bagging')"
    )

display(model_params)

In [None]:
# estimator evaluated by LOFO, built from the hyper-parameters defined in model_params
lgbm_model = LGBMClassifier(
    **model_params
)
display(lgbm_model)

In [None]:
# LOFO dataset: train frame, "Class" target and the raw input features
# (auto_group_threshold is set to 0.5)
dset = Dataset(df=train, target="Class", features=input_cols, auto_group_threshold=0.5)

In [None]:
# cross-validation splits
# (alternative kept for reference: repeated multilabel stratified k-fold)
#rmskf = RepeatedMultilabelStratifiedKFold(n_splits=5, n_repeats=20, random_state=2112)
#rmskf_split = list(rmskf.split(train, train[["Class","Alpha_B","Alpha_D","Alpha_G"]]))

# 100 shuffle splits holding out 20% each, stratified jointly on the target
# and on the Alpha dummy columns
strat_labels = train[["Class","Alpha_B","Alpha_D","Alpha_G"]]
msss = MultilabelStratifiedShuffleSplit(
    n_splits=100,
    test_size=0.2,
    random_state=2112,
)
msss_split = list(msss.split(train, strat_labels))

In [None]:
# LOFO importance runner: scores lgbm_model on the precomputed splits,
# single process, no extra fit parameters
lofo_imp = LOFOImportance(
    dataset=dset,
    model=lgbm_model,
    scoring=balanced_logloss_scorer,
    cv=msss_split,
    fit_params=None,
    n_jobs=1,
)

In [None]:
%%time
# run the LOFO evaluation (timed by the cell magic) and keep the resulting importances
importance_df = lofo_imp.get_importance()

In [None]:
# inspect the importance table returned by LOFO
importance_df

In [None]:
# plot the means and standard deviations of the importances (default plot kind)
plot_importance(importance_df, figsize=(12, 20))

In [None]:
# same importances, drawn as a box plot (kind="box") instead of the default plot
plot_importance(importance_df, figsize=(12, 20), kind="box")

In [None]:
# list the features whose mean importance is below -0.01
# NOTE(review): presumably these are features whose removal improves the score — confirm against LOFO's sign convention
importance_df.query("importance_mean < - 0.01").feature.values.tolist()

***