In [None]:
import gc
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import yaml

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

import optuna
from optuna.visualization import (
    plot_contour
    , plot_edf
    , plot_intermediate_values
    , plot_optimization_history
    , plot_parallel_coordinate
    , plot_param_importances
    , plot_slice
)

import sys
sys.path.append("../utils")
from metrics import compute_recall_at4, compute_normalized_gini, compute_amex_metric

# Seed NumPy's global random generator so NumPy-driven randomness is repeatable.
np.random.seed(2112)
# Do not truncate wide DataFrames when they are displayed.
pd.set_option('display.max_columns', None)

In [None]:
import plotly.express as px
import plotly.io as pio
# Pick the plotly renderer used for every figure displayed in this notebook.
pio.renderers.default = "jupyterlab"

In [None]:
# Echo the LightGBM version in the cell output so the run environment is recorded.
lgb.__version__

***
## load and prepare data

In [None]:
!ls ../data/processed/dsv04

In [None]:
# Feature table for dataset version "dsv04".
# NOTE(review): presumably indexed by customer_ID (it is joined on the index below) -- confirm.
train = pd.read_parquet("../data/processed/dsv04/train.parquet")
# Labels, indexed by customer_ID so they can be joined to the features on the index.
train_labels = pd.read_csv("../data/raw/train_labels.csv", index_col="customer_ID")

In [None]:
# Freeze the list of model inputs now, before the label column is merged in
# and before the index is turned into a regular column.
input_feats = list(train.columns)
len(input_feats)

In [None]:
# Join the labels onto the features by index, then move the index into a
# column so the frame gets a plain 0..n-1 row index.
train = (
    train
    .merge(train_labels, how="inner", left_index=True, right_index=True)
    .reset_index()
)

# The labels now live in `train`; release the standalone copy.
del train_labels
gc.collect()

***
## model tuning

Train with stratified 3-fold cross-validation (a single repetition); each Optuna trial is scored on the pooled out-of-fold predictions.

In [None]:
# Build the stratified 3-fold splitter once and materialise its folds, so
# every Optuna trial is evaluated on exactly the same train/validation rows.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=2112)
skf_split = [fold for fold in skf.split(train, train["target"].values)]

In [None]:
# Fixed LightGBM settings shared by every trial; the tuned values sampled in
# `objective` are layered on top of these.
default_params = dict(
    objective='binary',
    metric='None',                # the string 'None', not the Python None object
    learning_rate=0.05,
    force_col_wise=True,
    bagging_freq=1,
    seed=2112,
    verbosity=-1,
    bin_construct_sample_cnt=100000000,
    feature_pre_filter=True,
)

In [None]:
def train_models(dataframe: pd.DataFrame, split: list, model_params: dict) -> pd.DataFrame:
    """Fit one LightGBM model per fold and collect out-of-fold predictions.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Training data. Must contain a "target" column and every column named
        in the notebook-level ``input_feats`` list.
    split : list
        ``(train_idx, valid_idx)`` pairs of *positional* row indices, such as
        ``list(StratifiedKFold(...).split(...))``.
    model_params : dict
        Parameters forwarded unchanged to ``lgb.train``.

    Returns
    -------
    pd.DataFrame
        A copy of the "target" column plus a float "pred" column. Each row's
        prediction comes from the model of the fold in which that row was in
        the validation set. Rows that appear in no validation fold keep the
        sentinel value -1.0.
    """
    # dataframe to store the oof predictions
    oof = dataframe[["target"]].copy()
    # Float sentinel: an integer -1 would create an int64 column and force an
    # implicit (deprecated) dtype upcast when float predictions are written.
    oof["pred"] = -1.0
    pred_col = oof.columns.get_loc("pred")

    for train_idx, valid_idx in split:

        # Fold indices are positional, so select with .iloc; this stays
        # correct even when the frame's index is not 0..n-1.
        train_df = dataframe.iloc[train_idx]
        valid_df = dataframe.iloc[valid_idx]

        train_dset = lgb.Dataset(
            data=train_df[input_feats],
            label=train_df["target"].values,
            free_raw_data=True
        )
        model = lgb.train(
            params=model_params,
            train_set=train_dset
        )
        oof.iloc[valid_idx, pred_col] = model.predict(valid_df[input_feats])

        # Release the fold's dataset and booster before building the next one.
        del train_dset, model
        gc.collect()

    return oof

In [None]:
def objective(trial):
    """Optuna objective: sample LightGBM hyper-parameters and score them.

    The sampled values are layered over ``default_params``, one model per
    fold of ``skf_split`` is trained on ``train``, and the pooled
    out-of-fold predictions are scored with ``compute_amex_metric``.

    ``max_bin`` and ``num_leaves`` are searched over values of the form
    ``2**k - 1``; the trial stores the exponent ``k`` under the names
    ``max_bin_exp`` and ``num_leaves_exp``, not the final value.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object supplied by ``study.optimize``.

    Returns
    -------
    The value returned by ``compute_amex_metric`` for the out-of-fold
    predictions.
    """
    sampled_params = dict(
        # general booster config
        max_bin = 2**trial.suggest_int("max_bin_exp", 6, 10) - 1,
        num_leaves = 2**trial.suggest_int("num_leaves_exp", 4, 8) - 1,
        num_iterations = trial.suggest_int("num_iterations", 1000, 3000, step=50),
        # regularization
        # suggest_float(step=...) / suggest_float(log=True) replace the
        # deprecated suggest_discrete_uniform / suggest_loguniform and define
        # the same search space under the same parameter names.
        feature_fraction = trial.suggest_float("feature_fraction", 0.1, 0.4, step=0.05),
        bagging_fraction = trial.suggest_float("bagging_fraction", 0.8, 1.0, step=0.05),
        lambda_l1 = trial.suggest_float("lambda_l1", 0., 100.),
        lambda_l2 = trial.suggest_float("lambda_l2", 0., 10.),
        min_data_in_leaf = trial.suggest_int("min_data_in_leaf", 500, 3000, step=100),
        path_smooth = trial.suggest_float("path_smooth", 0., 100.),
        min_gain_to_split = trial.suggest_float("min_gain_to_split", 1e-10, 1e0, log=True),
    )
    # Sampled values take precedence over the shared defaults on key clashes.
    model_params = {**default_params, **sampled_params}

    oof = train_models(train, skf_split, model_params)
    metric = compute_amex_metric(oof.target.values, oof.pred.values)
    return metric

In [None]:
# Set to False to only load and inspect the stored study without running trials.
do_optimize = True

# The study is persisted in a local SQLite file; with load_if_exists=True a
# re-run resumes the existing study instead of failing on the duplicate name.
study = optuna.create_study(
    storage='sqlite:///lgbm-bce-dsv04.db',
    study_name="lgbm-bce-dsv04",
    load_if_exists=True,
    direction='maximize',
)

if do_optimize:
    # Stop after 1000 trials or 3 days (in seconds), whichever comes first.
    study.optimize(
        objective,
        n_trials=1000,
        timeout=3 * 24 * 60 * 60,
        n_jobs=1,
        gc_after_trial=True,
    )

In [None]:
# Show the 20 trials with the highest objective value, best first.
study.trials_dataframe().sort_values("value", ascending=False).head(20)

In [None]:
# Optimization history of the study.
plot_optimization_history(study)

In [None]:
# Best-effort plot: computing parameter importances can fail (for example
# when the study has too few completed trials), and that should not stop the
# notebook. Catch Exception rather than using a bare `except`, so that
# KeyboardInterrupt still propagates, and report what went wrong.
try:
    importance_fig = plot_param_importances(study)
except Exception as err:
    importance_fig = None
    print(f"Skipping parameter-importance plot: {err!r}")

# An expression inside `try` is never rendered by Jupyter; the figure has to
# be the last expression of the cell. `None` renders nothing.
importance_fig

In [None]:
# Slice plot of the study's parameters.
plot_slice(study)

In [None]:
# EDF (empirical distribution function) plot of the study's objective values.
plot_edf(study)

In [None]:
# Parallel-coordinate plot of the study's parameters.
plot_parallel_coordinate(study)

In [None]:
# Start from a copy of the best trial's raw parameters.
best_params = dict(study.best_params)

# The study stores exponents for these two parameters; convert each back to
# the 2**k - 1 value that was actually passed to LightGBM.
for lgb_key, exp_key in (("max_bin", "max_bin_exp"), ("num_leaves", "num_leaves_exp")):
    best_params[lgb_key] = 2 ** best_params.pop(exp_key) - 1

# Layer the tuned values over the fixed defaults to get the full config.
best_params = {**default_params, **best_params}
best_params

In [None]:
# Persist the final parameter set as block-style YAML. The `with` block closes
# the file on exit, so no explicit close() is needed.
with open("../data/config/lgbm-bce-dsv04.yml", "w") as config_file:
    yaml.dump(best_params, config_file, default_flow_style=False)

***