<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [10]</a>'.</span>

In [1]:
import gc
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

import category_encoders as ce
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

import optuna
from optuna.visualization import (
    plot_contour
    , plot_edf
    , plot_intermediate_values
    , plot_optimization_history
    , plot_parallel_coordinate
    , plot_param_importances
    , plot_slice
)

import sys
sys.path.append("../utils")
from metrics import compute_recall_at4, compute_normalized_gini, compute_amex_metric

np.random.seed(2112)
pd.set_option('display.max_columns', None)

***
## load and prepare data

In [2]:
!ls ../data/processed/dsv02

test.parquet  train.parquet


In [3]:
train = pd.read_parquet("../data/processed/dsv02/train.parquet")
train_labels = pd.read_csv("../data/raw/train_labels.csv", index_col="customer_ID")

In [4]:
input_feats = train.columns.tolist()
categ_feats = [
    'B_30_first', 'B_38_first', 'D_114_first', 'D_116_first', 'D_117_first', 
    'D_120_first', 'D_126_first', 'D_63_first', 'D_64_first', 'D_66_first', 'D_68_first',
    'B_30_last', 'B_38_last', 'D_114_last', 'D_116_last', 'D_117_last', 
    'D_120_last', 'D_126_last', 'D_63_last', 'D_64_last', 'D_66_last', 'D_68_last',
]
len(input_feats)

1562

In [5]:
train = pd.merge(train, train_labels, how="inner", left_index=True, right_index=True)
train = train.reset_index()

del train_labels
gc.collect()

0

***
## model tuning


In [6]:
skf = StratifiedKFold(n_splits=3, random_state=2112, shuffle=True)
skf_split = list(skf.split(train, train["target"].values))

In [7]:
default_params = {
    # general parameters
    'booster':'gblinear',
    'objective':'binary:logistic',
    'seed':2112,
    'eta': 0.05,
    'updater': 'coord_descent',
    'feature_selector' : 'thrifty',
}

In [8]:
def train_models(dataframe: pd.DataFrame, split: list, model_params: dict) -> pd.DataFrame:
        
    # dataframe to store the oof predictions
    oof = dataframe[["target"]].copy()
    oof["pred"] = -1
    MAX_ITERATIONS = model_params.pop("iterations")

    for train_idx,valid_idx in split:
        
        train_df = dataframe.loc[train_idx,:]
        valid_df = dataframe.loc[valid_idx,:]
                
        encoder = ce.glmm.GLMMEncoder()
        encoder.fit(train_df[categ_feats], train_df["target"].values)
        train_df[categ_feats] = encoder.transform(train_df[categ_feats])
        valid_df[categ_feats] = encoder.transform(valid_df[categ_feats])
        
        scaler = StandardScaler(with_mean=True, with_std=True)
        scaler.fit(train_df[input_feats].values)
        train_df[input_feats] = scaler.transform(train_df[input_feats].values)
        valid_df[input_feats] = scaler.transform(valid_df[input_feats].values)
                        
        train_dset = xgb.DMatrix(
            data=train_df.loc[:,input_feats],
            label=train_df.loc[:,"target"].values,
        )
        valid_dset = xgb.DMatrix(
            data=valid_df.loc[:,input_feats],
            label=valid_df.loc[:,"target"].values,
        )
        
        model = xgb.train(
            params = model_params,
            dtrain=train_dset,
            num_boost_round=MAX_ITERATIONS,
        )        
        oof.loc[valid_df.index,"pred"] = model.predict(valid_dset)
        
        del train_df,valid_df,train_dset,valid_dset,model,encoder,scaler
        gc.collect()
    
    return oof

In [9]:
def objective(trial):
    sampled_params = {
        # general booster config
        "iterations" : trial.suggest_int("iterations", 500, 2000, 100),
        # regularization
        "alpha" : trial.suggest_float("alpha", 0., 10.),
        "lambda" : trial.suggest_float("lambda", 0., 10.),
        "top_k" : trial.suggest_int("top_k", 100, 1000, 25),
    }
    
    model_params = {**default_params, **sampled_params}
    
    oof = train_models(train, skf_split, model_params)
    metric = compute_amex_metric(oof.target.values, oof.pred.values)
    return metric

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [10]:
do_optimize = True

study = optuna.create_study(
    study_name="xgboost-gblinear-dsv02",
    direction='maximize',
    storage='sqlite:///xgboost-gblinear-dsv02.db',
    load_if_exists=True,
)

if do_optimize:
    study.optimize(
        objective, 
        n_trials=1000, 
        timeout=28800, # 8-hours
        n_jobs=1, 
        gc_after_trial=True,
    ) 

[32m[I 2022-07-19 14:00:59,819][0m Using an existing study with name 'xgboost-gblinear-dsv02' instead of creating a new one.[0m
[33m[W 2022-07-19 16:09:14,484][0m Trial 1 failed because of the following error: KeyError('iterations')[0m
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/optuna/study/_optimize.py", line 213, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_1248/884711531.py", line 13, in objective
    oof = train_models(train, skf_split, model_params)
  File "/tmp/ipykernel_1248/1026321051.py", line 31, in train_models
    MAX_ITERATIONS = model_params.pop("iterations")
KeyError: 'iterations'


KeyError: 'iterations'

In [None]:
study.trials_dataframe().sort_values("value", ascending=False).head(20)

In [None]:
plot_optimization_history(study)

In [None]:
plot_param_importances(study)

In [None]:
plot_slice(study)

In [None]:
plot_edf(study)

In [None]:
plot_parallel_coordinate(study)

In [None]:
best_params = dict(study.best_params)
best_params

***