<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [10]</a>'.</span>

In [1]:
import gc
from glob import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

import h2o
from h2o.estimators import H2OGradientBoostingEstimator
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

import optuna
from optuna.visualization import (
    plot_contour
    , plot_edf
    , plot_intermediate_values
    , plot_optimization_history
    , plot_parallel_coordinate
    , plot_param_importances
    , plot_slice
)

import sys
sys.path.append("../utils")
from metrics import compute_recall_at4, compute_normalized_gini, compute_amex_metric

# Seed NumPy's global RNG and lift the column cap when frames are displayed.
np.random.seed(2112)
pd.set_option('display.max_columns', None)

# Attach to (or start) the H2O instance on port 54321 with nthreads=-1, then
# delete every object currently held by that cluster so the run starts clean.
# NOTE(review): remove_all() also wipes objects created by any other session
# attached to the same cluster — confirm this instance is not shared.
H2O_server = h2o.init(port=54321, nthreads=-1)
h2o.remove_all()

Checking whether there is an H2O instance running at http://localhost:54321 .

 connected.


0,1
H2O_cluster_uptime:,5 hours 27 mins
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.4
H2O_cluster_version_age:,15 days
H2O_cluster_name:,H2O_from_python_unknownUser_bsjrp1
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,9.81 Gb
H2O_cluster_total_cores:,20
H2O_cluster_allowed_cores:,20


***
## load and prepare data

In [2]:
!ls ../data/processed/dsv06

test.parquet  train.parquet


In [3]:
# Feature table for the dsv06 dataset version.
train = pd.read_parquet("../data/processed/dsv06/train.parquet")
# Labels are indexed by customer_ID so they can be joined on the index.
train_labels = pd.read_csv("../data/raw/train_labels.csv", index_col="customer_ID")

In [4]:
# Snapshot the column names before the label column is merged into `train`,
# so this list contains model inputs only.
input_feats = train.columns.tolist()
len(input_feats)

1308

In [5]:
# Attach the label to every feature row by matching indexes, then replace the
# index with a plain 0..n-1 range.
train = train.merge(
    train_labels,
    how="inner",
    left_index=True,
    right_index=True,
).reset_index(drop=True)

# The labels now live inside `train`; release the standalone copy.
del train_labels
gc.collect()

12

***
## model tuning


In [6]:
%%time
# split-data
skf = StratifiedKFold(n_splits=2, random_state=2112, shuffle=True)
skf_split = list(skf.split(train, train["target"].values))

for train_idx,valid_idx in skf_split: 
    # train
    train_df = train.loc[train_idx,:].copy()
    train_df["target"] = train_df["target"].astype(str)
    train_dset = h2o.H2OFrame(
        train_df,
        column_types = {"target":"categorical"},
    )
    # valid
    valid_df = train.loc[valid_idx,:].copy()
    valid_df["target"] = valid_df["target"].astype(str)
    valid_dset = h2o.H2OFrame(
        valid_df,
        column_types = {"target":"categorical"},
    )
    break
    
print(len(train_df))
print(len(valid_df))

Parse progress: |

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█| (done) 100%


Parse progress: |

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

██

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█| (done) 100%


229456
229457
CPU times: user 10min 39s, sys: 32.4 s, total: 11min 12s
Wall time: 11min 51s


In [7]:
# Fixed GBM settings shared by every trial; sampled values are layered on top.
default_params = dict(
    distribution="bernoulli",
    learn_rate=0.05,
    max_depth=5,
    nbins=256,
    seed=42,
)

In [8]:
def train_models(model_params: dict) -> pd.DataFrame:
    """Fit one H2O GBM on the training frame and score the validation frame.

    Relies on notebook-level objects: ``input_feats`` (predictor names),
    ``train_dset`` / ``valid_dset`` (H2O frames) and ``valid_df`` (the pandas
    counterpart of ``valid_dset``).

    Parameters
    ----------
    model_params : dict
        Keyword arguments forwarded to ``H2OGradientBoostingEstimator``.

    Returns
    -------
    pd.DataFrame
        Indexed like ``valid_df`` with two columns: ``target`` (label cast to
        int) and ``pred`` (the ``p1`` column of the H2O prediction frame).
    """
    model = H2OGradientBoostingEstimator(**model_params)
    model.train(
        x=input_feats,
        y="target",
        training_frame=train_dset,
    )

    try:
        pred_frame = model.predict(valid_dset)
        try:
            pred = pred_frame.as_data_frame()["p1"].values.flatten()
        finally:
            # The prediction frame is stored in the H2O cluster; remove it
            # explicitly so it does not accumulate across trials.
            h2o.remove(pred_frame)
    finally:
        # `del` only drops the Python handle. The model itself is stored in
        # the H2O cluster and has to be removed there, otherwise every trial
        # leaves a model behind in cluster memory.
        h2o.remove(model)
        del model
        gc.collect()

    # dataframe to store the oof predictions; predictions are assigned directly
    # so the column is float from the start (no int placeholder to upcast).
    oof = valid_df[["target"]].copy()
    oof["target"] = oof["target"].astype(int)
    oof["pred"] = pred

    return oof

In [9]:
def objective(trial):
    """Optuna objective: sample GBM hyperparameters, train once, score the hold-out.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters below.

    Returns
    -------
    The value of ``compute_amex_metric`` on the hold-out predictions returned
    by ``train_models``.
    """
    # `suggest_float(step=...)` / `suggest_float(log=True)` replace the
    # deprecated `suggest_discrete_uniform` / `suggest_loguniform`; the ranges
    # and steps are unchanged. `step` is passed by keyword to `suggest_int`.
    sampled_params = {
        # regularization
        "min_rows": trial.suggest_int("min_rows", 100, 3000, step=100),
        "col_sample_rate": trial.suggest_float("col_sample_rate", 0.1, 0.3, step=0.05),
        "sample_rate": trial.suggest_float("sample_rate", 0.8, 1.0, step=0.05),
        "max_abs_leafnode_pred": trial.suggest_float("max_abs_leafnode_pred", 1e-2, 1e2, log=True),
        "min_split_improvement": trial.suggest_float("min_split_improvement", 1e-10, 1e-3, log=True),
        "pred_noise_bandwidth": trial.suggest_float("pred_noise_bandwidth", 0., 1.),
        # general parameters
        "ntrees": trial.suggest_int("ntrees", 1000, 5000, step=50),
    }
    # Sampled values take precedence over the fixed defaults on key clashes.
    model_params = {**default_params, **sampled_params}

    oof = train_models(model_params)
    metric = compute_amex_metric(oof.target.values, oof.pred.values)
    return metric

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [10]:
# Switch: set to False to only load the stored study without running new trials.
do_optimize = True

# The study is kept in a local SQLite file; an existing study with this name
# is reused instead of being recreated.
study = optuna.create_study(
    direction="maximize",
    study_name="h2ogbm-dsv06",
    storage="sqlite:///h2ogbm-dsv06.db",
    load_if_exists=True,
)

optimize_kwargs = dict(
    n_trials=1000,
    timeout=216000,  # seconds, i.e. 2.5 days
    n_jobs=1,
    gc_after_trial=True,
)

if do_optimize:
    study.optimize(objective, **optimize_kwargs)

[32m[I 2022-08-19 12:19:41,080][0m Using an existing study with name 'h2ogbm-dsv06' instead of creating a new one.[0m


gbm Model Build progress: |

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█

█| (done) 100%
gbm prediction progress: |

[33m[W 2022-08-19 12:48:21,616][0m Trial 4 failed because of the following error: OSError("Job with key $03017f00000132d4ffffffff$_9672b7f25cd8d6716a00df7c47374e87 failed with an exception: java.lang.IllegalArgumentException: Test/Validation dataset has categorical column 'D_50_diff_last-mean' which is real-valued in the training data\nstacktrace: \njava.lang.IllegalArgumentException: Test/Validation dataset has categorical column 'D_50_diff_last-mean' which is real-valued in the training data\n\tat hex.Model.adaptTestForTrain(Model.java:1768)\n\tat hex.Model.adaptTestForTrain(Model.java:1584)\n\tat hex.Model.adaptTestForTrain(Model.java:1580)\n\tat hex.Model.adaptFrameForScore(Model.java:1911)\n\tat hex.Model.score(Model.java:1929)\n\tat water.api.ModelMetricsHandler$1.compute2(ModelMetricsHandler.java:491)\n\tat water.H2O$H2OCountedCompleter.compute(H2O.java:1677)\n\tat jsr166y.CountedCompleter.exec(CountedCompleter.java:468)\n\tat jsr166y.ForkJoinTask.doExec(ForkJoinTask.java:263)

 (failed)


OSError: Job with key $03017f00000132d4ffffffff$_9672b7f25cd8d6716a00df7c47374e87 failed with an exception: java.lang.IllegalArgumentException: Test/Validation dataset has categorical column 'D_50_diff_last-mean' which is real-valued in the training data
stacktrace: 
java.lang.IllegalArgumentException: Test/Validation dataset has categorical column 'D_50_diff_last-mean' which is real-valued in the training data
	at hex.Model.adaptTestForTrain(Model.java:1768)
	at hex.Model.adaptTestForTrain(Model.java:1584)
	at hex.Model.adaptTestForTrain(Model.java:1580)
	at hex.Model.adaptFrameForScore(Model.java:1911)
	at hex.Model.score(Model.java:1929)
	at water.api.ModelMetricsHandler$1.compute2(ModelMetricsHandler.java:491)
	at water.H2O$H2OCountedCompleter.compute(H2O.java:1677)
	at jsr166y.CountedCompleter.exec(CountedCompleter.java:468)
	at jsr166y.ForkJoinTask.doExec(ForkJoinTask.java:263)
	at jsr166y.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:976)
	at jsr166y.ForkJoinPool.runWorker(ForkJoinPool.java:1479)
	at jsr166y.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:104)


In [None]:
# The 20 trials with the highest objective value, best first.
study.trials_dataframe().sort_values("value", ascending=False).head(20)

In [None]:
# Objective value of each trial over the course of the study.
plot_optimization_history(study)

In [None]:
# Estimated importance of each hyperparameter for the objective value.
plot_param_importances(study)

In [None]:
# Objective value plotted against each hyperparameter separately.
plot_slice(study)

In [None]:
# Empirical distribution function of the objective values in the study.
plot_edf(study)

In [None]:
# Parallel-coordinate view of hyperparameter combinations and their objective values.
plot_parallel_coordinate(study)

In [None]:
# Full parameter set of the best trial: fixed defaults, overridden by the
# tuned values wherever both define the same key.
best_params = {**default_params, **study.best_params}
best_params

***