In [113]:
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt
import h2o

In [114]:
# Load the raw table and prepare it for upload to H2O.
data = pd.read_csv("data/adult.csv")

# Binary target: 1 when income is ">50K", 0 otherwise.  A vectorised comparison
# replaces the row-wise apply(), which invoked a Python lambda once per row.
data['income'] = (data['income'] == ">50K").astype(int)

# Drop without inplace=True so the cell builds its result by plain reassignment.
# NOTE(review): 'education' is presumably redundant with 'educational-num' - confirm.
data = data.drop(columns=['fnlwgt', 'education'])

# Both columns are stored as strings from here on.
data = data.astype({'educational-num': str, 'income': str})
data.head()

Unnamed: 0,age,workclass,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,18,?,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,0


In [127]:
from bayes_opt import BayesianOptimization
# Attach to the local H2O cluster (the recorded output shows it connecting to an
# instance that was already running at http://localhost:54321).
h2o.init()
# WARNING: remove_all() deletes every object stored on the cluster, not only the
# ones created by this notebook.  Because init() may attach to a long-running,
# possibly shared instance, make sure nothing else depends on that cluster.
h2o.remove_all()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,18 hours 51 mins
H2O cluster timezone:,America/Santiago
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.3
H2O cluster version age:,27 days
H2O cluster name:,H2O_from_python_maravenag_96enet
H2O cluster total nodes:,1
H2O cluster free memory:,3.228 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [128]:
# Upload the pandas frame to the cluster under a fixed key so that the cleanup in
# train_model can recognise it and leave it alone.
frame = h2o.H2OFrame(data, destination_frame="train_frame")

# The target must be categorical for the GBM to treat this as classification.
frame['income'] = frame['income'].asfactor()
# NOTE(review): 'educational-num' was cast to str in pandas, but H2O may infer it
# as numeric again on upload - check frame.types and call .asfactor() on it if a
# categorical feature is intended.

# Split 90/10.  A fixed seed makes the split reproducible across kernel restarts;
# without it every run trains and evaluates on different rows.
train, test = frame.split_frame(
    ratios=[0.9],
    destination_frames=['train', 'test'],
    seed=1,
)

In [129]:
# Predictor columns: every column of the training frame except the 'income' target.
train_cols = list(filter(lambda col_name: col_name != 'income', train.col_names))

In [130]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [207]:
import os

In [211]:
def train_model(max_depth, 
                ntrees,
                min_rows, 
                learn_rate, 
                sample_rate, 
                col_sample_rate):
    """Objective function for the optimiser: cross-validated PR-AUC of an H2O GBM.

    Trains a gradient boosting model with 5-fold cross-validation on the
    module-level ``train`` frame (predictors ``train_cols``, target ``income``)
    and returns the PR-AUC taken from the cross-validation metrics.

    Parameters
    ----------
    max_depth, ntrees, min_rows : float
        Truncated to integers with ``int()`` before being passed to the estimator
        (the optimiser proposes floats).
    learn_rate, sample_rate, col_sample_rate : float
        Passed to the estimator unchanged.

    Returns
    -------
    The value of ``model_performance(xval=True).pr_auc()`` for the trained model.

    Side effects
    ------------
    * When the metric is at least as good as the module-level ``best`` (or ``best``
      has not been defined yet), the model is saved under ``<cwd>/models`` with the
      fixed id ``best_model`` (overwriting any previous save) and ``best`` is updated.
    * Every key on the H2O cluster except ``train_frame``, ``train`` and ``test`` is
      removed before returning, including when training fails or is interrupted,
      so repeated evaluations do not accumulate models on the cluster.
    """
    global best

    params = {
        'max_depth': int(max_depth),
        'ntrees': int(ntrees),
        'min_rows': int(min_rows),
        'learn_rate': learn_rate,
        'sample_rate': sample_rate,
        'col_sample_rate': col_sample_rate
    }
    # Keys that must survive the per-evaluation cleanup.
    protected_keys = ("train_frame", "train", "test")

    try:
        model = H2OGradientBoostingEstimator(nfolds=5, model_id="best_model", **params)
        model.train(x=train_cols, y='income', training_frame=train)
        metric = model.model_performance(xval=True).pr_auc()

        # Tolerate a missing global so the first call works even if the cell
        # that initialises `best` has not been run.
        previous_best = globals().get('best')
        if previous_best is None or metric >= previous_best:
            h2o.save_model(model=model, path=os.path.join(os.getcwd(), "models"), force=True)
            best = metric
        return metric
    finally:
        # Runs on success, on errors and on KeyboardInterrupt alike; the key list
        # is read here so that partially built artefacts are removed too.
        for key in list(h2o.ls()['key']):
            if key not in protected_keys:
                h2o.remove(key)

In [212]:
# Search space handed to the optimiser as (lower, upper) per hyper-parameter.
# max_depth, ntrees and min_rows are truncated with int() inside train_model.
bounds = dict(
    max_depth=(5, 10),
    ntrees=(100, 200),
    min_rows=(10, 30),
    learn_rate=(0.001, 0.01),
    sample_rate=(0.5, 0.8),
    col_sample_rate=(0.5, 0.8),
)

In [205]:
# Bayesian optimiser that maximises train_model over the box given by `bounds`;
# random_state fixes the optimiser's own sampling.
optimizer = BayesianOptimization(f=train_model, pbounds=bounds, random_state=1)

In [206]:
# Reset the running best score that train_model compares against, then launch
# the search.  Keep both statements together: a stale `best` left over from an
# earlier run would prevent new models from being saved.
best = 0
optimizer.maximize(
    n_iter=20,
    init_points=10,
)

|   iter    |  target   | col_sa... | learn_... | max_depth | min_rows  |  ntrees   | sample... |
-------------------------------------------------------------------------------------------------
|  1        |  0.7678   |  0.6251   |  0.007483 |  5.001    |  16.05    |  11.47    |  0.5277   |
|  2        |  0.7705   |  0.5559   |  0.00411  |  6.984    |  20.78    |  14.19    |  0.7056   |
|  3        |  0.7736   |  0.5613   |  0.008903 |  5.137    |  23.41    |  14.17    |  0.6676   |
|  4        |  0.7781   |  0.5421   |  0.002783 |  9.004    |  29.37    |  13.13    |  0.7077   |
|  5        |  0.7664   |  0.7629   |  0.009051 |  5.425    |  10.78    |  11.7     |  0.7634   |
|  6        |  0.7868   |  0.5295   |  0.00479  |  9.789    |  20.66    |  16.92    |  0.5947   |
|  7        |  0.7645   |  0.706    |  0.008512 |  5.091    |  25.0     |  19.89    |  0.7244   |
|  8        |  0.7745   |  0.5841   |  0.008104 |  5.516    |  18.96    |  19.09    |  0.5881   |
|  9        |  0.756

KeyboardInterrupt: 