## H2O AutoML

In [3]:
# H2O client library plus the AutoML entry points used in the cells below.
import h2o
from h2o.automl import H2OAutoML
from h2o.automl import get_leaderboard

# Connect to an H2O instance; per the recorded output, when none is found
# at localhost:54321 a local server is started and connected to.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.6" 2020-01-14; OpenJDK Runtime Environment (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1); OpenJDK 64-Bit Server VM (build 11.0.6+10-post-Ubuntu-1ubuntu118.04.1, mixed mode, sharing)
  Starting server from /home/user7/.local/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp8fgk4ijb
  JVM stdout: /tmp/tmp8fgk4ijb/h2o_user7_started_from_python.out
  JVM stderr: /tmp/tmp8fgk4ijb/h2o_user7_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,Europe/Athens
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.2
H2O cluster version age:,10 days
H2O cluster name:,H2O_from_python_user7_8bmr70
H2O cluster total nodes:,1
H2O cluster free memory:,1.781 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [5]:
# Load the sample binary-outcome Higgs train/test CSVs into H2O frames.
train = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test = h2o.import_file("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")

# Name of the response column; every other training column is a predictor.
y = "response"
x = train.columns
x.remove(y)

# Binary classification needs a categorical (factor) response, so convert
# the response column in both frames.
for frame in (train, test):
    frame[y] = frame[y].asfactor()

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [None]:
# Show the Python type of each modelling input in a single labelled result.
# A notebook cell only echoes its final expression, so separate bare
# `type(...)` lines would silently discard every result except the last one.
{"x": type(x), "y": type(y), "train": type(train), "test": type(test)}

In [6]:
# Configure an AutoML run capped at 20 base models, with a fixed seed.
# (Original note: runtime is limited to 1 hour by default -- confirm this
# default against the installed H2O version.)
aml = H2OAutoML(seed=1, max_models=20)

# Fit on the training frame using predictors `x` and response column `y`.
aml.train(training_frame=train, y=y, x=x)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [8]:
# AutoML Leaderboard: fetch the leaderboard frame from the finished AutoML
# run and display it as the cell result (one row per model, with its
# model_id and evaluation metrics).
lb = aml.leaderboard
lb

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_AllModels_AutoML_20200131_132202,0.789034,0.552297,0.803844,0.31333,0.432568,0.187115
StackedEnsemble_BestOfFamily_AutoML_20200131_132202,0.788366,0.553096,0.803551,0.314169,0.432843,0.187353
XGBoost_grid__1_AutoML_20200131_132202_model_3,0.784783,0.558539,0.802597,0.328578,0.435036,0.189256
XGBoost_grid__1_AutoML_20200131_132202_model_1,0.784653,0.5581,0.802569,0.319823,0.434987,0.189214
XGBoost_3_AutoML_20200131_132202,0.783759,0.558382,0.80096,0.322163,0.435252,0.189445
XGBoost_grid__1_AutoML_20200131_132202_model_4,0.783664,0.557696,0.802604,0.33427,0.43517,0.189373
GBM_5_AutoML_20200131_132202,0.78219,0.558353,0.800234,0.319658,0.435512,0.18967
XGBoost_2_AutoML_20200131_132202,0.782155,0.557366,0.801728,0.337128,0.435331,0.189514
XGBoost_1_AutoML_20200131_132202,0.780395,0.5593,0.800388,0.322149,0.436235,0.190301
GBM_2_AutoML_20200131_132202,0.777673,0.562514,0.796181,0.334056,0.437583,0.191479




In [9]:
# Optionally add extra model information to the leaderboard.
# With extra_columns='ALL' the recorded output gains the columns
# training_time_ms and predict_time_per_row_ms.
# Note: this rebinds `lb`, replacing the plain leaderboard assigned earlier.
lb = get_leaderboard(aml, extra_columns='ALL')
lb

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse,training_time_ms,predict_time_per_row_ms
StackedEnsemble_AllModels_AutoML_20200131_132202,0.789034,0.552297,0.803844,0.31333,0.432568,0.187115,1325,0.090722
StackedEnsemble_BestOfFamily_AutoML_20200131_132202,0.788366,0.553096,0.803551,0.314169,0.432843,0.187353,895,0.042382
XGBoost_grid__1_AutoML_20200131_132202_model_3,0.784783,0.558539,0.802597,0.328578,0.435036,0.189256,3208,0.004097
XGBoost_grid__1_AutoML_20200131_132202_model_1,0.784653,0.5581,0.802569,0.319823,0.434987,0.189214,6082,0.004136
XGBoost_3_AutoML_20200131_132202,0.783759,0.558382,0.80096,0.322163,0.435252,0.189445,3726,0.003931
XGBoost_grid__1_AutoML_20200131_132202_model_4,0.783664,0.557696,0.802604,0.33427,0.43517,0.189373,2993,0.003423
GBM_5_AutoML_20200131_132202,0.78219,0.558353,0.800234,0.319658,0.435512,0.18967,1132,0.006699
XGBoost_2_AutoML_20200131_132202,0.782155,0.557366,0.801728,0.337128,0.435331,0.189514,4262,0.005729
XGBoost_1_AutoML_20200131_132202,0.780395,0.5593,0.800388,0.322149,0.436235,0.190301,5700,0.005101
GBM_2_AutoML_20200131_132202,0.777673,0.562514,0.796181,0.334056,0.437583,0.191479,950,0.00624




In [10]:
# Print all rows (instead of default 10 rows) by requesting as many rows
# as the leaderboard frame contains.
lb.head(rows=lb.nrows)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse,training_time_ms,predict_time_per_row_ms
StackedEnsemble_AllModels_AutoML_20200131_132202,0.789034,0.552297,0.803844,0.31333,0.432568,0.187115,1325,0.090722
StackedEnsemble_BestOfFamily_AutoML_20200131_132202,0.788366,0.553096,0.803551,0.314169,0.432843,0.187353,895,0.042382
XGBoost_grid__1_AutoML_20200131_132202_model_3,0.784783,0.558539,0.802597,0.328578,0.435036,0.189256,3208,0.004097
XGBoost_grid__1_AutoML_20200131_132202_model_1,0.784653,0.5581,0.802569,0.319823,0.434987,0.189214,6082,0.004136
XGBoost_3_AutoML_20200131_132202,0.783759,0.558382,0.80096,0.322163,0.435252,0.189445,3726,0.003931
XGBoost_grid__1_AutoML_20200131_132202_model_4,0.783664,0.557696,0.802604,0.33427,0.43517,0.189373,2993,0.003423
GBM_5_AutoML_20200131_132202,0.78219,0.558353,0.800234,0.319658,0.435512,0.18967,1132,0.006699
XGBoost_2_AutoML_20200131_132202,0.782155,0.557366,0.801728,0.337128,0.435331,0.189514,4262,0.005729
XGBoost_1_AutoML_20200131_132202,0.780395,0.5593,0.800388,0.322149,0.436235,0.190301,5700,0.005101
GBM_2_AutoML_20200131_132202,0.777673,0.562514,0.796181,0.334056,0.437583,0.191479,950,0.00624


