## H2O Demonstration
Example of H2O - classification 


Sources:
* https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html#training

In [20]:
import h2o
from h2o.automl import H2OAutoML
import numpy as np
from sklearn.metrics import accuracy_score


In [4]:
# from sklearn.model_selection import RepeatedStratifiedKFold
from multi_train_H2O import MultiTrain
import pandas as pd



In [6]:
# Start the H2O cluster (locally)
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.16" 2022-07-19; OpenJDK Runtime Environment (build 11.0.16+8-post-Ubuntu-0ubuntu120.04); OpenJDK 64-Bit Server VM (build 11.0.16+8-post-Ubuntu-0ubuntu120.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.8/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpbk2ddbbs
  JVM stdout: /tmp/tmpbk2ddbbs/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpbk2ddbbs/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,09 secs
H2O_cluster_timezone:,Europe/Nicosia
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.5
H2O_cluster_version_age:,6 hours and 27 minutes
H2O_cluster_name:,H2O_from_python_unknownUser_di4vx0
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.918 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


In [46]:
# Import train/test set into H2O
X_train = h2o.import_file("./data/heart/X_train.csv")
y_train = h2o.import_file("./data/heart/y_train.csv")

# Merge X_train and y_train and drop C1 column which respresented the Passenger ID
training_frame = X_train.merge(y_train).drop(["C1"], axis=1)

X_test = h2o.import_file("./data/heart/X_test.csv")
y_test = h2o.import_file("./data/heart/y_test.csv")

test_frame = X_test.merge(y_test).drop(["C1"], axis=1)


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [57]:
print(training_frame)
print("Train shape:", training_frame.shape, "\nTest shape: ", test_frame.shape)

Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,F,M,ASY,ATA,NAP,TA,LVH,Normal,ST,N,Y,Down,Flat,Up,HeartDisease
40,140,289,0,172,0.0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0
49,160,180,0,156,1.0,1,0,0,0,1,0,0,1,0,1,0,0,1,0,1
37,130,283,0,98,0.0,0,1,0,1,0,0,0,0,1,1,0,0,0,1,0
48,138,214,0,108,1.5,1,0,1,0,0,0,0,1,0,0,1,0,1,0,1
54,150,195,0,122,0.0,0,1,0,0,1,0,0,1,0,1,0,0,0,1,0
45,130,237,0,170,0.0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0
54,110,208,0,142,0.0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0
37,140,207,0,130,1.5,0,1,1,0,0,0,0,1,0,0,1,0,1,0,1
48,120,284,0,120,0.0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0
37,130,211,0,142,0.0,1,0,0,0,1,0,0,1,0,1,0,0,0,1,0



Train shape: (734, 21) 
Test shape:  (184, 21)


In [58]:
# Identify predictors (A list/vector of column names to be used in prediction) and response (target column)
predictors = training_frame.columns
response = "HeartDisease"
predictors.remove(response)
# For binary classification, response should be a factor
training_frame[response] = training_frame[response].asfactor()
test_frame[response] = test_frame[response].asfactor()


In [66]:
%%time
# Run AutoML for 5 base models
automl = H2OAutoML(max_models=5, seed=1,verbosity='info')
automl.train(x=predictors, y=response, training_frame=training_frame)


AutoML progress: |
21:02:00.995: Project: AutoML_12_20220915_210200
21:02:00.995: 5-fold cross-validation will be used.
21:02:00.995: Setting stopping tolerance adaptively based on the training frame: 0.03691067352627811
21:02:00.995: Build control seed: 1
21:02:00.996: training frame: Frame key: AutoML_12_20220915_210200_training_py_22_sid_8af8    cols: 21    rows: 734  chunks: 229    size: 455747  checksum: -5620469922269017946
21:02:00.996: validation frame: NULL
21:02:00.996: leaderboard frame: NULL
21:02:00.996: blending frame: NULL
21:02:00.996: response column: HeartDisease
21:02:00.996: fold column: null
21:02:00.996: weights column: null
21:02:00.998: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (7g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF : [def_1 (2g, 10w), XRT (3g, 10w)]}, {GBM : [def_5 (1g, 10w), def_2 (2g, 10w), def_3 (2g, 10w), def_4 (2g, 10w), def_1 (3g, 10w), grid_1 (4g, 60w), lr_annealing (7g, 10w

Unnamed: 0,Unnamed: 1,number_of_trees
0,,35.0




ModelMetricsBinomial: xgboost
** Reported on train data. **

MSE: 0.0844610733248304
RMSE: 0.29062187344525603
LogLoss: 0.2821654556197648
Mean Per-Class Error: 0.11250776961500153
AUC: 0.9499150022840797
AUCPR: 0.95220564996506
Gini: 0.8998300045681593

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.40496331453323364: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,278.0,55.0,0.1652,(55.0/333.0)
1,1,24.0,377.0,0.0599,(24.0/401.0)
2,Total,302.0,432.0,0.1076,(79.0/734.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.404963,0.905162,236.0
1,max f2,0.348346,0.931707,250.0
2,max f0point5,0.708307,0.912329,174.0
3,max accuracy,0.488515,0.892371,223.0
4,max precision,0.993526,1.0,0.0
5,max recall,0.084268,1.0,334.0
6,max specificity,0.993526,1.0,0.0
7,max absolute_mcc,0.404963,0.784047,236.0
8,max min_per_class_accuracy,0.618406,0.882883,202.0
9,max mean_per_class_accuracy,0.488515,0.889529,223.0



Gains/Lift Table: Avg response rate: 54.63 %, avg score: 54.37 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010899,0.98905,1.830424,1.830424,1.0,0.990966,1.0,0.990966,0.01995,0.01995,83.042394,83.042394,0.01995
1,2,0.020436,0.986465,1.830424,1.830424,1.0,0.987655,1.0,0.989421,0.017456,0.037406,83.042394,83.042394,0.037406
2,3,0.029973,0.983302,1.830424,1.830424,1.0,0.985055,1.0,0.988032,0.017456,0.054863,83.042394,83.042394,0.054863
3,4,0.040872,0.981949,1.830424,1.830424,1.0,0.982719,1.0,0.986615,0.01995,0.074813,83.042394,83.042394,0.074813
4,5,0.050409,0.980975,1.830424,1.830424,1.0,0.981369,1.0,0.985623,0.017456,0.092269,83.042394,83.042394,0.092269
5,6,0.100817,0.974218,1.780953,1.805688,0.972973,0.977775,0.986486,0.981699,0.089776,0.182045,78.095302,80.568848,0.179042
6,7,0.149864,0.966075,1.830424,1.813784,1.0,0.970353,0.990909,0.977986,0.089776,0.27182,83.042394,81.378372,0.268817
7,8,0.200272,0.952019,1.731482,1.793068,0.945946,0.959077,0.979592,0.973226,0.087282,0.359102,73.148211,79.306835,0.350093
8,9,0.299728,0.913884,1.705052,1.763863,0.931507,0.93307,0.963636,0.959902,0.169576,0.528678,70.505244,76.386307,0.504654
9,10,0.400545,0.82127,1.657276,1.737035,0.905405,0.873323,0.94898,0.93811,0.167082,0.695761,65.727573,73.703496,0.650716




ModelMetricsBinomial: xgboost
** Reported on cross-validation data. **

MSE: 0.10375640224731326
RMSE: 0.3221124062300508
LogLoss: 0.3464650833118794
Mean Per-Class Error: 0.13327417192753852
AUC: 0.9215474826447396
AUCPR: 0.9062023133595756
Gini: 0.8430949652894792

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.39142730832099915: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,265.0,68.0,0.2042,(68.0/333.0)
1,1,25.0,376.0,0.0623,(25.0/401.0)
2,Total,290.0,444.0,0.1267,(93.0/734.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.391427,0.889941,243.0
1,max f2,0.35593,0.924309,252.0
2,max f0point5,0.672414,0.877702,183.0
3,max accuracy,0.443684,0.873297,229.0
4,max precision,0.993473,1.0,0.0
5,max recall,0.034822,1.0,372.0
6,max specificity,0.993473,1.0,0.0
7,max absolute_mcc,0.391427,0.746922,243.0
8,max min_per_class_accuracy,0.617644,0.84985,196.0
9,max mean_per_class_accuracy,0.443684,0.869017,229.0



Gains/Lift Table: Avg response rate: 54.63 %, avg score: 54.64 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010899,0.983091,1.601621,1.601621,0.875,0.986628,0.875,0.986628,0.017456,0.017456,60.162095,60.162095,0.014453
1,2,0.020436,0.978529,1.568935,1.586367,0.857143,0.980037,0.866667,0.983552,0.014963,0.032419,56.893481,58.636741,0.026413
2,3,0.029973,0.97646,1.830424,1.664022,1.0,0.977908,0.909091,0.981756,0.017456,0.049875,83.042394,66.402176,0.043869
3,4,0.040872,0.97385,1.601621,1.647382,0.875,0.975323,0.9,0.980041,0.017456,0.067332,60.162095,64.738155,0.058323
4,5,0.050409,0.972951,1.568935,1.63254,0.857143,0.973318,0.891892,0.978769,0.014963,0.082294,56.893481,63.254027,0.070282
5,6,0.100817,0.962612,1.830424,1.731482,1.0,0.968011,0.945946,0.97339,0.092269,0.174564,83.042394,73.148211,0.162552
6,7,0.149864,0.95039,1.779579,1.747223,0.972222,0.956867,0.954545,0.967982,0.087282,0.261845,77.957883,74.722285,0.24683
7,8,0.200272,0.934381,1.731482,1.743261,0.945946,0.941541,0.952381,0.961327,0.087282,0.349127,73.148211,74.32609,0.328106
8,9,0.299728,0.89506,1.654904,1.713942,0.90411,0.915193,0.936364,0.946019,0.164589,0.513716,65.490384,71.394242,0.471674
9,10,0.400545,0.812208,1.558334,1.674776,0.851351,0.859201,0.914966,0.924167,0.157107,0.670823,55.833389,67.477565,0.595748




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.880048,0.031049,0.918367,0.877551,0.870748,0.897959,0.835616
1,auc,0.92349,0.017236,0.941131,0.920127,0.928651,0.931801,0.895739
2,err,0.119952,0.031049,0.081633,0.122449,0.129252,0.102041,0.164384
3,err_count,17.6,4.505552,12.0,18.0,19.0,15.0,24.0
4,f0point5,0.866943,0.040295,0.922747,0.87886,0.853365,0.868056,0.811688
5,f1,0.895899,0.02758,0.934783,0.891566,0.881988,0.909091,0.862069
6,f2,0.927539,0.02187,0.947137,0.904645,0.912596,0.954198,0.919118
7,lift_top_group,1.650493,0.418011,1.633333,1.814815,1.934211,1.934211,0.935897
8,logloss,0.346566,0.048051,0.296845,0.360145,0.340935,0.314186,0.42072
9,max_per_class_error,0.202041,0.064239,0.140351,0.166667,0.197183,0.197183,0.308824



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2022-09-15 21:02:06,5.326 sec,0.0,0.5,0.693147,0.5,0.546322,1.0,0.453678
1,,2022-09-15 21:02:06,5.533 sec,5.0,0.339652,0.388141,0.930703,0.932509,1.830424,0.134877
2,,2022-09-15 21:02:06,5.610 sec,10.0,0.314507,0.331334,0.937873,0.939859,1.830424,0.126703
3,,2022-09-15 21:02:06,5.702 sec,15.0,0.30427,0.309867,0.943377,0.945338,1.830424,0.115804
4,,2022-09-15 21:02:06,5.845 sec,20.0,0.299175,0.298126,0.945609,0.947312,1.830424,0.113079
5,,2022-09-15 21:02:06,5.973 sec,25.0,0.29544,0.291285,0.947747,0.946397,1.830424,0.113079
6,,2022-09-15 21:02:07,6.106 sec,30.0,0.293306,0.286407,0.949106,0.950361,1.830424,0.113079
7,,2022-09-15 21:02:07,6.287 sec,35.0,0.290622,0.282165,0.949915,0.952206,1.830424,0.107629



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,Up,307.069916,1.0,0.365823
1,N,124.096863,0.404132,0.147841
2,ASY,105.26989,0.342821,0.125412
3,Cholesterol,71.912399,0.234189,0.085672
4,Oldpeak,57.795395,0.188216,0.068854
5,Age,40.742313,0.132681,0.048538
6,F,32.721039,0.106559,0.038982
7,MaxHR,32.234653,0.104975,0.038402
8,Flat,21.671957,0.070577,0.025819
9,FastingBS,17.57732,0.057242,0.02094




In [88]:
automl.leaderboard

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
XGBoost_1_AutoML_12_20220915_210200,0.921547,0.346465,0.906202,0.133274,0.322112,0.103756




In [87]:
preds = automl.predict(test_frame)
y_true = np.array(y_test.as_data_frame()['HeartDisease'])
y_pred = np.array(preds.as_data_frame()['predict'])
accuracy_score(y_true, y_pred)

0.5054347826086957

In [89]:
train_preds = automl.predict(training_frame)
train_y_true = np.array(y_train.as_data_frame()['HeartDisease'])
train_y_pred = np.array(train_preds.as_data_frame()['predict'])
accuracy_score(train_y_true, train_y_pred)

xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%


0.494550408719346