<h1>H2O Python AutoML</h1>

In [1]:
import h2o
from h2o.automl import H2OAutoML

In [15]:
# Set up the H2O instance that every later cell talks to.
# NOTE(review): per the H2O docs, init() connects to a running local cluster
# or starts a new one if none is reachable - confirm for the installed version.
h2o.init()

In [3]:
# Load the UCI yeast dataset directly from its URL into an H2OFrame
yeast_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data'
data = h2o.import_file(path=yeast_url)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
# C1 is only a row identifier, so remove it
data = data.drop('C1', axis=1)

# C10 (the last column) is the response: turn it into a categorical column
response_col = 'C10'
data[response_col] = data[response_col].asfactor()

# Show the response levels, i.e. the 10 classes to be predicted
data[response_col].levels()

[['CYT', 'ERL', 'EXC', 'ME1', 'ME2', 'ME3', 'MIT', 'NUC', 'POX', 'VAC']]

In [5]:
# Take a look at the data: prints the frame's row/column counts, a per-column
# summary (type, min, mean, max, sigma, zeros, missing) and the first rows.
data.describe()

Rows:1484
Cols:9




Unnamed: 0,C2,C3,C4,C5,C6,C7,C8,C9,C10
type,real,real,real,real,real,real,real,real,enum
mins,0.11,0.13,0.21,0.0,0.5,0.0,0.0,0.0,
mean,0.5001212938005386,0.4999326145552561,0.5000336927223715,0.2611859838274927,0.5047169811320772,0.0075,0.4998854447439343,0.27619946091644226,
maxs,1.0,1.0,1.0,1.0,1.0,0.83,0.73,1.0,
sigma,0.13729930038958169,0.12392434900413829,0.08667024770783183,0.13709763089421423,0.04835096692671337,0.07568266520506624,0.057796586389259656,0.10649052826089457,
zeros,0,0,0,2,0,1469,1,1,
missing,0,0,0,0,0,0,0,0,0
0,0.58,0.61,0.47,0.13,0.5,0.0,0.48,0.22,MIT
1,0.43,0.67,0.48,0.27,0.5,0.0,0.53,0.22,MIT
2,0.64,0.62,0.49,0.15,0.5,0.0,0.53,0.22,MIT


In [6]:
# Split the frame into roughly 70% train / 30% test; the fixed seed keeps the
# row assignment repeatable across runs
splits = data.split_frame(ratios=[0.7], seed=1234)
train, test = splits

In [7]:
# The response is the last column; every column before it is a predictor
columns = data.col_names
data_X, data_y = columns[:-1], columns[-1]

In [9]:
# Run AutoML with a 60-second time budget.
# A seed is set so the stochastic parts of the search (CV fold assignment,
# model randomisation) start from a known state, matching the seeded split above.
# NOTE(review): per the H2O AutoML docs, a time-limited run is still not
# guaranteed to be exactly reproducible; use max_models for that.
aml = H2OAutoML(max_runtime_secs=60, seed=1234)

# Models are trained on `train`; the leaderboard is ranked on `test`.
aml.train(x=data_X, y=data_y, training_frame=train, leaderboard_frame=test)

AutoML progress: |████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [10]:
# View the AutoML leaderboard, then the details of the best model.
# A notebook only auto-displays the LAST expression of a cell, so the
# leaderboard has to be printed explicitly; as a bare expression on an earlier
# line it was evaluated and silently discarded.
print(aml.leaderboard)
aml.leader

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_grid_1_AutoML_20171114_150533_model_2


ModelMetricsMultinomial: gbm
** Reported on train data. **

MSE: 0.21010505795464443
RMSE: 0.4583721827888822
LogLoss: 0.6129617015862536
Mean Per-Class Error: 0.05796412621081877
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7,8,9,10,11
CYT,ERL,EXC,ME1,ME2,ME3,MIT,NUC,POX,VAC,Error,Rate
201.0,0.0,0.0,0.0,0.0,0.0,1.0,14.0,0.0,0.0,0.0694444,15 / 216
0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0 / 2
0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0 / 21
0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0 / 20
0.0,0.0,0.0,0.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0 / 23
0.0,0.0,0.0,0.0,0.0,80.0,0.0,1.0,0.0,0.0,0.0123457,1 / 81
9.0,0.0,0.0,0.0,0.0,2.0,107.0,4.0,0.0,0.0,0.1229508,15 / 122
19.0,0.0,0.0,0.0,0.0,0.0,3.0,206.0,0.0,0.0,0.0964912,22 / 228
1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0909091,1 / 11


Top-10 Hit Ratios: 


0,1
k,hit_ratio
1,0.922973
2,0.9891892
3,1.0
4,1.0
5,1.0
6,1.0
7,1.0
8,1.0
9,1.0



ModelMetricsMultinomial: gbm
** Reported on validation data. **

MSE: 0.43683559773096176
RMSE: 0.6609353960342582
LogLoss: 1.2643079794684668
Mean Per-Class Error: 0.37536689402382983
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7,8,9,10,11
CYT,ERL,EXC,ME1,ME2,ME3,MIT,NUC,POX,VAC,Error,Rate
65.0,0.0,0.0,0.0,0.0,2.0,12.0,28.0,0.0,0.0,0.3925234,42 / 107
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0 / 0
0.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.6,3 / 5
1.0,0.0,0.0,6.0,1.0,0.0,0.0,0.0,1.0,0.0,0.3333333,3 / 9
2.0,0.0,1.0,0.0,4.0,3.0,1.0,1.0,0.0,0.0,0.6666667,8 / 12
0.0,0.0,0.0,0.0,0.0,31.0,1.0,4.0,0.0,1.0,0.1621622,6 / 37
10.0,0.0,0.0,0.0,0.0,2.0,24.0,4.0,1.0,1.0,0.4285714,18 / 42
17.0,0.0,0.0,0.0,0.0,4.0,9.0,59.0,0.0,0.0,0.3370787,30 / 89
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0 / 0


Top-10 Hit Ratios: 


0,1
k,hit_ratio
1,0.6254072
2,0.8208469
3,0.9250814
4,0.9478827
5,0.9641694
6,0.9934853
7,0.9934853
8,0.9934853
9,0.9934853



ModelMetricsMultinomial: gbm
** Reported on cross-validation data. **

MSE: 0.428554248969978
RMSE: 0.6546405494391392
LogLoss: 1.2219202103827622
Mean Per-Class Error: 0.4812461371353657
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7,8,9,10,11
CYT,ERL,EXC,ME1,ME2,ME3,MIT,NUC,POX,VAC,Error,Rate
137.0,0.0,0.0,0.0,0.0,3.0,14.0,59.0,1.0,2.0,0.3657407,79 / 216
0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1 / 2
1.0,0.0,13.0,1.0,1.0,0.0,1.0,2.0,0.0,2.0,0.3809524,8 / 21
0.0,0.0,1.0,11.0,5.0,0.0,3.0,0.0,0.0,0.0,0.45,9 / 20
1.0,0.0,2.0,3.0,8.0,0.0,6.0,2.0,0.0,1.0,0.6521739,15 / 23
2.0,0.0,0.0,0.0,1.0,67.0,3.0,8.0,0.0,0.0,0.1728395,14 / 81
24.0,0.0,0.0,0.0,0.0,6.0,66.0,23.0,2.0,1.0,0.4590164,56 / 122
64.0,0.0,1.0,0.0,1.0,9.0,11.0,142.0,0.0,0.0,0.3771930,86 / 228
3.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,6.0,0.0,0.4545455,5 / 11


Top-10 Hit Ratios: 


0,1
k,hit_ratio
1,0.6094595
2,0.8229730
3,0.9256756
4,0.9621621
5,0.9810811
6,0.9864865
7,0.9905406
8,0.9932433
9,0.995946


Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.6094595,0.0240601,0.5540541,0.5945946,0.6554054,0.6148649,0.6283784
err,0.3905405,0.0240601,0.4459459,0.4054054,0.3445946,0.3851351,0.3716216
err_count,57.8,3.5608988,66.0,60.0,51.0,57.0,55.0
logloss,1.2219203,0.0543167,1.2767403,1.3425394,1.1299363,1.1818229,1.178562
max_per_class_error,1.0,0.0,1.0,1.0,1.0,1.0,1.0
mean_per_class_accuracy,0.5535638,0.0397353,0.5711603,0.4934147,0.4853073,0.5859129,0.6320237
mean_per_class_error,0.4464362,0.0397353,0.4288397,0.5065852,0.5146927,0.4140871,0.3679763
mse,0.4285542,0.0212189,0.4561387,0.4685339,0.3871244,0.4220141,0.4089602
r2,0.9525329,0.0032345,0.9478238,0.9462871,0.9568645,0.9546739,0.957015


Scoring History: 


0,1,2,3,4,5,6,7,8,9
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error,validation_rmse,validation_logloss,validation_classification_error
,2017-11-14 15:06:20,10.155 sec,0.0,0.9000000,2.3025851,0.7918919,0.9000000,2.3025851,0.8110749
,2017-11-14 15:06:20,10.186 sec,5.0,0.8723322,2.0625625,0.1729730,0.8818709,2.1428497,0.4104235
,2017-11-14 15:06:20,10.233 sec,10.0,0.8447060,1.8724692,0.1621622,0.8635032,2.0112799,0.3941368
,2017-11-14 15:06:21,10.267 sec,15.0,0.8178372,1.7179539,0.15,0.8466052,1.9095682,0.4006515
,2017-11-14 15:06:21,10.314 sec,20.0,0.7920063,1.5894292,0.1554054,0.8305732,1.8241091,0.3941368
---,---,---,---,---,---,---,---,---,---
,2017-11-14 15:06:22,11.253 sec,90.0,0.5114226,0.7259971,0.0905405,0.6830756,1.3144659,0.3745928
,2017-11-14 15:06:22,11.376 sec,95.0,0.4968667,0.6941861,0.0878378,0.6770667,1.3005966,0.3745928
,2017-11-14 15:06:22,11.511 sec,100.0,0.4832791,0.6649525,0.0824324,0.6713909,1.2873337,0.3778502



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
C4,3558.4880371,1.0,0.2496415
C5,2633.1718750,0.7399693,0.1847270
C2,2463.7795410,0.6923670,0.1728435
C3,2116.5615234,0.5947924,0.1484849
C9,1840.3018799,0.5171584,0.1291042
C8,1327.9913330,0.3731898,0.0931637
C7,257.7365723,0.0724287,0.0180812
C6,56.3626289,0.0158389,0.0039541




In [11]:
# Predict on the held-out test frame and keep the result in `preds`.
# NOTE(review): per the H2O docs, aml.predict() scores with the leader model - confirm.
preds = aml.predict(test)

Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%


In [12]:
# Show the leader model's scoring history (training/validation metrics as
# trees are added). scoring_history() is the supported name; score_history()
# is its deprecated alias.
aml.leader.scoring_history()

Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_classification_error,validation_rmse,validation_logloss,validation_classification_error
0,,2017-11-14 15:06:20,10.155 sec,0.0,0.9,2.302585,0.791892,0.9,2.302585,0.811075
1,,2017-11-14 15:06:20,10.186 sec,5.0,0.872332,2.062563,0.172973,0.881871,2.14285,0.410423
2,,2017-11-14 15:06:20,10.233 sec,10.0,0.844706,1.872469,0.162162,0.863503,2.01128,0.394137
3,,2017-11-14 15:06:21,10.267 sec,15.0,0.817837,1.717954,0.15,0.846605,1.909568,0.400651
4,,2017-11-14 15:06:21,10.314 sec,20.0,0.792006,1.589429,0.155405,0.830573,1.824109,0.394137
5,,2017-11-14 15:06:21,10.358 sec,25.0,0.766785,1.477989,0.15,0.815507,1.752003,0.381107
6,,2017-11-14 15:06:21,10.406 sec,30.0,0.741741,1.378205,0.141892,0.800772,1.687941,0.37785
7,,2017-11-14 15:06:21,10.453 sec,35.0,0.718211,1.292142,0.136486,0.787308,1.634031,0.371336
8,,2017-11-14 15:06:21,10.500 sec,40.0,0.695481,1.215359,0.131081,0.774425,1.586005,0.374593
9,,2017-11-14 15:06:21,10.562 sec,45.0,0.67316,1.144548,0.127027,0.762191,1.542511,0.374593


In [13]:
# Training and validation hit ratios (top-k accuracy) of the leader model.
# valid=True reports the model's validation data, not the `test` frame, so
# the second table is a validation score rather than a test score.
aml.leader.hit_ratio_table(train=True, valid=True)

Top-10 Hit Ratios: 


0,1
k,hit_ratio
1,0.922973
2,0.9891892
3,1.0
4,1.0
5,1.0
6,1.0
7,1.0
8,1.0
9,1.0


Top-10 Hit Ratios: 


0,1
k,hit_ratio
1,0.6254072
2,0.8208469
3,0.9250814
4,0.9478827
5,0.9641694
6,0.9934853
7,0.9934853
8,0.9934853
9,0.9934853


{'train': , 'valid': }

In [14]:
# Shut down the H2O cluster without asking for confirmation.
# h2o.shutdown() is deprecated; the cluster object's shutdown() replaces it.
h2o.cluster().shutdown(prompt=False)

    >>> h2o.shutdown(prompt=False)
        ^^^^ Deprecated, use ``h2o.cluster().shutdown()``.
H2O session _sid_8e1c closed.
