## Machine Learning Using H2O

In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pysparkling import *
from h2o.automl import H2OAutoML
from h2o.persist import set_s3_credentials
import h2o
import os

In [2]:
# Reuse the active SparkContext / SparkSession if there is one, otherwise create them.
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()
# Get or create the Sparkling Water H2O context for this Spark session.
# NOTE(review): H2OContext is presumably supplied by `from pysparkling import *`.
hc = H2OContext.getOrCreate(ss)

Connecting to H2O server at http://127.0.0.1:54323 ... successful.


0,1
H2O cluster uptime:,08 secs
H2O cluster timezone:,America/Los_Angeles
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.1
H2O cluster version age:,30 days
H2O cluster name:,sparkling-water-sunqingyi_local-1579217939131
H2O cluster total nodes:,1
H2O cluster free memory:,379.2 Mb
H2O cluster total cores:,12
H2O cluster allowed cores:,12



Sparkling Water Context:
 * Sparkling Water Version: 3.28.0.1-1-2.4
 * H2O name: sparkling-water-sunqingyi_local-1579217939131
 * cluster size: 1
 * list of used nodes:
  (executorId, host, port)
  ------------------------
  (driver,127.0.0.1,54323)
  ------------------------

  Open H2O Flow in browser: http://127.0.0.1:54323 (CMD + click in Mac OSX)

    


In [4]:
# Load the training CSV (path is relative to the working directory) into an H2O frame.
train = h2o.import_file(path="mytraindata.csv")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [7]:
# Target column to predict.
responce = 'bikes_available'

# Start from a copy of every column name, then drop the target and the 'C1'
# column so that only model inputs remain.
# NOTE(review): 'C1' looks like an unnamed index column from the CSV export - confirm.
predictors = list(train.names)
for dropped_column in (responce, 'C1'):
    predictors.remove(dropped_column)

In [8]:
# Display the remaining predictor columns as a sanity check.
predictors

['station_id',
 'hour',
 'is_weekend',
 'temp_float',
 'rain_identifier',
 'start_count',
 'end_count',
 'total_docks',
 'population']

### Deep Learning

In [10]:
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

# Small feed-forward regressor: two hidden layers (7 and 2 units), trained for
# less than one full pass over the data (epochs=0.7) and evaluated with 5-fold
# cross-validation using the "Modulo" fold assignment.
#
# seed=1 pins the random number generator (same value as the AutoML run in this
# notebook) so that repeated runs start from the same initial state.
# NOTE: H2O only guarantees bit-identical deep learning results when
# reproducible=True is also set, which forces single-threaded training; it is
# left off here to keep training fast.
model_dl = H2ODeepLearningEstimator(
    variable_importances=True,
    loss="Automatic",
    hidden=[7, 2],
    epochs=0.7,
    nfolds=5,
    fold_assignment="Modulo",
    seed=1,
)

In [11]:
# Fit the deep learning model on the training frame using the selected predictors.
model_dl.train(x=predictors, y=responce, training_frame=train)

deeplearning Model Build progress: |██████████████████████████████████████| 100%


In [47]:
# Show the trained model's summary (layer details, metrics, scoring history, variable importances).
model_dl

Model Details
H2ODeepLearningEstimator :  Deep Learning
Model Key:  DeepLearning_model_python_1579217939639_1


Status of Neuron Layers: predicting bikes_available, regression, gaussian distribution, Quadratic loss, 117 weights/biases, 6.6 KB, 199,686 training samples, mini-batch size 1


Unnamed: 0,Unnamed: 1,layer,units,type,dropout,l1,l2,mean_rate,rate_rms,momentum,mean_weight,weight_rms,mean_bias,bias_rms
0,,1,13,Input,0.0,,,,,,,,,
1,,2,7,Rectifier,0.0,0.0,0.0,0.156438,0.373435,0.0,0.0501587,0.327844,0.414674,0.205719
2,,3,2,Rectifier,0.0,0.0,0.0,0.00115459,0.000792882,0.0,-0.193633,0.482805,0.983585,0.135764
3,,4,1,Linear,,0.0,0.0,0.000627632,0.00021298,0.0,-0.484794,0.147284,1.007,1.0971300000000001e-154




ModelMetricsRegression: deeplearning
** Reported on train data. **

MSE: 12.082342936581284
RMSE: 3.4759664751808645
MAE: 2.7805796626321215
RMSLE: 0.40745076528107455
Mean Residual Deviance: 12.082342936581284

ModelMetricsRegression: deeplearning
** Reported on cross-validation data. **

MSE: 11.390507309371314
RMSE: 3.374982564306268
MAE: 2.6840014633453397
RMSLE: 0.3961692932581858
Mean Residual Deviance: 11.390507309371314

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,2.6840017,0.03323998,2.6436238,2.6965103,2.705457,2.6544073,2.7200096
1,mean_residual_deviance,11.390509,0.22242215,11.095568,11.4660225,11.54045,11.224878,11.625622
2,mse,11.390509,0.22242215,11.095568,11.4660225,11.54045,11.224878,11.625622
3,r2,0.2350242,0.014466327,0.2550326,0.22531831,0.22365496,0.24587092,0.22524424
4,residual_deviance,11.390509,0.22242215,11.095568,11.4660225,11.54045,11.224878,11.625622
5,rmse,3.3748536,0.033002634,3.331001,3.3861516,3.3971238,3.350355,3.4096367
6,rmsle,0.39613122,0.006140951,0.38965577,0.40370247,0.3987034,0.3899242,0.39867023



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_deviance,training_mae,training_r2
0,,2020-01-16 15:41:06,0.000 sec,,0.0,0,0.0,,,,
1,,2020-01-16 15:41:06,2.093 sec,821318 obs/sec,0.072078,1,18069.0,3.557071,12.652753,2.760071,0.149499
2,,2020-01-16 15:41:06,2.273 sec,1018806 obs/sec,0.796555,11,199686.0,3.475966,12.082343,2.78058,0.187841



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,total_docks,1.0,1.0,0.213644
1,station_id,0.567871,0.567871,0.121322
2,end_count,0.539404,0.539404,0.11524
3,population,0.47888,0.47888,0.10231
4,is_weekend.True,0.429488,0.429488,0.091758
5,is_weekend.False,0.41445,0.41445,0.088545
6,start_count,0.34579,0.34579,0.073876
7,rain_identifier.true,0.316672,0.316672,0.067655
8,rain_identifier.false,0.232141,0.232141,0.049596
9,temp_float,0.184454,0.184454,0.039407




### AutoML

In [39]:
# AutoML search with 5-fold cross-validation and a fixed seed, limited to
# 10 models and a 1000-second time budget.
# NOTE(review): per the H2O docs a time budget makes the set of trained models
# depend on machine speed - confirm whether max_runtime_secs is still wanted.
model_automl = H2OAutoML(
    max_runtime_secs=1000,
    max_models=10,
    nfolds=5,
    seed=1,
)

In [40]:
# Run the AutoML search on the same predictors / target as the deep learning model.
model_automl.train(x=predictors, y=responce, training_frame=train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [30]:
# Collect the ids of every model on the AutoML leaderboard as a plain Python list.
leaderboard_ids = model_automl.leaderboard['model_id'].as_data_frame()
model_ids = leaderboard_ids.iloc[:, 0].tolist()

In [41]:
# Keep a handle on the AutoML leaderboard and render it as this cell's output.
lb2 = model_automl.leaderboard
lb2

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
XGBoost_1_AutoML_20200116_161833,5.65176,2.37734,5.65176,1.82444,0.294152




In [52]:
# Inspect the top-ranked model from the AutoML run.
# Model ids embed the run timestamp (e.g. 'XGBoost_1_AutoML_20200116_161833'),
# so a hard-coded id stops resolving as soon as AutoML is re-run; ask the
# AutoML object for its leader instead.
model_automl.leader

Model Details
H2OXGBoostEstimator :  XGBoost
Model Key:  XGBoost_1_AutoML_20200116_161833


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees
0,,275.0




ModelMetricsRegression: xgboost
** Reported on train data. **

MSE: 4.93951995269894
RMSE: 2.222503082719783
MAE: 1.7094994363473002
RMSLE: 0.2800047155276033
Mean Residual Deviance: 4.93951995269894

ModelMetricsRegression: xgboost
** Reported on cross-validation data. **

MSE: 5.651764238637661
RMSE: 2.3773439462218464
MAE: 1.8244351430840293
RMSLE: 0.29415201167491384
Mean Residual Deviance: 5.651764238637661

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,1.8244352,0.017950913,1.8209916,1.8030126,1.8462918,1.8389,1.8129802
1,mean_residual_deviance,5.651765,0.10939602,5.618396,5.5470715,5.7881436,5.74533,5.5598836
2,mse,5.651765,0.10939602,5.618396,5.5470715,5.7881436,5.74533,5.5598836
3,r2,0.6204209,0.007870346,0.6227754,0.62522185,0.6106221,0.61400735,0.62947774
4,residual_deviance,5.651765,0.10939602,5.618396,5.5470715,5.7881436,5.74533,5.5598836
5,rmse,2.3772552,0.022979572,2.3703156,2.3552222,2.4058561,2.396942,2.3579404
6,rmsle,0.2941436,0.002488697,0.2919073,0.2934045,0.29718494,0.29634392,0.29187733



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2020-01-16 16:31:52,13 min 19.022 sec,0.0,9.23404,8.390619,85.267496
1,,2020-01-16 16:31:54,13 min 20.519 sec,5.0,7.409142,6.522079,54.895381
2,,2020-01-16 16:31:55,13 min 21.897 sec,10.0,6.016401,5.128018,36.197083
3,,2020-01-16 16:31:57,13 min 23.324 sec,15.0,4.985211,4.12059,24.852326
4,,2020-01-16 16:31:58,13 min 24.678 sec,20.0,4.256441,3.42605,18.117291
5,,2020-01-16 16:32:00,13 min 26.157 sec,25.0,3.727734,2.947659,13.896004
6,,2020-01-16 16:32:01,13 min 27.765 sec,30.0,3.361002,2.628137,11.296337
7,,2020-01-16 16:32:03,13 min 29.329 sec,35.0,3.10772,2.412894,9.657921
8,,2020-01-16 16:32:04,13 min 30.848 sec,40.0,2.937564,2.272391,8.62928
9,,2020-01-16 16:32:06,13 min 32.494 sec,45.0,2.819138,2.176334,7.94754



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,total_docks,4512015.0,1.0,0.281913
1,station_id,3669668.0,0.81331,0.229282
2,temp_float,2603438.0,0.577001,0.162664
3,hour,1687513.0,0.374004,0.105437
4,population,1346360.0,0.298394,0.084121
5,is_weekend.False,719009.9,0.159355,0.044924
6,start_count,412685.7,0.091464,0.025785
7,is_weekend.True,363730.5,0.080614,0.022726
8,end_count,316367.1,0.070117,0.019767
9,rain_identifier.false,280653.1,0.062201,0.017535




In [14]:
# Shut down the SparkContext obtained at the top of the notebook.
sc.stop()