## Model Building in H2O

I will go through four H2O models: GLM (generalized linear model), GBM (gradient boosting machine), DRF (Distributed Random Forest) and DL (Deep Learning neural network).

I'll use H2O Flow for the hyperparameter search (it's just easier than writing code) and post the best parameters found here.


# H2O - GLM, GBM, NN, RF

In [1]:
import pandas as pd
import numpy as np
import time
import csv
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 8
import math

# Shared stopwatch state: the wall-clock timestamp that tac() measures from.
# It is initialised when this cell runs and reset by every tic() call.
_start_time = time.time()


def tic():
    """Restart the stopwatch by storing the current wall-clock time in ``_start_time``."""
    global _start_time
    now = time.time()
    _start_time = now

def tac():
    """Print the wall-clock time elapsed since ``_start_time`` was last set.

    The elapsed time is rounded to whole seconds and printed as
    ``Time passed: <h>hour:<m>min:<s>sec``.  Nothing is returned.
    """
    # round() returns a float on Python 2, which would print as
    # "0.0hour:1.0min:5.0sec"; casting to int keeps the report in whole
    # numbers on both Python 2 and Python 3.
    elapsed = int(round(time.time() - _start_time))
    minutes, seconds = divmod(elapsed, 60)
    hours, minutes = divmod(minutes, 60)
    print('Time passed: {}hour:{}min:{}sec'.format(hours, minutes, seconds))

In [2]:
import h2o
import time
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

In [4]:
# Connect to an H2O cluster. The output recorded for this cell shows a
# successful connection to a server at http://localhost:54321.
h2o.init()

Connecting to H2O server at http://localhost:54321... successful!


0,1
H2O cluster uptime:,14 hours 55 mins
H2O cluster version:,3.10.0.1
H2O cluster version age:,13 days
H2O cluster name:,LOCAL SERVICE
H2O cluster total nodes:,1
H2O cluster free memory:,49.30 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster is healthy:,True
H2O cluster is locked:,True


In [None]:
# Upload the prepared train / validation / test CSV files to the H2O cluster,
# timing the whole upload with tic()/tac().
# NOTE(review): _locate is a private h2o helper; the original comment says it is used
# to find files within the h2o git project directory — confirm it is appropriate here.
from h2o.utils.shared_utils import _locate

tic()
train, val, test = (
    h2o.upload_file(path=_locate(csv_path))
    for csv_path in (
        "./input-data/train_modified_noW9.csv",
        "./input-data/val_modified_w9.csv",
        "./input-data/test_modified.csv",
    )
)
tac()

In [None]:
# RMSLE is the error function used on the leaderboard (LB).
# The H2O Python API added RMSE as a model performance metric (June 2016). The
# models here are trained on a log-scale target (log_target), so the RMSE that
# H2O reports on that target is used directly as the RMSLE.

def modelfit(alg, dtrain, dval, dtest, predictors, target, IDcol, filename):
    """Train ``alg``, report train/validation RMSLE and write a submission CSV.

    Parameters
    ----------
    alg : estimator
        Object exposing ``train``, ``model_performance`` and ``predict``
        (the notebook passes H2O estimators).
    dtrain, dval : frame
        Training and validation frames; both are also used for the report.
    dtest : frame
        Frame to predict on.  It is modified: a ``target`` column holding the
        back-transformed (``expm1``) predictions is added to it.
    predictors : list of str
        Names of the predictor columns.
    target : str
        Name of the log-scale response column.
    IDcol : str
        Name of the row identifier column in ``dtest``.
    filename : str
        File name of the submission, written under ``./Submissions/``.

    Returns
    -------
    None
    """
    import os  # local import: only needed to make sure the output directory exists

    # Fit the algorithm on the data
    alg.train(x=predictors,
              y=target,
              training_frame=dtrain,
              validation_frame=dval)

    # Performance on training and validation sets.  Score the frames that were
    # passed in, not whatever globals happen to be named `train` / `val`.
    train_rmsle = alg.model_performance(dtrain).rmse()
    val_rmsle = alg.model_performance(dval).rmse()

    print ("\nModel Report")
    print ('RMSLE TRAIN: ', train_rmsle)
    print ('RMSLE VAL: ', val_rmsle)

    # Predict on the test data with the fitted model and revert the log
    # transform by applying expm1 on the resulting frame itself.
    # NOTE(review): assumes predict() returns a single prediction column
    # (regression) — confirm for other model types.
    predictions = alg.predict(dtest[predictors])
    dtest[target] = predictions.expm1()

    # Pull only the two submission columns into pandas once; the summary
    # statistics and the CSV export below are pandas operations.
    submission = dtest[[IDcol, target]].as_data_frame()
    predicted = submission[target]

    print ('NUM ROWS PREDICTED: ', submission.shape[0])
    print ('NUM NEGATIVES PREDICTED: ', int((predicted < 0).sum()))
    print ('MIN TARGET PREDICTED: ', predicted.min())
    print ('MEAN TARGET PREDICTED: ', predicted.mean())
    print ('MAX TARGET PREDICTED: ', predicted.max())

    # Export submission file
    submission[IDcol] = submission[IDcol].astype(int)
    submission = submission.rename(columns={target: 'Demanda_uni_equil'})

    # Create the output directory if needed so a long training run is not
    # lost to a missing folder at the very last step.
    out_dir = "./Submissions/"
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    submission.to_csv(out_dir + filename, index=False)

Let's now define the target and ID columns.

In [None]:
# Column names shared by the modelling cells.
IDcol = 'id'             # row identifier column
target = 'log_target'    # response column

### Alg6 - GBM

Let's build our first GBM model.

In [None]:
# Every column of the training frame except the response is a predictor.
# A new list is built instead of calling .remove() on train.names, so whatever
# list the frame hands back is not mutated and the cell can be re-run safely.
predictors = [col for col in train.names if col != target]

alg6 = H2OGradientBoostingEstimator(ntrees=150, max_depth=25, learn_rate=0.1, min_rows=10, nbins=20)
tic()
modelfit(alg6, train, val, test, predictors, target, IDcol, 'alg6.csv')
tac()

# varimp(use_pandas=True) returns a DataFrame with one row per variable, so the
# importance column is selected explicitly (indexed by variable name) instead of
# wrapping the whole frame in a Series.  A GBM exposes variable importances
# rather than coefficients, hence the plot title.
varimp = alg6.varimp(use_pandas=True)
coef1 = varimp.set_index('variable')['scaled_importance'].sort_values()
coef1.plot(kind='bar', title='Variable Importance');

In [7]:
# ----------
# Fit four H2O models on one frame and compare them on held-out rows.

def fit_score(data):
    """Fit GBM, DRF, GLM and DL models on ``data`` and tabulate RMSE and timings.

    Rows with ``C1 <= 8`` form the training frame and rows with ``C1 > 8`` the
    validation frame.  ``C7`` is the response; every other column is used as a
    predictor.  The fitted models are published as the module-level globals
    ``gbm0``, ``drf0``, ``glm0`` and ``dl0``.  Nothing is returned; the summary
    table is handed to ``h2o.display.H2ODisplay``.
    """
    global gbm0, drf0, glm0, dl0

    response = "C7"
    holdout = data[data["C1"] > 8]    # Weeks 9
    fitting = data[data["C1"] <= 8]   # Weeks 7,8

    print("Training data has", fitting.ncol, "columns and", fitting.nrow, "rows, Validation data has", holdout.nrow, "rows")
    feature_cols = fitting.names
    feature_cols.remove(response)

    def fit(model):
        """Train ``model`` with the shared predictors, response and frames."""
        model.train(x=feature_cols,
                    y=response,
                    training_frame=fitting,
                    validation_frame=holdout)

    def rmse_pair(model):
        """Return ``[training RMSE, validation RMSE]`` for ``model``."""
        on_train = model.model_performance(fitting).rmse()
        on_holdout = model.model_performance(holdout).rmse()
        return [on_train, on_holdout]

    # Each timer starts before the estimator is constructed, so the elapsed
    # time covers construction plus training.

    # Run GBM
    started = time.time()
    gbm0 = H2OGradientBoostingEstimator(ntrees=50,  # 500 works well
                                        max_depth=15,
                                        learn_rate=0.1)
    fit(gbm0)
    gbm_elapsed = time.time() - started

    # Run DRF
    started = time.time()
    drf0 = H2ORandomForestEstimator(ntrees=20, max_depth=30)
    fit(drf0)
    drf_elapsed = time.time() - started

    # Run GLM
    started = time.time()
    glm0 = H2OGeneralizedLinearEstimator(Lambda=[1e-5], family="poisson")
    fit(glm0)
    glm_elapsed = time.time() - started

    # Run DL
    started = time.time()
    dl0 = H2ODeepLearningEstimator(hidden=[50,50,50,50], epochs=50)
    fit(dl0)
    dl_elapsed = time.time() - started

    # ----------
    # Score every model on the training and validation frames and report.
    # NOTE(review): the values are RMSE on C7 but the header labels them "RMSLE",
    # and the validation column is labelled "TEST" — confirm C7 is log-scaled.
    header = ["Model", "RMSLE TRAIN", "RMSLE TEST", "Model Training Time (s)"]
    fitted = (
        ("GBM", gbm0, gbm_elapsed),
        ("DRF", drf0, drf_elapsed),
        ("GLM", glm0, glm_elapsed),
        ("DL ", dl0, dl_elapsed),
    )
    table = [[label] + rmse_pair(model) + [round(seconds, 3)]
             for label, model, seconds in fitted]
    h2o.display.H2ODisplay(table, header)
    # --------------
  # --------------

In [8]:
# Split the data into training and validation rows, fit the four models and report their scores.
# NOTE(review): train_hex is not defined in any cell visible here — confirm where it is loaded.
fit_score(train_hex)

('Training data has', 17, 'columns and', 328667, 'rows, Validation data has', 171333, 'rows')






0,1,2,3
Model,R2 TRAIN,R2 TEST,Model Training Time (s)
GBM,0.2866214,0.2817645,687.12
DRF,0.1265818,0.2977402,1401.046
GLM,0.4239004,0.4147631,5.4
DL,0.2916710,0.2937356,295.847
