## Neural Network Model Building in H2O

I will go through two H2O models: GBM and DL (a Deep Learning neural network).

I'll use H2O Flow for the hyperparameter search (it's just easier than writing code) and post the best parameters found here.


# H2O

In [1]:
import pandas as pd
import numpy as np
import time
import csv
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 8
import math

# Reference point for the stopwatch helpers below. A monotonic clock is used
# because elapsed-time measurements must not be affected by wall-clock
# adjustments (NTP corrections, DST changes, manual clock changes).
_start_time = time.monotonic()


def tic():
    """Start (or restart) the stopwatch used by :func:`tac`."""
    global _start_time
    _start_time = time.monotonic()


def tac():
    """Print the time elapsed since the last :func:`tic` call.

    If :func:`tic` was never called, the elapsed time is measured from the
    moment this cell was executed. The duration is rounded to whole seconds
    and printed as ``Time passed: <h>hour:<m>min:<s>sec``.
    """
    elapsed_sec = round(time.monotonic() - _start_time)
    t_min, t_sec = divmod(elapsed_sec, 60)
    t_hour, t_min = divmod(t_min, 60)
    print('Time passed: {}hour:{}min:{}sec'.format(t_hour, t_min, t_sec))

In [2]:
import h2o
import time
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

In [29]:
# Connect to an H2O cluster (the recorded output below shows the server at
# http://localhost:54321 and a summary of the cluster that was reached).
h2o.init()

Connecting to H2O server at http://localhost:54321... successful!


0,1
H2O cluster uptime:,13 secs
H2O cluster version:,3.10.0.4
H2O cluster version age:,26 days
H2O cluster name:,H2O_from_python_dsvm_gxs5sk
H2O cluster total nodes:,1
H2O cluster free memory:,24.48 Gb
H2O cluster total cores:,16
H2O cluster allowed cores:,16
H2O cluster is healthy:,True
H2O cluster is locked:,False


In [30]:
# Global switches that drive the behaviour of the whole notebook.
s3_path = 'http://bbts-kaggle.s3.amazonaws.com/bimbo/Pablo/'

# Split the training data into train + validation sets.
use_validation = True

# Possible values: 8 or 9. Weeks 3,4,5,6,7 are train; weeks 8 and 9 are validation.
val_week_threshold = 8

# True: remove the weeks that don't have all the lags.
# False: fill the empty lags with 0.
trimmed = True

# Mean demand is shifted up to "lag" weeks; one lag fewer is available when
# the validation threshold is week 8.
lag = 4 if val_week_threshold == 8 else 5

In [6]:
# Load the modified train / validation / test sets into the H2O cluster.

# Build the suffix describing the data configuration; it is reused later to
# tag the submission file names.
sufix = ""
if use_validation:
    sufix += "_holdout" + repr(val_week_threshold)
if trimmed:
    sufix += "_trimmed"

# Alternative inputs whose names are derived from the suffix above:
#train_csv = s3_path +"train_modified"+sufix+".csv"
#val_csv = s3_path +"val_modified"+sufix+".csv"
#test_csv = s3_path +"test_modified"+sufix+".csv"

train_csv = s3_path + "jorge_train.csv"
val_csv = s3_path + "jorge_validation.csv"
test_csv = s3_path + "jorge_test.csv"

# Time the whole import.
tic()

print('Downloading File: {} ...'.format(train_csv))
train = h2o.import_file(train_csv)

# The validation frame is only imported when a hold-out set is requested.
if use_validation:
    print('Downloading File: {} ...'.format(val_csv))
    val = h2o.import_file(val_csv)

print('Downloading File: {} ...'.format(test_csv))
test = h2o.import_file(test_csv)

tac()

Downloading File: http://bbts-kaggle.s3.amazonaws.com/bimbo/Pablo/jorge_train.csv ...

Downloading File: http://bbts-kaggle.s3.amazonaws.com/bimbo/Pablo/jorge_test.csv ...

Downloading File: http://bbts-kaggle.s3.amazonaws.com/bimbo/Pablo/jorge_validation.csv ...

Time passed: 0hour:5min:22sec


In [27]:
# Display the test frame (the recorded output below lists its columns and first rows).
test.show()

C1,Agencia_ID,Canal_ID,Cliente_ID,Demanda_uni_equil,Producto_ID,Ruta_SAK,Semana,ZipCode,id,Last_per_Cliente_ID,Last_per_Ruta_SAK,week_ct,Log_Target_mean_lag1,Log_Target_mean_lag2,Log_Target_mean_lag3,Log_Target_mean_lag4,Lags_sum,brand,Qty_Ruta_SAK_Bin,num_prod,num_prod_uni
137,1110,7,15766,0.693147,1212,3301,9,2008,0,1.60944,1.51731,1,1.60944,0.0,0.0,1.79176,3.4012,2,1,23,23
138,1110,7,15766,1.09861,1238,3301,9,2008,0,1.38629,1.57141,1,1.38629,1.09861,1.09861,0.693147,4.27667,2,1,23,23
139,1110,7,15766,1.09861,1240,3301,9,2008,0,1.09861,1.85393,1,1.09861,2.19722,0.0,0.0,3.29584,2,1,23,23
140,1110,7,15766,0.693147,1242,3301,9,2008,0,0.693147,1.98416,1,0.693147,1.09861,1.38629,1.09861,4.27667,2,1,23,23
141,1110,7,15766,2.3979,1250,3301,9,2008,0,2.19722,1.97041,1,2.19722,2.70805,0.693147,2.19722,7.79565,2,1,23,23
142,1110,7,15766,1.38629,1309,3301,9,2008,0,1.38629,1.55675,1,1.38629,2.30259,1.94591,1.94591,7.5807,2,1,23,23
143,1110,7,15766,2.63906,3894,3301,9,2008,0,1.09861,1.85735,1,1.09861,1.60944,1.60944,2.19722,6.51471,14,1,23,23
144,1110,7,15766,1.94591,5310,3301,9,2008,0,2.63906,2.08284,1,0.0,0.0,2.63906,2.77259,5.41165,14,1,23,23
145,1110,7,15766,1.09861,5350,3301,9,2008,0,1.09861,1.21025,1,0.0,0.0,0.0,0.0,0.0,14,1,23,23
146,1110,7,15766,1.09861,5354,3301,9,2008,0,1.60944,1.08234,1,0.0,0.0,0.0,0.0,0.0,14,1,23,23


In [9]:
# Name of the column the models predict.
target = "Demanda_uni_equil"
# Name of the row-identifier column written to the submission file.
IDcol = "id"

In [12]:
# The H2O Python API recently (Jun 2016) added RMSE as a model performance metric, so we compute it directly
# on our log-transformed target to obtain the RMSLE

def modelfit(alg, dtrain, dval, dtest, predictors, target, IDcol, filename):
    """Train ``alg``, report its error, and write a submission CSV.

    Args:
        alg: An H2O estimator exposing ``train``, ``model_performance`` and
            ``predict``.
        dtrain: Frame used for training and for the training-error report.
        dval: Frame used as validation frame and for the validation-error
            report.
        dtest: Frame to predict on. NOTE: it is modified in place; its
            ``target`` column is overwritten with the predictions.
        predictors: List of column names used as model inputs.
        target: Name of the response column. Predictions are reverted with
            ``expm1``, so the training target is presumably in log1p space
            (which is why the reported RMSE is labelled RMSLE).
        IDcol: Name of the row-identifier column written to the submission.
        filename: File name of the submission, created under
            ``./Submissions/``.
    """
    # Local import keeps this function self-contained.
    import os

    # Fit the algorithm on the data
    alg.train(x=predictors, y=target, training_frame=dtrain, validation_frame=dval)

    # Performance on the training and validation sets. Use the frames passed
    # in as arguments rather than the notebook-level `train` / `val` globals,
    # so the report always matches the data the model was fitted on.
    print ("\nModel Report")
    print ('RMSLE TRAIN: ', alg.model_performance(dtrain).rmse())
    print ('RMSLE VAL: ', alg.model_performance(dval).rmse())

    # Predict on the test data: revert back to "Demanda_uni_equil" by applying expm1
    dtest[target] = alg.predict(dtest[predictors]).expm1()

    print ('NUM ROWS PREDICTED: ', dtest.shape[0] )
    print ('MIN TARGET PREDICTED: ', dtest[target].min())
    print ('MEAN TARGET PREDICTED: ', dtest[target].mean())
    print ('MAX TARGET PREDICTED: ', dtest[target].max())

    # Export submission file
    submission = dtest[[IDcol, target]].as_data_frame(use_pandas=True)
    # Clip negative predictions to 0 since there cannot be a negative demand
    submission[target] = np.maximum(submission[target], 0)
    submission[IDcol] = submission[IDcol].astype(int)
    submission.rename(columns={target: 'Demanda_uni_equil'}, inplace=True)

    # Make sure the output directory exists so that a long training run is
    # not lost to a missing-directory error at the very last step.
    out_dir = "./Submissions/"
    os.makedirs(out_dir, exist_ok=True)
    submission.to_csv(out_dir + filename, index=False)

The target and the ID columns are defined above; next we mark which columns are categorical.

In [13]:
# H2O encodes categorical values automatically, but we must specify which
# columns are categorical by converting them to factors.
# Looping over the names avoids the copy-paste slip in which
# 'Qty_Ruta_SAK_Bin' was overwritten with the 'week_ct' column.
# NOTE(review): only `train` is cast here; confirm whether `val` and `test`
# need the same conversion.
categorical_cols = ['brand', 'ZipCode', 'week_ct', 'Qty_Ruta_SAK_Bin']
for col in categorical_cols:
    train[col] = train[col].asfactor()

In [14]:
# Without a hold-out set, the training frame doubles as the validation frame.
if not use_validation:
    val = train

### Alg11 - GBM

Let's build our first GBM model.

In [None]:
# Columns used as model inputs.
predictors = ['Canal_ID', 'Log_Target_mean_lag1', 'Log_Target_mean_lag2', 'Log_Target_mean_lag3', 'Log_Target_mean_lag4',
              'Last_per_Cliente_ID', 'Last_per_Ruta_SAK',
              'Lags_sum', 'brand', 'week_ct', 'Qty_Ruta_SAK_Bin', 'ZipCode', 'num_prod', 'num_prod_uni']


model = H2OGradientBoostingEstimator(ntrees=500,max_depth=10,learn_rate=0.1, min_rows=10, nbins=40, sample_rate=0.7,
                                    col_sample_rate=0.7, stopping_rounds=10, stopping_metric="MSE", stopping_tolerance=0.01)

# Train, report and export the submission, timing the whole run.
tic()
modelfit(model, train, val, test, predictors, target, IDcol, 'alg11_{}-JorgeGBM.csv'.format(sufix))
tac()

# Variable importances. Print them explicitly: a bare expression in the
# middle of a notebook cell is evaluated but never displayed.
print(model.varimp(use_pandas=True))

# Plot training vs. validation RMSE over the model's scoring history.
model_history_df = model.scoring_history()
plt.plot(model_history_df['training_rmse'], label="training_rmse")
plt.plot(model_history_df['validation_rmse'], label="validation_rmse")
plt.title("GBM .. )")
plt.legend();

## --> LB: 

### Alg9 - Deep Learning

Now we try a Deep Learning network. Dropout and L1/L2 penalties can be added to improve generalization; note that the configuration below uses a plain Rectifier activation and does not set any of them.

In [None]:
# Columns used as model inputs.
predictors = ['Canal_ID', 'Log_Target_mean_lag1', 'Log_Target_mean_lag2', 'Log_Target_mean_lag3', 'Log_Target_mean_lag4',
              'Last_per_Cliente_ID', 'Last_per_Ruta_SAK',
              'Lags_sum', 'brand', 'week_ct', 'Qty_Ruta_SAK_Bin', 'ZipCode', 'num_prod', 'num_prod_uni']


# NOTE(review): the notebook text mentions dropout and L1/L2 penalties, but
# none are set here (activation is plain "Rectifier") - confirm the intent.
model = H2ODeepLearningEstimator(activation="Rectifier", hidden=[100,100,100], epochs=35,
                                standardize=True, score_interval=10, stopping_rounds=10, stopping_metric="MSE",
                                stopping_tolerance=0.01, use_all_factor_levels=False)

# Train, report and export the submission, timing the whole run.
tic()
modelfit(model, train, val, test, predictors, target, IDcol, 'alg9_{}.csv'.format(sufix))
tac()

# Variable importances. Print them explicitly: a bare expression in the
# middle of a notebook cell is evaluated but never displayed.
print(model.varimp(use_pandas=True))

# Plot training vs. validation RMSE over the model's scoring history.
model_history_df = model.scoring_history()
plt.plot(model_history_df['training_rmse'], label="training_rmse")
plt.plot(model_history_df['validation_rmse'], label="validation_rmse")
plt.title("Deep Learner .. )")
plt.legend();