## Neural Network Model Building in H2O

I will go through two H2O models: GBM and DL (a deep learning neural network).

I'll use H2O Flow for the hyperparameter search (it's just easier than writing code) and post the best parameters found here.


# H2O

In [1]:
import pandas as pd
import numpy as np
import time
import csv
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 8
import math

# Reference point of the stopwatch. A monotonic clock is used so that the
# measured duration cannot be distorted by system clock adjustments.
_start_time = time.monotonic()

def tic():
    """Start (or restart) the stopwatch that tac() reads."""
    global _start_time
    _start_time = time.monotonic()

def tac():
    """Print the time elapsed since the last tic() as hour:min:sec."""
    elapsed = round(time.monotonic() - _start_time)
    minutes, seconds = divmod(elapsed, 60)
    hours, minutes = divmod(minutes, 60)
    print('Time passed: {}hour:{}min:{}sec'.format(hours, minutes, seconds))

In [2]:
import h2o
import time
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

In [3]:
# Connect to an H2O cluster. As the captured output below shows, when no server
# answers at localhost:54321 the client starts a local one and connects to it.
h2o.init()

Connecting to H2O server at http://localhost:54321....... failed.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_91"; OpenJDK Runtime Environment (build 1.8.0_91-b14); OpenJDK 64-Bit Server VM (build 25.91-b14, mixed mode)
  Starting server from /anaconda/envs/py35/h2o_jar/h2o.jar
  Ice root: /tmp/tmp62r9u06d
  JVM stdout: /tmp/tmp62r9u06d/h2o_dsvm_started_from_python.out
  JVM stderr: /tmp/tmp62r9u06d/h2o_dsvm_started_from_python.err
Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful!


In [5]:
# Global switches that control the behaviour of the whole script
s3_path = 'http://bbts-kaggle.s3.amazonaws.com/bimbo/Pablo/'
use_validation = True    # split the train data into train + val sets
val_week_threshold = 8   # possible values: 8 or 9 (weeks 3-7 are train, weeks 8 and 9 are val)
trimmed = True           # drop weeks that don't have all the lags; if False, empty lags are filled with 0
# mean_demand is shifted up to "lag" weeks: 4 when the threshold is week 8, otherwise 5
lag = 4 if val_week_threshold == 8 else 5

In [None]:
# Load the modified train / validation / test sets into H2O.
# The file-name suffix encodes the settings chosen in the configuration cell.
# NOTE: the variable keeps its original spelling ("sufix") because later cells
# use it to name the submission files.
sufix = ""
if use_validation:
    sufix += "_holdout" + repr(val_week_threshold)
if trimmed:
    sufix += "_trimmed"

train_csv = s3_path + "train_modified" + sufix + ".csv"
val_csv = s3_path + "val_modified" + sufix + ".csv"
test_csv = s3_path + "test_modified" + sufix + ".csv"

tic()  # time all the imports together
print ('Downloading File: {} ...'.format(train_csv))
train = h2o.import_file(train_csv)

if use_validation:
    # `val` is only created here when a hold-out set is requested
    print ('Downloading File: {} ...'.format(val_csv))
    val = h2o.import_file(val_csv)

print ('Downloading File: {} ...'.format(test_csv))
test = h2o.import_file(test_csv)
tac()

Downloading File: http://bbts-kaggle.s3.amazonaws.com/bimbo/Pablo/jorge_train.csv ...


In [None]:
# Preview the imported training frame to sanity-check the load
train.show()

In [73]:
# Names of the target column and of the row-identifier column
target, IDcol = 'log_target', 'id'

In [72]:
# The H2O Python API recently (Jun 2016) added RMSE as a model performance metric, so we apply it directly
# to our target (log_target) to obtain the RMSLE

def modelfit(alg, dtrain, dval, dtest, predictors, target, IDcol, filename):
    """Train `alg`, report its RMSE on the train/val frames and write a submission CSV.

    Parameters:
        alg: estimator exposing train(), model_performance() and predict().
        dtrain: frame used for training and for the "TRAIN" score.
        dval: frame passed as validation frame and used for the "VAL" score.
        dtest: frame to predict on; it is modified in place (a `target` column
            holding the back-transformed predictions is added).
        predictors: list of predictor column names.
        target: name of the response column (expected to be on a log scale,
            since predictions are reverted with expm1).
        IDcol: name of the row-identifier column written to the submission.
        filename: name of the CSV file created under ./Submissions/.
    """
    import os  # local import: only needed to make sure the output folder exists

    #Fit the algorithm on the data
    alg.train(x=predictors, y=target, training_frame=dtrain, validation_frame=dval)

    #Performance on Training and Val sets: score the frames that were passed in,
    #not whatever globals happen to be called `train` / `val`
    print ("\nModel Report")
    print ('RMSLE TRAIN: ', alg.model_performance(dtrain).rmse())
    print ('RMSLE VAL: ', alg.model_performance(dval).rmse())

    #Predict on testing data: we need to revert it back to "Demanda_uni_equil" by applying expm1
    dtest[target] = alg.predict(dtest[predictors]).expm1()

    print ('NUM ROWS PREDICTED: ', dtest.shape[0] )
    print ('MIN TARGET PREDICTED: ', dtest[target].min())
    print ('MEAN TARGET PREDICTED: ', dtest[target].mean())
    print ('MAX TARGET PREDICTED: ', dtest[target].max())

    #Export submission file:
    submission = dtest[[IDcol,target]].as_data_frame(use_pandas=True)
    submission[target] = np.maximum(submission[target], 0) # we make all negative numbers = 0 since there cannot be a negative demand
    submission[IDcol] = submission[IDcol].astype(int)
    submission.rename(columns={target: 'Demanda_uni_equil'}, inplace=True)
    # Create the output folder if needed so a finished training run is not lost
    # to a missing directory
    os.makedirs("./Submissions", exist_ok=True)
    submission.to_csv("./Submissions/"+filename, index=False)

Let's define now the target and the Id cols

In [74]:
# H2O automatically one-hot encodes categorical values, but we must tell it which
# columns are categorical by converting them to factors.
categorical_cols = ['brand', 'prodtype_cluster', 'ZipCode', 'week_ct', 'NombreCliente',
                    'Producto_ID_clust_ID', 'Ruta_SAK_clust_ID', 'Agencia_ID_clust_ID',
                    'Cliente_ID_clust_ID']

for col in categorical_cols:
    train[col] = train[col].asfactor()

# The frames used for validation and prediction must carry the same column types
# as the training frame, so apply the same conversion to them.
# `val` only exists as a separate frame when use_validation is True.
frames_to_match = [test]
if use_validation:
    frames_to_match.append(val)
for frame in frames_to_match:
    for col in categorical_cols:
        # Skip columns a frame does not have instead of failing on them
        if col in frame.columns:
            frame[col] = frame[col].asfactor()


In [None]:
# Without a hold-out set, reuse the training frame as the validation frame
if not use_validation:
    val = train

### Alg8 - GBM

Let's build our first GBM model.

In [None]:
# Columns fed to the model
predictors = ['Canal_ID', 'Log_Target_mean_lag1', 'Log_Target_mean_lag2', 'Log_Target_mean_lag3', 'Log_Target_mean_lag4', 
              'Agencia_ID','Ruta_SAK','Cliente_ID','Producto_ID',
              'Lags_sum', 'brand', 'prodtype_cluster', 'Qty_Ruta_SAK_Bin', 'ZipCode', 'Producto_ID_clust_ID']

# GBM with row/column subsampling; the stopping_* arguments enable early stopping on MSE
model = H2OGradientBoostingEstimator(ntrees=300,max_depth=10,learn_rate=0.1, min_rows=10, nbins=40, sample_rate=0.7,
                                    col_sample_rate=0.7, stopping_rounds=10, stopping_metric="MSE", stopping_tolerance=0.01)
tic()
modelfit(model, train, val, test, predictors, target, IDcol, 'alg8_{}.csv'.format(sufix))
tac()

# Show the variable importances (a bare expression in the middle of a cell displays nothing)
print(model.varimp(use_pandas=True))

# Plot training vs. validation RMSE along the scoring history
model_history_df = model.scoring_history()
plt.plot(model_history_df['training_rmse'], label="training_rmse")
plt.plot(model_history_df['validation_rmse'], label="validation_rmse")
plt.title("GBM scoring history")  # this cell trains the GBM, not the deep learner
plt.legend();

## --> LB: 

### Alg9 - Deep Learning

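Now we try a Deep Learning network. To improve generalization, dropout and L1/L2 penalties can be added; note that the configuration below does not set any of them yet (plain `Rectifier` activation, no `l1`/`l2` arguments).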

In [None]:
# Columns fed to the model (same list as the GBM cell)
predictors = ['Canal_ID', 'Log_Target_mean_lag1', 'Log_Target_mean_lag2', 'Log_Target_mean_lag3', 'Log_Target_mean_lag4', 
              'Agencia_ID','Ruta_SAK','Cliente_ID','Producto_ID',
              'Lags_sum', 'brand', 'prodtype_cluster', 'Qty_Ruta_SAK_Bin', 'ZipCode', 'Producto_ID_clust_ID']

# Three hidden layers of 100 units; the stopping_* arguments enable early stopping on MSE.
# NOTE(review): the accompanying text mentions dropout and L1/L2 penalties, but none are
# configured here - confirm whether the tuned parameters were meant to include them.
model = H2ODeepLearningEstimator(activation="Rectifier", hidden=[100,100,100], epochs=35,
                                standardize=False, score_interval=10, stopping_rounds=10, stopping_metric="MSE",
                                stopping_tolerance=0.01, use_all_factor_levels=False)

tic()
modelfit(model, train, val, test, predictors, target, IDcol, 'alg9_{}.csv'.format(sufix))
tac()

# Show the variable importances (a bare expression in the middle of a cell displays nothing)
print(model.varimp(use_pandas=True))

# Plot training vs. validation RMSE along the scoring history
model_history_df = model.scoring_history()
plt.plot(model_history_df['training_rmse'], label="training_rmse")
plt.plot(model_history_df['validation_rmse'], label="validation_rmse")
plt.title("Deep Learner .. )")
plt.legend();