In [1]:
# Import of required modules
import numpy as np 
import pandas as pd 

import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

# Connect to an H2O instance on the default local address; if none is running,
# a local H2O server is started first.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_181"; OpenJDK Runtime Environment (build 1.8.0_181-8u181-b13-2~deb9u1-b13); OpenJDK 64-Bit Server VM (build 25.181-b13, mixed mode)
  Starting server from /opt/conda/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp9fugs3ra
  JVM stdout: /tmp/tmp9fugs3ra/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp9fugs3ra/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.22.1.1
H2O cluster version age:,10 days
H2O cluster name:,H2O_from_python_unknownUser_2xicpd
H2O cluster total nodes:,1
H2O cluster free memory:,5.672 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [2]:
# Parse the feature-engineered training set into an H2OFrame
train = h2o.import_file(path="../input/elo-feature-engineering/train.csv")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [3]:
# Parse the feature-engineered test set into an H2OFrame
test = h2o.import_file(path="../input/elo-feature-engineering/test.csv")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
# Treat the listed columns as categorical (H2O factors) in both frames
factor_columns = ("feature_1", "feature_2", "feature_3", "year", "weekofyear", "month")
for frame in (train, test):
    for column in factor_columns:
        frame[column] = frame[column].asfactor()

# Every training column is a predictor except the response itself,
# card_id and first_active_month.
response = "target"
predictors = train.columns
for excluded in (response, "card_id", "first_active_month"):
    # list.remove raises ValueError if an expected column is absent
    predictors.remove(excluded)

In [5]:
# Number of cross-validation folds; the same value is passed as `nfolds`
# to each of the GLM, DRF and GBM base learners below.
nfolds = 6

## Training of Generalized Linear Model as Base Learner

In [6]:
# Initialize the GLM base learner.
# Cross-validation predictions are kept on the model; the stacked ensemble
# built later receives this model as one of its base models.
glm_params = {
    "nfolds": nfolds,
    "fold_assignment": "Modulo",
    "keep_cross_validation_predictions": True,
    "seed": 42,
    # Fixed regularization settings
    "alpha": 0.5,
    "lambda_": 0.5,
}
glm = H2OGeneralizedLinearEstimator(**glm_params)

# Fit on the full training frame
glm.train(training_frame=train, x=predictors, y=response)

# Display the model summary (train and cross-validation metrics)
glm

glm Model Build progress: |███████████████████████████████████████████████| 100%
Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  GLM_model_python_1546888541236_1


ModelMetricsRegressionGLM: glm
** Reported on train data. **

MSE: 14.657957716881588
RMSE: 3.8285712370127825
MAE: 1.5820778578914771
RMSLE: NaN
R^2: 0.011352720074814804
Mean Residual Deviance: 14.657957716881588
Null degrees of freedom: 201916
Residual degrees of freedom: 201915
Null deviance: 2993677.2278820523
Residual deviance: 2959690.84831958
AIC: 1115165.4121226156

ModelMetricsRegressionGLM: glm
** Reported on cross-validation data. **

MSE: 14.688586907553649
RMSE: 3.832569230627628
MAE: 1.581835951219728
RMSLE: NaN
R^2: 0.009286847964305789
Mean Residual Deviance: 14.688586907553649
Null degrees of freedom: 201916
Residual degrees of freedom: 201915
Null deviance: 2993753.841715544
Residual deviance: 2965875.40261251
AIC: 1115586.8966018101
Cross-Validation Metrics Summary: 

0,1,2,3,4,5,6,7,8
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid
mae,1.581836,0.0203363,1.5764347,1.5607811,1.6022342,1.628451,1.5383326,1.584782
mean_residual_deviance,14.688586,0.6209948,14.677058,14.209316,15.113588,16.225391,13.331208,14.574956
mse,14.688586,0.6209948,14.677058,14.209316,15.113588,16.225391,13.331208,14.574956
null_deviance,498958.97,20944.828,498384.97,482819.6,513362.34,550849.1,453261.03,495076.72
r2,0.0092557,0.0003340,0.0089441,0.0095686,0.0092110,0.0085077,0.0100180,0.0092848
residual_deviance,494312.56,20898.559,493927.03,478186.12,508617.6,546033.06,448635.16,490476.4
rmse,3.8308656,0.0807933,3.831065,3.7695246,3.88762,4.028075,3.6511927,3.8177161
rmsle,0.0,,,,,,,


Scoring History: 


0,1,2,3,4,5
,timestamp,duration,iterations,negative_log_likelihood,objective
,2019-01-07 19:15:59,0.000 sec,0,2993677.3300601,14.8262768




## Training of Distributed Random Forest as Base Learner

In [7]:
# Initialize the Distributed Random Forest base learner.
# Only the cross-validation setup and the seed are specified; every other
# hyperparameter is left at the estimator's default.
rf_params = {
    "nfolds": nfolds,
    "fold_assignment": "Modulo",
    "keep_cross_validation_predictions": True,
    "seed": 42,
}
rf = H2ORandomForestEstimator(**rf_params)

# Fit on the full training frame
rf.train(training_frame=train, x=predictors, y=response)

# Display the model summary (metrics, scoring history, variable importances)
rf

drf Model Build progress: |███████████████████████████████████████████████| 100%
Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  DRF_model_python_1546888541236_2


ModelMetricsRegression: drf
** Reported on train data. **

MSE: 14.251576776447864
RMSE: 3.7751260610008592
MAE: 1.6717040629672444
RMSLE: NaN
Mean Residual Deviance: 14.251576776447864

ModelMetricsRegression: drf
** Reported on cross-validation data. **

MSE: 13.898325140297118
RMSE: 3.7280457535144493
MAE: 1.6385436562788875
RMSLE: NaN
Mean Residual Deviance: 13.898325140297118
Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7,8
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid
mae,1.6385437,0.0186309,1.6236911,1.6279423,1.6648855,1.6723722,1.5948086,1.6475625
mean_residual_deviance,13.898325,0.5586434,13.836195,13.758763,14.459031,15.068588,12.474749,13.792622
mse,13.898325,0.5586434,13.836195,13.758763,14.459031,15.068588,12.474749,13.792622
r2,0.0623494,0.0090544,0.0657226,0.0409735,0.0521213,0.0791969,0.0736191,0.0624630
residual_deviance,13.898325,0.5586434,13.836195,13.758763,14.459031,15.068588,12.474749,13.792622
rmse,3.7265198,0.0754164,3.7197037,3.7092807,3.802503,3.881828,3.531961,3.713842
rmsle,0.0,,,,,,,


Scoring History: 


0,1,2,3,4,5,6
,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2019-01-07 19:43:41,27 min 41.389 sec,0.0,,,
,2019-01-07 19:43:46,27 min 46.549 sec,1.0,5.1710716,2.0660715,26.7399812
,2019-01-07 19:43:54,27 min 54.711 sec,3.0,4.7390326,1.9396305,22.4584302
,2019-01-07 19:43:59,27 min 59.445 sec,4.0,4.6469042,1.9258547,21.5937183
,2019-01-07 19:44:03,28 min 3.597 sec,5.0,4.5793262,1.9132777,20.9702289
---,---,---,---,---,---,---
,2019-01-07 19:46:48,30 min 48.292 sec,43.0,3.7894312,1.6807265,14.3597889
,2019-01-07 19:46:56,30 min 55.959 sec,45.0,3.7841176,1.6772831,14.3195463
,2019-01-07 19:47:04,31 min 4.789 sec,47.0,3.7808008,1.6754934,14.2944545



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
weekofyear,3983742.2500000,1.0,0.0577982
hist_month_diff_mean,2520581.5,0.6327170,0.0365699
month,2128703.7500000,0.5343478,0.0308844
hist_authorized_flag_mean,1721233.0,0.4320643,0.0249726
hist_category_1_sum,1558961.0,0.3913308,0.0226182
---,---,---,---
new_category_3_A_mean,172282.0,0.0432463,0.0024996
new_category_2_2_mean,166570.6406250,0.0418126,0.0024167
new_purchase_month_nunique,120683.8828125,0.0302941,0.0017509



See the whole table with table.as_data_frame()




In [8]:
# Initialize the Gradient Boosting Machine base learner
gbm_params = {
    # Cross-validation setup, matching the GLM and DRF base learners
    "nfolds": nfolds,
    "fold_assignment": "Modulo",
    "keep_cross_validation_predictions": True,
    "seed": 42,
    # Tree and learning-rate settings; ntrees acts as an upper bound because
    # the stopping_* options below enable early stopping.
    "max_depth": 6,
    "ntrees": 10000,
    "learn_rate": 0.01,
    "learn_rate_annealing": 0.999,
    # Early stopping on RMSE, with the model scored every 10 trees
    "stopping_rounds": 5,
    "stopping_tolerance": 0.001,
    "stopping_metric": "rmse",
    "score_tree_interval": 10,
    # Row and column subsampling
    "sample_rate": 0.8,
    "col_sample_rate": 0.8,
}
gbm = H2OGradientBoostingEstimator(**gbm_params)

# Fit on the full training frame
gbm.train(training_frame=train, x=predictors, y=response)

# Display the model summary (metrics, scoring history, variable importances)
gbm

gbm Model Build progress: |███████████████████████████████████████████████| 100%
Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_model_python_1546888541236_3


ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 12.859487099150435
RMSE: 3.5860127020341737
MAE: 1.5416103356588835
RMSLE: NaN
Mean Residual Deviance: 12.859487099150435

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 13.504546072679164
RMSE: 3.6748532042353967
MAE: 1.564044290444925
RMSLE: NaN
Mean Residual Deviance: 13.504546072679164
Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7,8
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid,cv_6_valid
mae,1.5640444,0.0184761,1.5589854,1.5495439,1.5860294,1.6010317,1.5195141,1.5691615
mean_residual_deviance,13.504545,0.5703331,13.43026,13.37311,14.004401,14.745635,12.061675,13.412193
mse,13.504545,0.5703331,13.43026,13.37311,14.004401,14.745635,12.061675,13.412193
r2,0.0890768,0.0084026,0.0931331,0.0678547,0.0819251,0.0989318,0.1042942,0.0883222
residual_deviance,13.504545,0.5703331,13.43026,13.37311,14.004401,14.745635,12.061675,13.412193
rmse,3.6731944,0.0780653,3.6647317,3.6569262,3.7422454,3.8400044,3.4729922,3.6622663
rmsle,0.0,,,,,,,


Scoring History: 


0,1,2,3,4,5,6
,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2019-01-07 20:11:27,24 min 10.445 sec,0.0,3.8504904,1.5897828,14.8262763
,2019-01-07 20:11:33,24 min 16.314 sec,10.0,3.8206558,1.5746784,14.5974108
,2019-01-07 20:11:39,24 min 22.015 sec,20.0,3.7954829,1.5640910,14.4056906
,2019-01-07 20:11:44,24 min 27.658 sec,30.0,3.7738085,1.5562417,14.2416304
,2019-01-07 20:11:50,24 min 33.291 sec,40.0,3.7553723,1.5506425,14.1028214
---,---,---,---,---,---,---
,2019-01-07 20:14:16,26 min 59.593 sec,290.0,3.5939114,1.5418988,12.9161995
,2019-01-07 20:14:22,27 min 5.509 sec,300.0,3.5916069,1.5419386,12.8996404
,2019-01-07 20:14:28,27 min 11.468 sec,310.0,3.5893296,1.5418357,12.8832867



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
hist_month_diff_mean,3248946.0,1.0,0.1637460
hist_category_1_sum,1902634.3750000,0.5856159,0.0958923
weekofyear,1769789.0,0.5447271,0.0891969
new_month_lag_mean,1178773.3750000,0.3628172,0.0594099
hist_purchase_month_nunique,1072845.5,0.3302134,0.0540711
---,---,---,---
hist_purchase_amount_max,2437.5871582,0.0007503,0.0001229
new_category_2_2_mean,2362.5808105,0.0007272,0.0001191
new_category_2_5_sum,902.0308228,0.0002776,0.0000455



See the whole table with table.as_data_frame()




In [9]:
# Stack the three cross-validated base learners into a single ensemble
base_learners = [glm, rf, gbm]
ensemble = H2OStackedEnsembleEstimator(base_models=base_learners)
ensemble.train(training_frame=train, x=predictors, y=response)


stackedensemble Model Build progress: |███████████████████████████████████| 100%


In [13]:
# Display the stacked ensemble's model details and its metrics on the training data
ensemble

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_model_python_1546888541236_4
No model summary for this model


ModelMetricsRegressionGLM: stackedensemble
** Reported on train data. **

MSE: 11.997079353114868
RMSE: 3.4636800304177733
MAE: 1.521865343346612
RMSLE: NaN
R^2: 0.19082316250350317
Mean Residual Deviance: 11.997079353114868
Null degrees of freedom: 201916
Residual degrees of freedom: 201914
Null deviance: 2993677.2278820523
Residual deviance: 2422414.271742895
AIC: 1074719.3696205097




In [14]:
# Score the test frame with the stacked ensemble
predictions = ensemble.predict(test_data=test)

stackedensemble prediction progress: |████████████████████████████████████| 100%


In [12]:
# Create submission dataframe and csv export file.
# Pull both columns out of H2O as pandas Series. The prediction column is
# selected by name ("predict") instead of assigning the whole prediction
# DataFrame to a single column, which only works while that frame happens to
# contain exactly one column.
card_ids = test["card_id"].as_data_frame()["card_id"]
predicted_target = predictions["predict"].as_data_frame()["predict"]

# Guard against a silently misaligned submission
if len(card_ids) != len(predicted_target):
    raise ValueError(
        "Row count mismatch: {} card ids vs {} predictions".format(
            len(card_ids), len(predicted_target)
        )
    )

# .values drops the pandas index so rows are paired purely by position
sub_df = pd.DataFrame({"card_id": card_ids.values, "target": predicted_target.values})
sub_df.to_csv("submit.csv", index=False)