# In [None]:
import h2o
import numpy as np
import pandas as pd

# Build a synthetic "people" data set: id, sex, age, two 0-9 lifestyle
# scores, and an income derived from them plus uniform noise.
# The NumPy seed is fixed so every run produces the same data.
np.random.seed(123)
N = 1000  # Number of rows (people) to generate.

# One row per person, keyed by a sequential id.
d = pd.DataFrame({'id': range(N)})

# Creating the sex column (differs from the video):
# even ids are 'Male', odd ids are 'Female'.
sex = np.array(['Male', 'Female'])
d = d.assign(sex=sex[d.id.values % len(sex)])

# Age is drawn uniformly from 18 to 65 and rounded to whole years.
d = d.assign(age=np.random.uniform(18, 65, N).round())

# Creating the healthyEating column: normal(mean 5, sd 2), rounded to a
# whole number and clamped to the 0..9 range.
v = np.random.normal(5, 2, N).round()
v = np.clip(v, 0, 9)
d = d.assign(healthyEating=v)

# Creating the activeLifestyle column: same distribution and clamping.
v = np.random.normal(5, 2, N).round()
v = np.clip(v, 0, 9)
d = d.assign(activeLifestyle=v)

# Creating the income column: a base of 20000 plus (3 * age) squared,
# +500 per healthyEating point, -300 per activeLifestyle point,
# plus uniform noise in [0, 5000).
v = 20000 + (d.age.values * 3) ** 2
v += d.healthyEating.values * 500
v -= d.activeLifestyle.values * 300
v += np.random.uniform(0, 5000, N)
# Round the income to the nearest 100. A negative `decimals` rounds to the
# left of the decimal point; the previous round(2) only trimmed to two
# decimal places, contradicting the intended "nearest 100".
d = d.assign(income=v.round(-2))

# Connect to an H2O cluster before any frames or models are created.
h2o.init()

# Upload the pandas data frame, naming the resulting H2O frame 'people2'.
people2 = h2o.H2OFrame(d, destination_frame="people2")

# Re-fetch a handle to that same frame by its name.
people2 = h2o.get_frame("people2")

# Split into train/test frames: roughly 89.7% of the rows go to train and
# the remainder to test. The seed is fixed so the split is repeatable.
train, test = people2.split_frame(
    ratios=[0.897],
    destination_frames=["people2_train", "people2_test"],
    seed=123,
)

from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Predicting the "income" column from every other column except "id".
y = "income"
ignoreFields = [y, "id"]
x = [name for name in train.names if name not in ignoreFields]

# Building the baseline GBM model with 6-fold cross-validation.
# NOTE(review): the model_id says "9folds" but nfolds is 6 - confirm which
# one is intended.
m3 = H2OGradientBoostingEstimator(model_id="def9folds", nfolds=6)
m3.train(x=x, y=y, training_frame=train)

# Mean absolute error on each data split. These are notebook-style bare
# expressions: the values are computed but not printed when run as a script.
m3.mae(train=True)  # Training.
m3.mae(xval=True)  # Cross-validation.
perf = m3.model_performance(test)  # Testing.
perf.mae()

# Overfitting the model on purpose by increasing ntrees, max_depth and
# learn_rate; cross-validation stays at 6 folds, as for m3.
m4 = H2OGradientBoostingEstimator(
    model_id="overfit_model",
    ntrees=1000,
    max_depth=10,
    learn_rate=0.9,
    nfolds=6,
)
m4.train(x=x, y=y, training_frame=train)

# Side-by-side MAE, baseline (m3) --> overfit (m4), for each data split.
# The %d conversion shows only the whole-number part of each MAE.
print("Train: %d  - ->  %d" % (m3.mae(train=True), m4.mae(train=True)))
print("Valid: %d  - ->  %d" % (m3.mae(xval=True), m4.mae(xval=True)))
print(" Test: %d  - ->  %d" % (perf.mae(), m4.model_performance(test).mae()))