In [1]:
#Responsible Machine Learning Class by Professor Hall
#Assignment1 h2o monotonic gradient boosting machine

import datetime                                               # for timestamp 
import time                                                   # for timers

import h2o                                                    # base h2o package for python
from h2o.estimators.gbm import H2OGradientBoostingEstimator   # h2o GBM
from h2o.grid.grid_search import H2OGridSearch                # h2o grid search
import math                                                   # math functions

import matplotlib.pyplot as plt                               # basic plotting
import numpy as np                                            # for basic array manipulation                            
import pandas as pd                                           # for dataframe manipulation


# global random seed; reused later for numpy's RNG, the GBM models and the random grid search
SEED = 12345

# number of threads the h2o cluster is allowed to use
NTHREAD = 4

# start (or connect to) a local h2o cluster with a 6 GB memory cap and NTHREAD threads
h2o.init(nthreads=NTHREAD, max_mem_size='6G')

# remove every object currently held by the h2o cluster
h2o.remove_all()

# turn off h2o progress indicators
h2o.no_progress()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 day 5 hours 28 mins
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.3
H2O_cluster_version_age:,11 days
H2O_cluster_name:,H2O_from_python_minhyekim_9me9p1
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,5.961 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [2]:
# start a notebook-level wall-clock timer (seconds since the epoch)
tic = time.time()

In [3]:
# load the preprocessed HMDA training and test sets from CSV files in the working directory
data = pd.read_csv('hmda_train_preprocessed.csv')
test = pd.read_csv('hmda_test_preprocessed.csv')

In [4]:
# display the training frame (pandas truncates the rendered rows and columns)
data

Unnamed: 0,row_id,black,asian,white,amind,hipac,hispanic,non_hispanic,male,female,...,conforming,debt_to_income_ratio_missing,loan_amount_std,loan_to_value_ratio_std,no_intro_rate_period_std,intro_rate_period_std,property_value_std,income_std,debt_to_income_ratio_std,high_priced
0,0,,,,,,,,1.0,0.0,...,1,0,-0.514393,0.333922,0.244394,-0.215304,-0.535932,-0.040307,0.854601,0
1,1,0.0,0.0,1.0,0.0,0.0,0.0,1.0,,,...,1,0,-0.118642,0.268727,0.244394,-0.215304,-0.227585,-0.018133,-0.425131,0
2,2,,,,,,,,,,...,1,0,-0.778227,0.228996,-4.091747,4.610857,-0.720941,-0.032338,0.123326,0
3,3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,,,...,1,0,-0.074670,-1.150240,0.244394,-0.215304,0.358276,-0.018133,-0.425131,0
4,4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,,,...,1,0,-0.602338,0.552520,0.244394,-0.215304,-0.628437,-0.038228,0.763191,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
160333,160333,,,,,,,,,,...,1,0,0.365054,0.663316,0.244394,-0.215304,0.019094,-0.015361,1.585876,0
160334,160334,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1,0,-0.558366,0.552520,0.244394,-0.215304,-0.597602,-0.039268,1.585876,0
160335,160335,,,,,,,,,,...,0,0,9.599253,-0.463082,-4.091747,2.197776,9.084518,0.270486,0.397554,0
160336,160336,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1,0,-0.998089,0.552520,0.244394,-0.215304,-0.905950,-0.039614,-0.425131,0


In [5]:
# target column: binary high-priced loan indicator
y_name = 'high_priced'

# model inputs, one per line for easier review and diffing
x_names = [
    'term_360',
    'conforming',
    'debt_to_income_ratio_missing',
    'loan_amount_std',
    'loan_to_value_ratio_std',
    'no_intro_rate_period_std',
    'intro_rate_period_std',
    'property_value_std',
    'income_std',
    'debt_to_income_ratio_std',
]

In [6]:
# seed numpy's global RNG so the random partition below is exactly reproducible
np.random.seed(SEED)

# share of rows assigned to training; the remainder becomes the validation partition
split_ratio = 0.7

# one uniform draw per row decides membership: True -> train, False -> validation
split = np.random.rand(len(data)) < split_ratio
train, valid = data[split], data[~split]

# summarize split (DataFrame.shape is a (rows, columns) tuple)
print('Train data rows = %d, columns = %d' % train.shape)
print('Validation data rows = %d, columns = %d' % valid.shape)

Train data rows = 112253, columns = 23
Validation data rows = 48085, columns = 23


In [7]:
# convert the pandas train/validation partitions to h2o frames;
# the target column is parsed as 'enum' (categorical), matching the bernoulli GBM trained below
htrain = h2o.H2OFrame(train, column_types={'high_priced':'enum'})
hvalid = h2o.H2OFrame(valid, column_types={'high_priced':'enum'})

In [8]:
# start local timer (covers constraint setup as well as model training)
gbm_tic = time.time()

# numeric inputs that receive a monotonic constraint
numeric_x_names = ['loan_amount_std', 'loan_to_value_ratio_std', 'intro_rate_period_std',
                   'property_value_std', 'income_std', 'debt_to_income_ratio_std']

# direction of each constraint = sign of the input's correlation with the target on the
# training partition; the target is the last entry of that column, so it is sliced off
target_corr = train[numeric_x_names + [y_name]].corr()[y_name].values[:-1]
mono = tuple(int(direction) for direction in np.sign(target_corr))
monotone_constraints = dict(zip(numeric_x_names, mono))

# monotonic GBM with initial settings
gbm = H2OGradientBoostingEstimator(
    distribution='bernoulli',
    # "more than enough" trees: early stopping decides how many are actually built
    ntrees=10000,
    # small learning rate
    learn_rate=0.01,
    # stop once AUC has not improved by at least 1e-4 (0.01%) over 5 consecutive scoring events
    stopping_rounds=5,
    stopping_tolerance=1e-4,
    stopping_metric="AUC",
    # sample 80% of columns per split
    col_sample_rate=0.8,
    # fix the seed for reproducibility
    seed=SEED,
    # score every 10 trees so early stopping is reproducible (it depends on the scoring interval)
    score_tree_interval=10,
    # monotonic constraints derived above
    monotone_constraints=monotone_constraints,
)

gbm.train(x=x_names, y=y_name, training_frame=htrain, validation_frame=hvalid)

# end local timer (gbm_toc holds the elapsed seconds, not a timestamp)
gbm_toc = time.time() - gbm_tic

print('GBM training completed in %.2f s.' % (gbm_toc))

GBM training completed in 75.31 s.


In [9]:
# print the model report: summary, train/validation metrics, scoring history, variable importances
print(gbm)

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_model_python_1622312018704_15396


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,830.0,830.0,204654.0,0.0,5.0,4.7,1.0,30.0,14.871084




ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.07889051461523487
RMSE: 0.2808745531642816
LogLoss: 0.2645382971320806
Mean Per-Class Error: 0.25814148113413404
AUC: 0.8019360168037477
AUCPR: 0.2555770986465979
Gini: 0.6038720336074954

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.15173930204873493: 


Unnamed: 0,Unnamed: 1,0.0,1.0,Error,Rate
0,0.0,79205.0,22121.0,0.2183,(22121.0/101326.0)
1,1.0,4044.0,6883.0,0.3701,(4044.0/10927.0)
2,Total,83249.0,29004.0,0.2331,(26165.0/112253.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.151739,0.344745,196.0
1,max f2,0.09551,0.515429,258.0
2,max f0point5,0.236314,0.288517,125.0
3,max accuracy,0.876121,0.902648,0.0
4,max precision,0.708533,0.483871,14.0
5,max recall,0.002781,1.0,397.0
6,max specificity,0.876121,0.99999,0.0
7,max absolute_mcc,0.116401,0.294775,234.0
8,max min_per_class_accuracy,0.131618,0.726033,217.0
9,max mean_per_class_accuracy,0.093782,0.741859,260.0



Gains/Lift Table: Avg response rate:  9.73 %, avg score:  9.72 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.011759,0.388077,3.720069,3.720069,0.362121,0.485527,0.362121,0.485527,0.043745,0.043745,272.006886,272.006886,0.035435
1,2,0.020766,0.347277,3.322719,3.54773,0.323442,0.366766,0.345345,0.434018,0.029926,0.073671,232.271897,254.773049,0.05861
2,3,0.030066,0.324998,2.94217,3.36041,0.286398,0.337356,0.327111,0.404117,0.027363,0.101034,194.21696,236.041032,0.078621
3,4,0.040525,0.290252,3.202654,3.319697,0.311755,0.30932,0.323148,0.379652,0.033495,0.134529,220.265385,231.969674,0.104142
4,5,0.050796,0.283624,2.583841,3.170899,0.251518,0.285916,0.308664,0.360698,0.02654,0.161069,158.384052,217.089943,0.122165
5,6,0.100621,0.242106,2.676158,2.925916,0.260504,0.26143,0.284816,0.311543,0.133339,0.294408,167.615797,192.59159,0.214685
6,7,0.150571,0.203177,2.35251,2.735697,0.228999,0.221705,0.2663,0.28174,0.117507,0.411915,135.251002,173.569659,0.289528
7,8,0.200012,0.17563,2.13234,2.586551,0.207568,0.190955,0.251782,0.259299,0.105427,0.517342,113.234027,158.655052,0.351551
8,9,0.300019,0.136746,1.814658,2.329253,0.176644,0.154505,0.226736,0.224368,0.181477,0.698819,81.465759,132.925288,0.441807
9,10,0.400123,0.100436,1.354861,2.085476,0.131886,0.118285,0.203006,0.197827,0.135627,0.834447,35.486129,108.5476,0.481161




ModelMetricsBinomial: gbm
** Reported on validation data. **

MSE: 0.07754336748209946
RMSE: 0.27846609754528373
LogLoss: 0.2607796747905575
Mean Per-Class Error: 0.2598364028995881
AUC: 0.8013291434108669
AUCPR: 0.25133570208102307
Gini: 0.6026582868217338

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.14221802934777178: 


Unnamed: 0,Unnamed: 1,0.0,1.0,Error,Rate
0,0.0,33018.0,10489.0,0.2411,(10489.0/43507.0)
1,1.0,1518.0,3060.0,0.3316,(1518.0/4578.0)
2,Total,34536.0,13549.0,0.2497,(12007.0/48085.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.142218,0.337618,200.0
1,max f2,0.09282,0.508026,257.0
2,max f0point5,0.232898,0.286492,121.0
3,max accuracy,0.864563,0.904814,0.0
4,max precision,0.864563,1.0,0.0
5,max recall,0.001524,1.0,399.0
6,max specificity,0.864563,1.0,0.0
7,max absolute_mcc,0.118223,0.288988,226.0
8,max min_per_class_accuracy,0.13147,0.723861,212.0
9,max mean_per_class_accuracy,0.082399,0.740164,270.0



Gains/Lift Table: Avg response rate:  9.52 %, avg score:  9.72 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.011584,0.388077,3.658309,3.658309,0.348294,0.485061,0.348294,0.485061,0.042377,0.042377,265.830884,265.830884,0.034033
1,2,0.020297,0.347277,3.710065,3.680528,0.353222,0.366245,0.35041,0.434053,0.032329,0.074705,271.006505,268.052795,0.060133
2,3,0.030051,0.319873,2.687461,3.358211,0.255864,0.336048,0.319723,0.402244,0.026212,0.100917,168.74614,235.821085,0.078323
3,4,0.040158,0.293666,3.176983,3.312599,0.302469,0.309896,0.315381,0.379001,0.03211,0.133028,217.698305,231.259888,0.102642
4,5,0.050244,0.284425,2.94531,3.238867,0.280412,0.286586,0.308361,0.36045,0.029707,0.162735,194.530993,223.886745,0.124327
5,6,0.10001,0.245148,2.66428,2.952948,0.253656,0.263021,0.28114,0.311968,0.132591,0.295325,166.427975,195.294764,0.215867
6,7,0.152085,0.203176,2.290299,2.726054,0.218051,0.222593,0.259538,0.281366,0.119266,0.414592,129.029882,172.605408,0.290129
7,8,0.200021,0.177139,2.118926,2.580553,0.201735,0.191866,0.245685,0.259917,0.101573,0.516164,111.892632,158.055299,0.349409
8,9,0.300114,0.137073,1.817871,2.326185,0.173073,0.155126,0.221468,0.224967,0.181957,0.698121,81.787062,132.61846,0.439887
9,10,0.400042,0.100417,1.329058,2.07711,0.126535,0.118015,0.197754,0.198252,0.132809,0.830931,32.905826,107.711036,0.476229




Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_pr_auc,validation_lift,validation_classification_error
0,,2021-05-30 19:42:01,0.017 sec,0.0,0.296424,0.319205,0.5,0.097343,1.0,0.902657,0.293508,0.314447,0.5,0.095206,1.0,0.904794
1,,2021-05-30 19:42:02,0.793 sec,10.0,0.29433,0.312319,0.778317,0.245437,3.945582,0.192449,0.291431,0.307672,0.779166,0.239764,3.93442,0.222959
2,,2021-05-30 19:42:03,1.637 sec,20.0,0.292568,0.30677,0.777676,0.245242,3.966061,0.222123,0.289688,0.302217,0.77818,0.239807,4.009106,0.223916
3,,2021-05-30 19:42:04,2.475 sec,30.0,0.291151,0.302423,0.777409,0.245749,4.010631,0.196761,0.288281,0.297911,0.777927,0.240337,4.082413,0.224935
4,,2021-05-30 19:42:05,3.291 sec,40.0,0.289897,0.298654,0.776893,0.245605,3.966061,0.211558,0.287043,0.294194,0.77729,0.240031,4.009106,0.22741
5,,2021-05-30 19:42:05,4.084 sec,50.0,0.288883,0.295628,0.77789,0.24657,4.014344,0.212449,0.286041,0.291199,0.778267,0.240643,4.082413,0.227077
6,,2021-05-30 19:42:06,4.893 sec,60.0,0.287967,0.292804,0.777237,0.246579,4.017656,0.206329,0.285146,0.288429,0.777619,0.240821,4.082413,0.219819
7,,2021-05-30 19:42:07,5.688 sec,70.0,0.287234,0.290543,0.777335,0.246856,4.017656,0.20615,0.284424,0.286188,0.777717,0.241135,4.090402,0.218571
8,,2021-05-30 19:42:08,6.497 sec,80.0,0.28658,0.288483,0.777676,0.247227,4.020974,0.205678,0.28378,0.284145,0.77821,0.241877,4.082413,0.196319
9,,2021-05-30 19:42:09,7.358 sec,90.0,0.285993,0.286623,0.778794,0.247759,4.022813,0.211317,0.283197,0.282282,0.779441,0.242649,4.082413,0.232484



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,loan_to_value_ratio_std,24040.646484,1.0,0.434805
1,property_value_std,12112.411133,0.503831,0.219068
2,debt_to_income_ratio_std,8764.608398,0.364575,0.158519
3,loan_amount_std,4896.093262,0.203659,0.088552
4,intro_rate_period_std,3264.677734,0.135798,0.059046
5,no_intro_rate_period_std,1372.376221,0.057086,0.024821
6,term_360,346.675354,0.01442,0.00627
7,income_std,228.102097,0.009488,0.004126
8,conforming,184.747742,0.007685,0.003341
9,debt_to_income_ratio_missing,80.307571,0.00334,0.001452





In [None]:
# start local timer
grid_tic = time.time()

# candidate tree depths 3..13 (the upper bound of range() is exclusive)
hyper_params = {'max_depth': list(range(3, 14))}

# Cartesian grid search over max_depth, reusing the settings of the GBM built above
grid = H2OGridSearch(gbm,
                     hyper_params=hyper_params,
                     grid_id='depth_grid',
                     search_criteria={'strategy': "Cartesian"})

# train one model per depth on the same train/validation frames
grid.train(x=x_names, y=y_name, training_frame=htrain, validation_frame=hvalid)

# end local timer (grid_toc holds the elapsed seconds)
grid_toc = time.time() - grid_tic

print('GBM Grid Search completed in %.2f s.' % (grid_toc))

In [None]:
## retrieve the max_depth grid ordered by AUC, highest first, and print it
sorted_grid = grid.get_grid(sort_by='auc',decreasing=True)
print(sorted_grid)

In [None]:
## print the grid in its default ordering
## NOTE(review): the original comment said "sort by logloss", but no sort_by is passed here, so
## this relies on h2o's default grid ordering -- confirm it is logloss before relying on it
print(grid)

In [None]:
# hyper-parameter search space (range() bounds are inclusive..exclusive)
hyper_params_tune = dict(
    # tree depths 3..13
    max_depth=list(range(3, 14)),
    # row and column sampling rates 0.20..1.00 in steps of 0.01
    sample_rate=[pct / 100. for pct in range(20, 101)],
    col_sample_rate=[pct / 100. for pct in range(20, 101)],
    col_sample_rate_per_tree=[pct / 100. for pct in range(20, 101)],
    # per-level change in column sampling 0.90..1.10 in steps of 0.01
    col_sample_rate_change_per_level=[pct / 100. for pct in range(90, 111)],
    # minimum rows per leaf 1..20
    min_rows=list(range(1, 21)),
    # histogram bin counts: powers of two 16..1024 (numeric) and 16..4096 (categorical)
    nbins=[2 ** exp for exp in range(4, 11)],
    nbins_cats=[2 ** exp for exp in range(4, 13)],
    min_split_improvement=[0, 1e-8, 1e-6, 1e-4],
    histogram_type=["UniformAdaptive", "QuantilesGlobal", "RoundRobin"],
)

# random search settings
search_criteria_tune = dict(
    strategy="RandomDiscrete",
    max_runtime_secs=3600,   # limit the whole search to 60 minutes
    max_models=100,          # build no more than 100 models
    seed=SEED,
    stopping_rounds=5,
    stopping_metric="AUC",
    stopping_tolerance=1e-3,
)

In [None]:
# start local timer
# (the original assigned this to the misspelled name `fianl_grid_tic`, so computing the elapsed
#  time below raised NameError only after the whole grid search had finished)
final_grid_tic = time.time()

# base monotonic GBM for the random search: larger initial learning rate with annealing,
# early stopping on AUC, and the monotone constraints derived for the initial GBM
gbm_final_grid = H2OGradientBoostingEstimator(distribution='bernoulli',
                                              ntrees=10000, learn_rate=0.05, learn_rate_annealing = 0.99,
                                              score_tree_interval = 10, seed = SEED,
                                              stopping_rounds = 5, stopping_metric = "AUC", stopping_tolerance = 1e-4,
                                              monotone_constraints=monotone_constraints)

# random grid search over hyper_params_tune, driven by search_criteria_tune
final_grid = H2OGridSearch(gbm_final_grid, hyper_params = hyper_params_tune,
                           grid_id = 'final_grid', search_criteria = search_criteria_tune)

# train grid search
final_grid.train(x=x_names, y=y_name,
                 ## early stopping based on timeout (no model should take more than 1 hour - modify as needed)
                 max_runtime_secs = 3600,
                 training_frame=htrain, validation_frame=hvalid)


# end local timer (final_grid_toc holds the elapsed seconds)
final_grid_toc = time.time() - final_grid_tic

print('GBM Grid Search completed in %.2f s.' % (final_grid_toc))

In [None]:
# retrieve the final grid ordered by AUC, highest first, and print it
sorted_final_grid = final_grid.get_grid(sort_by='auc',decreasing=True)
print(sorted_final_grid)

In [None]:
# print the final grid in its default ordering
# NOTE(review): the original comment said "sort by logloss", but no sort_by is passed here, so
# this relies on h2o's default grid ordering -- confirm it is logloss before relying on it
print(final_grid)

In [None]:
# select the best model: first entry of the grid sorted by AUC in descending order
best_gbm = sorted_final_grid[0]
print(best_gbm)

In [None]:
# submission: score the test set with the best model and bring the predictions back to pandas
test_preds = best_gbm.predict(h2o.H2OFrame(test)).as_data_frame()

# keep only the remaining probability column and rename it to 'phat'
# NOTE(review): assumes the prediction frame has exactly the columns 'predict', '0.0' and one
# more; h2o often names binomial probability columns 'p0'/'p1' -- confirm before running
best_gbm_submit = test_preds.drop(['predict', '0.0'], axis=1)
best_gbm_submit.columns = ['phat']

# timestamped file name so repeated submissions never overwrite each other
submit_stamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
best_gbm_submit.to_csv('group5_mgbm_' + submit_stamp + '.csv', index=False)

In [None]:
# shut down the h2o cluster without asking for interactive confirmation
h2o.cluster().shutdown(prompt=False)