In [28]:
pip install h2o



In [29]:
import pandas as pd
import numpy as np
import h2o
# Let pandas use up to 5000 characters per line when rendering text output
pd.options.display.width = 5000

In [30]:
# Connect to the local H2O cluster. Per the H2O documentation, init() starts a new
# local server when it cannot connect to an existing one.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 hour 39 mins
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.5
H2O_cluster_version_age:,6 days
H2O_cluster_name:,H2O_from_python_unknownUser_5t57bb
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.140 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


## Importing Data

In [31]:
# White-wine quality CSV hosted by the UCI Machine Learning Repository
WINE_CSV_URL = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "wine-quality/winequality-white.csv"
)

# Parse the remote file into an H2OFrame and preview the first five rows
wine = h2o.import_file(WINE_CSV_URL)
wine.head(rows=5)

Parse progress: |█████████████████████████████████████████████████████████| 100%


fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6




## Features and Target Column

In [33]:
# Predictors = every column of the frame except the target, 'quality'.
# .index() raises ValueError if the target column is missing, so a wrong
# column name fails loudly here rather than later during training.
all_columns = list(wine.columns)
target_pos = all_columns.index('quality')
features = all_columns[:target_pos] + all_columns[target_pos + 1:]
features

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol']

## Train Test Split

In [34]:
# Partition the H2O frame into a training set and a hold-out test set.
# ratios=[0.8] requests roughly 80% of the rows for training; the remainder
# (about 20%) is kept aside to evaluate the models on unseen data.
# The seed makes the split reproducible.
wine_split = wine.split_frame(seed=1234, ratios=[0.8])
wine_train, wine_test = wine_split

In [35]:
# Show the shape of the training and test frames to sanity-check the split
wine_train.shape, wine_test.shape

((3932, 12), (966, 12))

## Model Building

In [36]:
# Baseline model: a Gradient Boosting Machine (GBM) with H2O's default settings
from h2o.estimators.gbm import H2OGradientBoostingEstimator

# Only the model id and a seed (for reproducibility) are set explicitly
gbm_default = H2OGradientBoostingEstimator(seed=1234, model_id='gbm_default')

# Fit on the training frame: `features` are the predictors, 'quality' the target
gbm_default.train(training_frame=wine_train, x=features, y='quality')

# Display the fitted model: summary, training metrics, variable importances
gbm_default

gbm Model Build progress: |███████████████████████████████████████████████| 100%
Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  gbm_default


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,50.0,50.0,17161.0,5.0,5.0,5.0,9.0,30.0,22.64




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 0.33754946668043595
RMSE: 0.5809900745111193
MAE: 0.4582897982543992
RMSLE: 0.0859869651179757
Mean Residual Deviance: 0.33754946668043595

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2021-03-22 19:00:43,0.010 sec,0.0,0.890085,0.676833,0.792252
1,,2021-03-22 19:00:43,0.177 sec,1.0,0.857675,0.650362,0.735607
2,,2021-03-22 19:00:43,0.214 sec,2.0,0.829503,0.629175,0.688075
3,,2021-03-22 19:00:43,0.243 sec,3.0,0.805849,0.614956,0.649392
4,,2021-03-22 19:00:43,0.269 sec,4.0,0.78505,0.603729,0.616303
5,,2021-03-22 19:00:43,0.293 sec,5.0,0.76761,0.595293,0.589226
6,,2021-03-22 19:00:44,0.323 sec,6.0,0.752163,0.587399,0.565749
7,,2021-03-22 19:00:44,0.354 sec,7.0,0.738128,0.57921,0.544834
8,,2021-03-22 19:00:44,0.387 sec,8.0,0.726528,0.572042,0.527844
9,,2021-03-22 19:00:44,0.410 sec,9.0,0.716371,0.565957,0.513187



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,alcohol,3494.373047,1.0,0.371235
1,volatile acidity,1526.859497,0.436948,0.162211
2,free sulfur dioxide,1075.085571,0.307662,0.114215
3,residual sugar,453.860748,0.129883,0.048217
4,total sulfur dioxide,444.470703,0.127196,0.04722
5,pH,435.343323,0.124584,0.04625
6,fixed acidity,432.337097,0.123724,0.045931
7,chlorides,404.599854,0.115786,0.042984
8,citric acid,399.18576,0.114237,0.042409
9,sulphates,374.836487,0.107269,0.039822




## Checking the model on Test set

In [37]:
# Score the default GBM on the hold-out test frame and show its regression metrics
gbm_default.model_performance(test_data=wine_test)


ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 0.4569904494269439
RMSE: 0.6760106873614824
MAE: 0.5248612169030329
RMSLE: 0.10032043310648844
Mean Residual Deviance: 0.4569904494269439




## Making Predictions

In [38]:
# Predict wine quality for the test frame with the default GBM,
# then preview the first five predictions
yhat_test_gbm = gbm_default.predict(test_data=wine_test)
yhat_test_gbm.head(rows=5)

gbm prediction progress: |████████████████████████████████████████████████| 100%


predict
5.78661
5.96088
5.32867
6.19424
5.7198




## Model Tuning

In [39]:
# Manual tuning settings for the GBM. The estimator cell that follows passes the
# same values as keyword arguments.
#
# NOTE: no trailing commas on these assignments. `name = value,` binds a
# one-element tuple such as (10000,), not the scalar value itself.

# Upper bound on the number of trees; more trees allow more accuracy
ntrees = 10000
# Random row sampling for better generalization
sample_rate = 0.9
# Random column sampling for better generalization
col_sample_rate = 0.9
# Number of cross-validation folds
nfolds = 5
# Early stopping: the metric that is monitored ...
stopping_metric = 'mse'
# ... and the number of scoring rounds it is monitored over, so that early
# stopping (rather than ntrees) determines the final number of trees
stopping_rounds = 15
# Score the model after every tree
score_tree_interval = 1

In [40]:
# GBM with hand-picked settings, 5-fold cross-validation and early stopping.
# All constructor arguments are collected in one dict so they are easy to read.
manual_gbm_params = dict(
    model_id='gbm_manual_cv_es',
    seed=1234,                # reproducibility
    ntrees=10000,             # upper bound on the number of trees
    sample_rate=0.9,          # row sampling
    col_sample_rate=0.9,      # column sampling
    nfolds=5,                 # cross-validation folds
    stopping_metric='mse',    # early-stopping metric
    stopping_rounds=15,       # early-stopping patience
    score_tree_interval=1,    # score after every tree
)
gbm_manual_cv_es = H2OGradientBoostingEstimator(**manual_gbm_params)

# Fit on the training frame
gbm_manual_cv_es.train(training_frame=wine_train, x=features, y='quality')

# Request the model summary.
# NOTE(review): this is not the last expression in the cell, so its return
# value is presumably not rendered. Confirm whether this call is still needed.
gbm_manual_cv_es.summary()

# Display the model, including its cross-validation metrics
gbm_manual_cv_es

gbm Model Build progress: |███████████████████████████████████████████████| 100%
Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  gbm_manual_cv_es


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,179.0,179.0,55586.0,5.0,5.0,5.0,7.0,31.0,20.005587




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 0.20686547419217588
RMSE: 0.4548246631309431
MAE: 0.34894778424095163
RMSLE: 0.06741983008017692
Mean Residual Deviance: 0.20686547419217588

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 0.43719485835036365
RMSE: 0.6612071221261637
MAE: 0.5071563697468089
RMSLE: 0.09876420394757868
Mean Residual Deviance: 0.43719485835036365

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,0.5071095,0.00396057,0.50494814,0.50705534,0.51254624,0.508914,0.502084
1,mean_residual_deviance,0.43729985,0.013255834,0.43033668,0.43702114,0.45930818,0.42439777,0.43543547
2,mse,0.43729985,0.013255834,0.43033668,0.43702114,0.45930818,0.42439777,0.43543547
3,r2,0.44752982,0.017638825,0.45243278,0.42834046,0.43110242,0.47032693,0.4554465
4,residual_deviance,0.43729985,0.013255834,0.43033668,0.43702114,0.45930818,0.42439777,0.43543547
5,rmse,0.6612265,0.009958742,0.65600055,0.6610758,0.67772275,0.6514582,0.65987533
6,rmsle,0.09877549,0.0012305396,0.097171605,0.09819931,0.100348435,0.09957606,0.09858205



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2021-03-22 19:03:40,18.211 sec,0.0,0.890085,0.676833,0.792252
1,,2021-03-22 19:03:40,18.224 sec,1.0,0.859876,0.651339,0.739387
2,,2021-03-22 19:03:40,18.235 sec,2.0,0.832126,0.629717,0.692433
3,,2021-03-22 19:03:40,18.248 sec,3.0,0.808096,0.615587,0.653019
4,,2021-03-22 19:03:40,18.263 sec,4.0,0.788204,0.605878,0.621266
5,,2021-03-22 19:03:40,18.278 sec,5.0,0.769911,0.596446,0.592763
6,,2021-03-22 19:03:40,18.292 sec,6.0,0.754075,0.588348,0.568629
7,,2021-03-22 19:03:40,18.309 sec,7.0,0.741317,0.581034,0.549551
8,,2021-03-22 19:03:40,18.324 sec,8.0,0.729126,0.574067,0.531624
9,,2021-03-22 19:03:40,18.339 sec,9.0,0.717939,0.567376,0.515436



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,alcohol,3570.825928,1.0,0.300384
1,volatile acidity,1634.188354,0.45765,0.137471
2,free sulfur dioxide,1259.141235,0.352619,0.105921
3,pH,793.018372,0.222083,0.06671
4,residual sugar,758.123352,0.21231,0.063775
5,density,723.731628,0.202679,0.060881
6,total sulfur dioxide,704.867432,0.197396,0.059295
7,fixed acidity,681.802673,0.190937,0.057354
8,sulphates,622.205444,0.174247,0.052341
9,chlorides,582.682312,0.163179,0.049016




In [41]:
# Score the manually tuned GBM on the hold-out test frame and show its regression metrics
gbm_manual_cv_es.model_performance(test_data=wine_test)


ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 0.42693718980231904
RMSE: 0.6534043080683805
MAE: 0.4924588228316932
RMSLE: 0.09727407043431957
Mean Residual Deviance: 0.42693718980231904




## Grid Search

In [42]:
# Full ("Cartesian") grid search: every combination of the values below is tried
search_criteria = dict(strategy="Cartesian")

# Candidate values per hyper-parameter; 3 x 3 = 9 combinations in total
hyper_params = dict(
    sample_rate=[0.7, 0.8, 0.9],
    col_sample_rate=[0.7, 0.8, 0.9],
)

In [45]:
# Grid search over the GBM sampling rates listed in `hyper_params`
from h2o.grid.grid_search import H2OGridSearch

# Base estimator for the grid: seeded for reproducibility, with 5-fold
# cross-validation and early stopping on MSE. The sampling rates are left
# unset here because the grid supplies them.
gbm_grid_base = H2OGradientBoostingEstimator(
    model_id='gbm_full_grid',
    seed=1234,
    ntrees=10000,
    nfolds=5,
    stopping_metric='mse',
    stopping_rounds=15,
    score_tree_interval=1,
)

gbm_full_grid = H2OGridSearch(
    gbm_grid_base,
    hyper_params=hyper_params,
    search_criteria=search_criteria,
)

# Start the grid search on the training frame
gbm_full_grid.train(training_frame=wine_train, x=features, y='quality')

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [46]:
# Rank the grid's models by MSE in ascending order, so the best model comes first
gbm_full_grid_sorted = gbm_full_grid.get_grid(decreasing=False, sort_by='mse')

# Take the id at the top of the ranking and fetch that model from the cluster
best_model_id = gbm_full_grid_sorted.model_ids[0]
best_gbm_from_full_grid = h2o.get_model(model_id=best_model_id)

# Show the summary of the winning model
best_gbm_from_full_grid.summary()


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,195.0,195.0,60300.0,5.0,5.0,5.0,7.0,31.0,19.902565




In [47]:
# Score the best grid-search GBM on the hold-out test frame and show its regression metrics
best_gbm_from_full_grid.model_performance(test_data=wine_test)


ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 0.42254885478062437
RMSE: 0.6500375795141573
MAE: 0.4908349090126119
RMSLE: 0.09684966383616216
Mean Residual Deviance: 0.42254885478062437


