In [1]:
import pandas as pd

In [2]:
import h2o

In [3]:
from h2o.automl import H2OAutoML

In [8]:
# Attach this Python session to an H2O cluster. In the recorded run the client
# connected to an instance that was already running at http://localhost:54321.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,46 mins 20 secs
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,1 month and 14 days
H2O_cluster_name:,H2O_from_python_mardoniofranca_noltt4
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.811 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [9]:
# Load the CSV through H2O so that `df` is an H2O frame (not a pandas DataFrame).
# NOTE(review): the path is relative — confirm which working directory it is resolved against.
df = h2o.import_file("data/dados_21.csv")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [10]:
# Name of the response column; it is converted to a factor (categorical)
# so that downstream models treat the task as classification.
response = 'Target'
df[response] = df[response].asfactor()

# Seeded split of the full frame into train and test using a 0.8 ratio.
train, test = df.split_frame(ratios=[0.8], seed=1234)

In [11]:
# Categorical columns that will be handed to the target encoder.
encoded_columns = [
    "Order_Priority",
    "Ship_Mode",
    "Customer_Name",
    "Region",
    "Customer_Segment",
    "Product_Category",
    "Product_Sub-Category",
    "Product_Container",
]

In [13]:
# The k_fold leakage-handling strategy needs a column that assigns every
# training row to a fold; generate a seeded 5-fold assignment and attach it.
fold_column = "kfold_column"
fold_assignment = train.kfold_column(n_folds=5, seed=1234)
train[fold_column] = fold_assignment

In [14]:
from h2o.estimators import H2OTargetEncoderEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [16]:
# Configure the target encoder (it is fitted in a separate step).
# Note: despite the `df_` prefix, `df_te` is an estimator, not a data frame.
df_te = H2OTargetEncoderEstimator(
    # Leakage handling: k-fold strategy driven by the fold column on the training frame.
    data_leakage_handling="k_fold",
    fold_column=fold_column,
    # Blending settings.
    blending=True,
    inflection_point=3,
    smoothing=10,
    # In general, the less data you have the more regularization you need
    noise=0.15,
    seed=1234,
)

In [17]:
# Fit the target encoder on the training split only:
# x = categorical columns to encode, y = response column.
df_te.train(
    x=encoded_columns,
    y=response,
    training_frame=train,
)


targetencoder Model Build progress: |████████████████████████████████████████████| (done) 100%


original_names,encoded_column_names
Order_Priority,Order_Priority_te
Ship_Mode,Ship_Mode_te
Customer_Name,Customer_Name_te
Region,Region_te
Customer_Segment,Customer_Segment_te
Product_Category,Product_Category_te
Product_Sub-Category,Product_Sub-Category_te
Product_Container,Product_Container_te


In [18]:
# Preview the first rows of the training frame (it now includes the kfold_column).
train.head()

Order_Priority,Order_Quantity,Sales,Ship_Mode,Profit,Customer_Name,Region,Customer_Segment,Product_Category,Product_Sub-Category,Product_Container,Target,kfold_column
High,21,2781.82,Express Air,-695.26,Monica Federle,Nunavut,Corporate,Office Supplies,Storage & Organization,Large Box,0,2
High,35,3389.93,Express Air,737.94,Beth Paige,Northwest Territories,Consumer,Furniture,Office Furnishings,Large Box,1,0
Not Specified,7,2039.56,Express Air,-329.49,Bryan Davis,Northwest Territories,Corporate,Office Supplies,Storage & Organization,Large Box,0,0
Medium,24,1168.15,Express Air,-743.96,Muhammed MacIntyre,Northwest Territories,Small Business,Office Supplies,Storage & Organization,Large Box,0,1
Not Specified,45,237.28,Express Air,-2088.68,Bryan Mills,Northwest Territories,Small Business,Office Supplies,Appliances,Large Box,0,3
Critical,17,1368.14,Express Air,171.26,Fred Wasserman,Northwest Territories,Home Office,Office Supplies,Appliances,Large Box,1,3
Low,21,4429.69,Express Air,983.55,Filia McAdams,Atlantic,Small Business,Technology,Copiers and Fax,Large Box,1,4
Not Specified,40,19109.6,Express Air,-379.29,Sanjit Chand,West,Home Office,Technology,Copiers and Fax,Large Box,0,0
High,48,446.53,Express Air,-261.45,Rob Dowd,West,Corporate,Furniture,Office Furnishings,Large Box,0,3
Not Specified,27,2780.88,Express Air,595.38,Tony Chapman,West,Consumer,Furniture,Office Furnishings,Large Box,1,1


In [20]:
# Apply the fitted encoder to both splits, producing frames with the extra
# `*_te` columns. The training split is transformed in training mode
# (as_training=True); the test split is transformed with noise switched off.
train_te = df_te.transform(
    frame=train,
    as_training=True,
)
test_te = df_te.transform(
    frame=test,
    noise=0,
)

In [21]:
# Preview the encoded training frame: the `*_te` columns appear alongside the originals.
train_te.head()

Order_Priority_te,Ship_Mode_te,Customer_Name_te,Region_te,Customer_Segment_te,Product_Category_te,Product_Sub-Category_te,Product_Container_te,Order_Priority,Ship_Mode,Customer_Name,Region,Customer_Segment,Product_Category,Product_Sub-Category,Product_Container,Order_Quantity,Sales,Profit,kfold_column,Target
0.559569,0.58041,0.659098,0.610872,0.583449,0.536438,0.295683,0.400156,High,Express Air,Monica Federle,Nunavut,Corporate,Office Supplies,Storage & Organization,Large Box,21,2781.82,-695.26,2,0
0.446459,0.471306,0.520634,0.421505,0.465287,0.481181,0.460577,0.280789,High,Express Air,Beth Paige,Northwest Territories,Consumer,Furniture,Office Furnishings,Large Box,35,3389.93,737.94,0,1
0.576438,0.5794,0.771075,0.529599,0.548548,0.513629,0.252692,0.388883,Not Specified,Express Air,Bryan Davis,Northwest Territories,Corporate,Office Supplies,Storage & Organization,Large Box,7,2039.56,-329.49,0,0
0.676452,0.677586,0.878011,0.676949,0.67473,0.62169,0.424415,0.468367,Medium,Express Air,Muhammed MacIntyre,Northwest Territories,Small Business,Office Supplies,Storage & Organization,Large Box,24,1168.15,-743.96,1,0
0.432187,0.41584,0.511986,0.407301,0.423128,0.343789,0.420648,0.251629,Not Specified,Express Air,Bryan Mills,Northwest Territories,Small Business,Office Supplies,Appliances,Large Box,45,237.28,-2088.68,3,0
0.414867,0.509595,0.511986,0.501056,0.475322,0.437544,0.514403,0.345384,Critical,Express Air,Fred Wasserman,Northwest Territories,Home Office,Office Supplies,Appliances,Large Box,17,1368.14,171.26,3,1
0.381733,0.406984,0.511986,0.413263,0.313322,0.471017,0.172726,0.203761,Low,Express Air,Filia McAdams,Atlantic,Small Business,Technology,Copiers and Fax,Large Box,21,4429.69,983.55,4,1
0.499392,0.502354,0.511986,0.484094,0.490747,0.564049,0.467456,0.311837,Not Specified,Express Air,Sanjit Chand,West,Home Office,Technology,Copiers and Fax,Large Box,40,19109.6,-379.29,0,0
0.452552,0.458781,0.427635,0.463037,0.453098,0.464202,0.452368,0.29457,High,Express Air,Rob Dowd,West,Corporate,Furniture,Office Furnishings,Large Box,48,446.53,-261.45,3,0
0.666035,0.658068,0.858494,0.630268,0.616822,0.600677,0.596632,0.448849,Not Specified,Express Air,Tony Chapman,West,Consumer,Furniture,Office Furnishings,Large Box,27,2780.88,595.38,1,1


In [26]:
# Sanity check: show the name of the response column.
# `response` is used here because `y` is only assigned in a later cell, so a
# bare `y` at this point raises NameError on a fresh Restart & Run All.
response

'Target'

In [31]:
# Baseline column list: the target-encoded categorical columns, the raw
# numeric columns, and (as the last entry) the 'Target' column.
te_features = [
    'Order_Priority_te',
    'Ship_Mode_te',
    'Customer_Name_te',
    'Region_te',
    'Customer_Segment_te',
    'Product_Category_te',
    'Product_Sub-Category_te',
    'Product_Container_te',
]
numeric_features = ['Order_Quantity', 'Sales']
x_baseline = te_features + numeric_features + ['Target']


In [32]:
# Response column name and predictor list used for modelling.
y = "Target"
# Build a NEW list without the response instead of aliasing x_baseline and
# calling .remove(): the original list stays intact, and the cell can be
# re-run any number of times without raising ValueError.
x = [col for col in x_baseline if col != y]

In [33]:
# For binary classification, response should be a factor
train_te[y] = train_te[y].asfactor()
test_te[y] = test_te[y].asfactor()

# Run AutoML for 20 base models
aml = H2OAutoML(max_models=40, seed=1)
aml.train(x=x, y=y, training_frame=test_te)

# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

AutoML progress: |
16:37:35.568: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 146.0.

███████████████████████████████████████████████████████████████| (done) 100%


model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
DRF_1_AutoML_3_20240427_163734,0.899432,0.422634,0.836767,0.14678,0.361794,0.130895
XGBoost_lr_search_selection_AutoML_3_20240427_163734_select_grid_model_8,0.899242,0.43839,0.868262,0.173106,0.370557,0.137313
StackedEnsemble_BestOfFamily_1_AutoML_3_20240427_163734,0.893182,0.405132,0.839877,0.136932,0.353953,0.125283
GBM_grid_1_AutoML_3_20240427_163734_model_2,0.89072,0.402463,0.854983,0.143182,0.351284,0.123401
GBM_5_AutoML_3_20240427_163734,0.887689,0.422001,0.847194,0.186932,0.370486,0.13726
StackedEnsemble_AllModels_1_AutoML_3_20240427_163734,0.885417,0.424358,0.829175,0.159659,0.365618,0.133676
XRT_1_AutoML_3_20240427_163734,0.882102,0.44892,0.835305,0.168561,0.380815,0.14502
GBM_grid_1_AutoML_3_20240427_163734_model_3,0.857008,0.470058,0.841349,0.207955,0.392534,0.154083
XGBoost_grid_1_AutoML_3_20240427_163734_model_10,0.855871,0.512545,0.752777,0.198106,0.398012,0.158413
GBM_grid_1_AutoML_3_20240427_163734_model_7,0.854167,0.473796,0.808097,0.172159,0.390276,0.152315


In [34]:
# Display the top-ranked model of the AutoML run; the rendered output includes
# its model summary, confusion matrices, metric tables and variable importances.
aml.leader

Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,32.0,32.0,13025.0,6.0,15.0,9.5,19.0,40.0,27.65625

Unnamed: 0,0,1,Error,Rate
0,53.0,27.0,0.3375,(27.0/80.0)
1,5.0,61.0,0.0758,(5.0/66.0)
Total,58.0,88.0,0.2192,(32.0/146.0)

metric,threshold,value,idx
max f1,0.3571429,0.7922078,47.0
max f2,0.3333333,0.8683473,49.0
max f0point5,0.625,0.8273381,27.0
max accuracy,0.625,0.8150685,27.0
max precision,0.8,0.9166667,12.0
max recall,0.0769231,1.0,67.0
max specificity,1.0,0.9875,0.0
max absolute_mcc,0.625,0.6307971,27.0
max min_per_class_accuracy,0.5454545,0.7878788,35.0
max mean_per_class_accuracy,0.6,0.8073864,29.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0410959,1.0,1.8434343,1.8434343,0.8333333,1.0,0.8333333,1.0,0.0757576,0.0757576,84.3434343,84.3434343,0.0632576
2,0.0410959,0.95,0.0,1.8434343,0.0,0.0,0.8333333,1.0,0.0,0.0757576,-100.0,84.3434343,0.0632576
3,0.0547945,0.9214744,2.2121212,1.9356061,1.0,0.9302885,0.875,0.9825721,0.030303,0.1060606,121.2121212,93.5606061,0.0935606
4,0.109589,0.8333333,1.9356061,1.9356061,0.875,0.8610424,0.875,0.9218073,0.1060606,0.2121212,93.5606061,93.5606061,0.1871212
5,0.1643836,0.8,2.2121212,2.0277778,1.0,0.8072727,0.9166667,0.8836291,0.1212121,0.3333333,121.2121212,102.7777778,0.3083333
6,0.2054795,0.7333333,1.8434343,1.9909091,0.8333333,0.7655372,0.9,0.8600107,0.0757576,0.4090909,84.3434343,99.0909091,0.3715909
7,0.3013699,0.6547619,1.5800866,1.8601928,0.7142857,0.6965625,0.8409091,0.8080045,0.1515152,0.5606061,58.008658,86.0192837,0.4731061
8,0.4041096,0.5833333,1.6222222,1.7996918,0.7333333,0.6217915,0.8135593,0.7606622,0.1666667,0.7272727,62.2222222,79.9691834,0.5897727
9,0.5205479,0.5,0.9108734,1.6008772,0.4117647,0.5265183,0.7236842,0.7082879,0.1060606,0.8333333,-8.912656,60.0877193,0.5708333
10,0.6027397,0.3571429,1.1060606,1.5334022,0.5,0.411551,0.6931818,0.6678238,0.0909091,0.9242424,10.6060606,53.3402204,0.5867424

Unnamed: 0,0,1,Error,Rate
0,65.0,15.0,0.1875,(15.0/80.0)
1,7.0,59.0,0.1061,(7.0/66.0)
Total,72.0,74.0,0.1507,(22.0/146.0)

metric,threshold,value,idx
max f1,0.4714286,0.8428571,41.0
max f2,0.25,0.8767123,62.0
max f0point5,0.5857143,0.8557047,30.0
max accuracy,0.5857143,0.8493151,30.0
max precision,1.0,1.0,0.0
max recall,0.1,1.0,73.0
max specificity,1.0,1.0,0.0
max absolute_mcc,0.4714286,0.70325,41.0
max min_per_class_accuracy,0.53,0.8375,37.0
max mean_per_class_accuracy,0.4714286,0.8532197,41.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0136986,0.9555,1.1060606,1.1060606,0.5,0.98,0.5,0.98,0.0151515,0.0151515,10.6060606,10.6060606,0.0026515
2,0.0205479,0.932,2.2121212,1.4747475,1.0,0.95,0.6666667,0.97,0.0151515,0.030303,121.2121212,47.4747475,0.017803
3,0.0410959,0.9,2.2121212,1.8434343,1.0,0.91,0.8333333,0.94,0.0454545,0.0757576,121.2121212,84.3434343,0.0632576
4,0.0410959,0.8885714,0.0,1.8434343,0.0,0.0,0.8333333,0.94,0.0,0.0757576,-100.0,84.3434343,0.0632576
5,0.0547945,0.8785714,2.2121212,1.9356061,1.0,0.8857143,0.875,0.9264286,0.030303,0.1060606,121.2121212,93.5606061,0.0935606
6,0.1027397,0.8242857,1.8961039,1.9171717,0.8571429,0.8418367,0.8666667,0.8869524,0.0909091,0.1969697,89.6103896,91.7171717,0.1719697
7,0.1506849,0.775,1.8961039,1.9104683,0.8571429,0.8028571,0.8636364,0.8601948,0.0909091,0.2878788,89.6103896,91.046832,0.2503788
8,0.2123288,0.7428571,1.96633,1.9266862,0.8888889,0.7514286,0.8709677,0.8286175,0.1212121,0.4090909,96.6329966,92.6686217,0.3590909
9,0.3013699,0.6943333,1.8717949,1.9104683,0.8461538,0.7115018,0.8636364,0.7940152,0.1666667,0.5757576,87.1794872,91.046832,0.5007576
10,0.4109589,0.5714286,1.9356061,1.9171717,0.875,0.6316786,0.8666667,0.7507254,0.2121212,0.7878788,93.5606061,91.7171717,0.6878788

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.8703448,0.0596063,0.8,0.8275862,0.9310345,0.9310345,0.8620689
aic,,0.0,,,,,
auc,0.9063521,0.0711846,0.8348416,0.8259804,0.9828432,0.9428572,0.9452381
err,0.1296552,0.0596063,0.2,0.1724138,0.0689655,0.0689655,0.1379310
err_count,3.8,1.7888544,6.0,5.0,2.0,2.0,4.0
f0point5,0.8422588,0.0679096,0.7534246,0.8035714,0.8823530,0.9285714,0.8433735
f1,0.8589942,0.0714291,0.7857143,0.7826087,0.9230769,0.9285714,0.875
f2,0.8778023,0.0838597,0.8208955,0.7627119,0.9677419,0.9285714,0.9090909
lift_top_group,1.767619,1.0107874,0.0,2.4166667,2.4166667,2.0714285,1.9333333
loglikelihood,,0.0,,,,,

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
,2024-04-27 16:37:36,0.326 sec,0.0,,,,,,
,2024-04-27 16:37:36,0.333 sec,5.0,0.4572737,4.3128304,0.7741935,0.752956,1.9356061,0.2941176
,2024-04-27 16:37:36,0.342 sec,10.0,0.4331303,2.2822382,0.7982355,0.7587188,1.9104683,0.2827586
,2024-04-27 16:37:36,0.349 sec,15.0,0.4011261,1.3393806,0.8438447,0.8007712,1.9518717,0.2671233
,2024-04-27 16:37:36,0.360 sec,20.0,0.4009415,0.9094751,0.8446023,0.7897479,1.9356061,0.2671233
,2024-04-27 16:37:36,0.368 sec,25.0,0.3898002,0.894286,0.8611742,0.8109628,1.8961039,0.1917808
,2024-04-27 16:37:36,0.377 sec,30.0,0.3853004,0.6713546,0.8679924,0.808637,1.8434343,0.1917808
,2024-04-27 16:37:36,0.381 sec,32.0,0.3834444,0.6693541,0.8744318,0.8182291,1.8434343,0.2191781

variable,relative_importance,scaled_importance,percentage
Sales,282.9252014,1.0,0.3572225
Product_Sub-Category_te,138.5476532,0.4896971,0.1749308
Order_Quantity,109.7793045,0.3880153,0.1386078
Customer_Name_te,67.2775955,0.2377929,0.084945
Order_Priority_te,48.7136726,0.1721786,0.0615061
Customer_Segment_te,39.2088013,0.1385836,0.0495052
Region_te,34.3677521,0.1214729,0.0433929
Product_Container_te,32.7490387,0.1157516,0.0413491
Product_Category_te,26.8755302,0.0949916,0.0339332
Ship_Mode_te,11.5694475,0.0408922,0.0146076


In [35]:
# Rebuild the leaderboard with all optional columns; compared with the default
# view, the output gains training_time_ms, predict_time_per_row_ms and algo.
# Note: this rebinds `lb`, replacing the leaderboard assigned earlier.
lb = h2o.automl.get_leaderboard(aml, extra_columns = "ALL")
lb

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse,training_time_ms,predict_time_per_row_ms,algo
DRF_1_AutoML_3_20240427_163734,0.899432,0.422634,0.836767,0.14678,0.361794,0.130895,59,0.032886,DRF
XGBoost_lr_search_selection_AutoML_3_20240427_163734_select_grid_model_8,0.899242,0.43839,0.868262,0.173106,0.370557,0.137313,3410,0.045842,XGBoost
StackedEnsemble_BestOfFamily_1_AutoML_3_20240427_163734,0.893182,0.405132,0.839877,0.136932,0.353953,0.125283,3680,0.060762,StackedEnsemble
GBM_grid_1_AutoML_3_20240427_163734_model_2,0.89072,0.402463,0.854983,0.143182,0.351284,0.123401,86,0.033506,GBM
GBM_5_AutoML_3_20240427_163734,0.887689,0.422001,0.847194,0.186932,0.370486,0.13726,86,0.0576,GBM
StackedEnsemble_AllModels_1_AutoML_3_20240427_163734,0.885417,0.424358,0.829175,0.159659,0.365618,0.133676,3062,0.110749,StackedEnsemble
XRT_1_AutoML_3_20240427_163734,0.882102,0.44892,0.835305,0.168561,0.380815,0.14502,90,0.022894,DRF
GBM_grid_1_AutoML_3_20240427_163734_model_3,0.857008,0.470058,0.841349,0.207955,0.392534,0.154083,77,0.025842,GBM
XGBoost_grid_1_AutoML_3_20240427_163734_model_10,0.855871,0.512545,0.752777,0.198106,0.398012,0.158413,200,0.045408,XGBoost
GBM_grid_1_AutoML_3_20240427_163734_model_7,0.854167,0.473796,0.808097,0.172159,0.390276,0.152315,70,0.025606,GBM


https://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/intro.html

https://medium.com/h2o-ai-brasil/h2o-os-primeiros-passos-fae39077e028

https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html

https://docs.h2o.ai/h2o/latest-stable/h2o-docs/training-models.html