In [1]:
import pandas as pd

In [2]:
import h2o

In [3]:
from h2o.automl import H2OAutoML

In [8]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,46 mins 20 secs
H2O_cluster_timezone:,America/Sao_Paulo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.1
H2O_cluster_version_age:,1 month and 14 days
H2O_cluster_name:,H2O_from_python_mardoniofranca_noltt4
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.811 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [9]:
df = h2o.import_file("data/dados_21.csv")

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [10]:
# Set response column as a factor
df['Target'] = df['Target'].asfactor()
response='Target'

# Split the dataset into train and test
train, test = df.split_frame(ratios = [.8], seed = 1234)

In [11]:
# Choose which columns to encode
encoded_columns = ["Order_Priority", "Ship_Mode", 
                   "Customer_Name", "Region", "Customer_Segment",
                   "Product_Category","Product_Sub-Category",
                   "Product_Container"]

In [13]:
# For k_fold strategy we need to provide fold column
fold_column = "kfold_column"
train[fold_column] = train.kfold_column(n_folds=5, seed=1234)

In [14]:
from h2o.estimators import H2OTargetEncoderEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [16]:
# Train a TE model
df_te = H2OTargetEncoderEstimator(fold_column=fold_column,
                                       data_leakage_handling="k_fold",
                                       blending=True,
                                       inflection_point=3,
                                       smoothing=10,
                                       noise=0.15,     # In general, the less data you have the more regularization you need
                                       seed=1234)

In [17]:
df_te.train(x=encoded_columns,
                 y=response,
                 training_frame=train)


targetencoder Model Build progress: |████████████████████████████████████████████| (done) 100%


original_names,encoded_column_names
Order_Priority,Order_Priority_te
Ship_Mode,Ship_Mode_te
Customer_Name,Customer_Name_te
Region,Region_te
Customer_Segment,Customer_Segment_te
Product_Category,Product_Category_te
Product_Sub-Category,Product_Sub-Category_te
Product_Container,Product_Container_te


In [18]:
train.head()

Order_Priority,Order_Quantity,Sales,Ship_Mode,Profit,Customer_Name,Region,Customer_Segment,Product_Category,Product_Sub-Category,Product_Container,Target,kfold_column
High,21,2781.82,Express Air,-695.26,Monica Federle,Nunavut,Corporate,Office Supplies,Storage & Organization,Large Box,0,2
High,35,3389.93,Express Air,737.94,Beth Paige,Northwest Territories,Consumer,Furniture,Office Furnishings,Large Box,1,0
Not Specified,7,2039.56,Express Air,-329.49,Bryan Davis,Northwest Territories,Corporate,Office Supplies,Storage & Organization,Large Box,0,0
Medium,24,1168.15,Express Air,-743.96,Muhammed MacIntyre,Northwest Territories,Small Business,Office Supplies,Storage & Organization,Large Box,0,1
Not Specified,45,237.28,Express Air,-2088.68,Bryan Mills,Northwest Territories,Small Business,Office Supplies,Appliances,Large Box,0,3
Critical,17,1368.14,Express Air,171.26,Fred Wasserman,Northwest Territories,Home Office,Office Supplies,Appliances,Large Box,1,3
Low,21,4429.69,Express Air,983.55,Filia McAdams,Atlantic,Small Business,Technology,Copiers and Fax,Large Box,1,4
Not Specified,40,19109.6,Express Air,-379.29,Sanjit Chand,West,Home Office,Technology,Copiers and Fax,Large Box,0,0
High,48,446.53,Express Air,-261.45,Rob Dowd,West,Corporate,Furniture,Office Furnishings,Large Box,0,3
Not Specified,27,2780.88,Express Air,595.38,Tony Chapman,West,Consumer,Furniture,Office Furnishings,Large Box,1,1


In [20]:
## New target encoded train and test sets
train_te = df_te.transform(frame=train, as_training=True)
test_te = df_te.transform(frame=test, noise=0)

In [21]:
train_te.head()

Order_Priority_te,Ship_Mode_te,Customer_Name_te,Region_te,Customer_Segment_te,Product_Category_te,Product_Sub-Category_te,Product_Container_te,Order_Priority,Ship_Mode,Customer_Name,Region,Customer_Segment,Product_Category,Product_Sub-Category,Product_Container,Order_Quantity,Sales,Profit,kfold_column,Target
0.559569,0.58041,0.659098,0.610872,0.583449,0.536438,0.295683,0.400156,High,Express Air,Monica Federle,Nunavut,Corporate,Office Supplies,Storage & Organization,Large Box,21,2781.82,-695.26,2,0
0.446459,0.471306,0.520634,0.421505,0.465287,0.481181,0.460577,0.280789,High,Express Air,Beth Paige,Northwest Territories,Consumer,Furniture,Office Furnishings,Large Box,35,3389.93,737.94,0,1
0.576438,0.5794,0.771075,0.529599,0.548548,0.513629,0.252692,0.388883,Not Specified,Express Air,Bryan Davis,Northwest Territories,Corporate,Office Supplies,Storage & Organization,Large Box,7,2039.56,-329.49,0,0
0.676452,0.677586,0.878011,0.676949,0.67473,0.62169,0.424415,0.468367,Medium,Express Air,Muhammed MacIntyre,Northwest Territories,Small Business,Office Supplies,Storage & Organization,Large Box,24,1168.15,-743.96,1,0
0.432187,0.41584,0.511986,0.407301,0.423128,0.343789,0.420648,0.251629,Not Specified,Express Air,Bryan Mills,Northwest Territories,Small Business,Office Supplies,Appliances,Large Box,45,237.28,-2088.68,3,0
0.414867,0.509595,0.511986,0.501056,0.475322,0.437544,0.514403,0.345384,Critical,Express Air,Fred Wasserman,Northwest Territories,Home Office,Office Supplies,Appliances,Large Box,17,1368.14,171.26,3,1
0.381733,0.406984,0.511986,0.413263,0.313322,0.471017,0.172726,0.203761,Low,Express Air,Filia McAdams,Atlantic,Small Business,Technology,Copiers and Fax,Large Box,21,4429.69,983.55,4,1
0.499392,0.502354,0.511986,0.484094,0.490747,0.564049,0.467456,0.311837,Not Specified,Express Air,Sanjit Chand,West,Home Office,Technology,Copiers and Fax,Large Box,40,19109.6,-379.29,0,0
0.452552,0.458781,0.427635,0.463037,0.453098,0.464202,0.452368,0.29457,High,Express Air,Rob Dowd,West,Corporate,Furniture,Office Furnishings,Large Box,48,446.53,-261.45,3,0
0.666035,0.658068,0.858494,0.630268,0.616822,0.600677,0.596632,0.448849,Not Specified,Express Air,Tony Chapman,West,Consumer,Furniture,Office Furnishings,Large Box,27,2780.88,595.38,1,1


In [26]:
y

'Target'

In [31]:
x_baseline = ['Order_Priority_te','Ship_Mode_te', 'Customer_Name_te',
              'Region_te', 'Customer_Segment_te', 'Product_Category_te',
               'Product_Sub-Category_te', 'Product_Container_te',
               'Order_Quantity','Sales','Target']


In [32]:
x = x_baseline
y = "Target"
x.remove(y)

In [None]:
# For binary classification, response should be a factor
train_te[y] = train_te[y].asfactor()
test_te[y] = test_te[y].asfactor()

# Run AutoML for 20 base models
aml = H2OAutoML(max_models=40, seed=1)
aml.train(x=x, y=y, training_frame=test_te)

# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

AutoML progress: |
16:37:35.568: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 146.0.

████████████████████████████████████████████████████

In [None]:
x = train.columns
y = "Target"
x.remove(y)

In [None]:
# For binary classification, response should be a factor
train[y] = train[y].asfactor()
test[y] = test[y].asfactor()

# Run AutoML for 20 base models
aml = H2OAutoML(max_models=40, seed=1)
aml.train(x=x, y=y, training_frame=train)

# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default (10 rows)

https://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/intro.html

https://medium.com/h2o-ai-brasil/h2o-os-primeiros-passos-fae39077e028

https://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html

https://docs.h2o.ai/h2o/latest-stable/h2o-docs/training-models.html