In [5]:
pip install h2o



### H2O Framework

In [6]:
import pandas as pd
import h2o

### Adjusting Memory  
Number of threads are related to your cpu core count and maximum memory size is related to available memory. You can monitor the current values of these requirements as shown below. Then, you should limit memory and threads in initialization step.

In [7]:
import multiprocessing
print("CPU: ",multiprocessing.cpu_count())

import psutil
print("Memory: ",psutil.virtual_memory())

CPU:  2
Memory:  svmem(total=13653573632, available=12772048896, percent=6.5, used=617541632, free=8849207296, active=1150218240, inactive=3305476096, buffers=79589376, cached=4107235328, shared=1007616, slab=242917376)


In [8]:
#h2o.init()
h2o.init(ip="127.0.0.1", max_mem_size_GB = 100, nthreads = 5)

Checking whether there is an H2O instance running at http://127.0.0.1:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.10" 2021-01-19; OpenJDK Runtime Environment (build 11.0.10+9-Ubuntu-0ubuntu1.18.04); OpenJDK 64-Bit Server VM (build 11.0.10+9-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
  Starting server from /usr/local/lib/python3.7/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpc6hcpthj
  JVM stdout: /tmp/tmpc6hcpthj/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmpc6hcpthj/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.1
H2O_cluster_version_age:,23 days
H2O_cluster_name:,H2O_from_python_unknownUser_1crwgz
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,100 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


### Data Manipulation
h2o frame is a multi-core supporting data manipulation tool h2o frame is equivalent to Pandas.

In [9]:
# Data can be found here https://www.kaggle.com/serengil/recognizing-faces-in-the-wild

hf_positive = h2o.import_file('train_true_positive_features.csv')
hf_negative = h2o.import_file('train_true_negative_features.csv')
hf = hf_positive.rbind(hf_negative)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [10]:
print("positive instances: ",hf_positive.shape)
print("negative instances: ",hf_negative.shape)
print("total instances: ",hf.shape)

positive instances:  (137863, 34)
negative instances:  (140744, 34)
total instances:  (278607, 34)


In [11]:
hf = hf[['vgg_cosine', 'vgg_euclidean_l2'
         , 'facenet_cosine', 'facenet_euclidean_l2'
         , 'openface_cosine', 'openface_euclidean_l2'
         , 'is_related']]

In [12]:
hf.head()

vgg_cosine,vgg_euclidean_l2,facenet_cosine,facenet_euclidean_l2,openface_cosine,openface_euclidean_l2,is_related
0.618396,1.11211,1.25131,1.58197,1.12544,1.50029,1
0.601191,1.09653,1.14205,1.51133,1.08315,1.47183,1
0.543063,1.04217,1.10449,1.48627,1.14981,1.51645,1
0.618544,1.11224,1.24833,1.58008,1.09367,1.47897,1
0.60665,1.1015,1.15115,1.51733,1.11618,1.49411,1
0.630702,1.12312,1.22153,1.56303,1.20384,1.55167,1
0.742856,1.2189,1.09549,1.48019,1.07032,1.4631,1
0.558462,1.05685,1.11984,1.49655,1.24634,1.57882,1
0.648851,1.13917,1.10773,1.48844,0.612283,1.1066,1
0.677681,1.1642,1.03272,1.43716,0.571711,1.06931,1




In [13]:
#convert target label to factor because this is a binary classification
#otherwise, there would be a regression problem
hf['is_related'] = hf['is_related'].asfactor()

### Train/Test Split

In [14]:
#70% train, 15% test, 15% validation
train, test, validation = hf.split_frame(ratios=[0.70, 0.15], seed=17)

In [15]:
print("train set size: ",train.shape)
print("test set size: ", test.shape)
print("validation set size: ", validation.shape)

train set size:  (195248, 7)
test set size:  (41679, 7)
validation set size:  (41680, 7)


### Modelling

In [16]:
from h2o.automl import H2OAutoML

In [None]:
y_label = hf.columns[-1]
x_labels = hf.columns[0:-1]

In [18]:
x_labels

['vgg_cosine',
 'vgg_euclidean_l2',
 'facenet_cosine',
 'facenet_euclidean_l2',
 'openface_cosine',
 'openface_euclidean_l2']

In [19]:
y_label

'is_related'

In [20]:
#this is a binary classification problem. convert is_related column to enum type instead of numerical
#otherwise, it would be a regression problem
hf[y_label] = hf[y_label].asfactor()

In [21]:
hf.describe()

Rows:278607
Cols:7




Unnamed: 0,vgg_cosine,vgg_euclidean_l2,facenet_cosine,facenet_euclidean_l2,openface_cosine,openface_euclidean_l2,is_related
type,real,real,real,real,real,real,enum
mins,0.004028141498565674,0.08975721,0.009035706520080566,0.1344302,0.002826511859893799,0.075187616,
mean,0.5569502258138221,1.0472938806111483,0.844084310645829,1.2890979528333437,0.6800254552511539,1.1427115158520231,
maxs,1.0553148686885834,1.4528006,1.5036494731903076,1.7341565,1.5858558416366575,1.78093,
sigma,0.1328708290010315,0.13067531818289974,0.19677337294265532,0.1624659448951223,0.26123373546581896,0.23294118693037083,
zeros,0,0,0,0,0,0,
missing,0,0,0,0,0,0,2
0,0.6183964312076569,1.1121118,1.2513097524642944,1.581967,1.1254418790340424,1.5002946,1
1,0.6011905074119568,1.0965314,1.1420531868934631,1.5113261,1.083145149052143,1.4718323,1
2,0.5430631935596466,1.0421739,1.104493826627731,1.4862664,1.1498085260391235,1.5164489,1


In [22]:
aml = H2OAutoML()

In [23]:
aml.train(x = x_labels, y = y_label, training_frame = train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


## Leaderboard

In [24]:
aml.leader

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_AllModels_AutoML_20210417_205035

No model summary for this model

ModelMetricsBinomialGLM: stackedensemble
** Reported on train data. **

MSE: 0.2032472779710565
RMSE: 0.45082954425265487
LogLoss: 0.5909382445839749
Null degrees of freedom: 9960
Residual degrees of freedom: 9944
Null deviance: 13809.843329740066
Residual deviance: 11772.671708601949
AIC: 11806.671708601949
AUC: 0.7486256402243255
AUCPR: 0.7438610366116839
Gini: 0.4972512804486511

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.38455021004688916: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,2500.0,2484.0,0.4984,(2484.0/4984.0)
1,1,828.0,4149.0,0.1664,(828.0/4977.0)
2,Total,3328.0,6633.0,0.3325,(3312.0/9961.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.38455,0.714729,256.0
1,max f2,0.186958,0.840906,355.0
2,max f0point5,0.532965,0.682174,181.0
3,max accuracy,0.473026,0.681056,212.0
4,max precision,0.970154,1.0,0.0
5,max recall,0.084228,1.0,391.0
6,max specificity,0.970154,1.0,0.0
7,max absolute_mcc,0.473026,0.363067,212.0
8,max min_per_class_accuracy,0.493691,0.678571,201.0
9,max mean_per_class_accuracy,0.473026,0.681081,212.0



Gains/Lift Table: Avg response rate: 49.96 %, avg score: 49.56 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010039,0.911765,1.961378,1.961378,0.98,0.936064,0.98,0.936064,0.019691,0.019691,96.137834,96.137834,0.019289
1,2,0.020078,0.885075,1.72121,1.841294,0.86,0.896453,0.92,0.916258,0.017279,0.03697,72.120956,84.129395,0.03376
2,3,0.030017,0.858698,1.799244,1.827371,0.89899,0.872261,0.913043,0.901691,0.017882,0.054852,79.92442,82.737112,0.049636
3,4,0.040056,0.846647,1.881322,1.840893,0.94,0.852014,0.919799,0.88924,0.018887,0.073739,88.132208,84.089267,0.067319
4,5,0.050095,0.835653,1.761238,1.82493,0.88,0.840769,0.911824,0.879527,0.017681,0.091421,76.123769,82.492975,0.082592
5,6,0.10009,0.782919,1.675877,1.750478,0.837349,0.809557,0.874624,0.844577,0.083785,0.175206,67.58765,75.047788,0.150126
6,7,0.150085,0.733713,1.531197,1.677433,0.76506,0.756634,0.838127,0.815282,0.076552,0.251758,53.119652,67.743298,0.203203
7,8,0.20008,0.688434,1.394554,1.606749,0.696787,0.709064,0.80281,0.788741,0.069721,0.321479,39.455431,60.67488,0.242626
8,9,0.30007,0.616648,1.344318,1.519301,0.671687,0.64966,0.759117,0.742396,0.134418,0.455897,34.43182,51.93012,0.311435
9,10,0.40006,0.555882,1.151412,1.427352,0.575301,0.584618,0.713174,0.702962,0.11513,0.571027,15.141155,42.735187,0.341693




ModelMetricsBinomialGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.20876489117956673
RMSE: 0.4569079679536862
LogLoss: 0.6044648422248524
Null degrees of freedom: 195247
Residual degrees of freedom: 195230
Null deviance: 270653.4584340794
Residual deviance: 236041.10302943597
AIC: 236077.10302943597
AUC: 0.7333312691767349
AUCPR: 0.7235596566355378
Gini: 0.4666625383534697

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.35538657539170965: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,42639.0,56017.0,0.5678,(56017.0/98656.0)
1,1,14263.0,82329.0,0.1477,(14263.0/96592.0)
2,Total,56902.0,138346.0,0.36,(70280.0/195248.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.355387,0.700857,273.0
1,max f2,0.172514,0.833281,361.0
2,max f0point5,0.536271,0.670127,181.0
3,max accuracy,0.500664,0.671449,200.0
4,max precision,0.976696,1.0,0.0
5,max recall,0.040149,1.0,398.0
6,max specificity,0.976696,1.0,0.0
7,max absolute_mcc,0.500664,0.342768,200.0
8,max min_per_class_accuracy,0.494017,0.669721,203.0
9,max mean_per_class_accuracy,0.500664,0.671353,200.0



Gains/Lift Table: Avg response rate: 49.47 %, avg score: 49.47 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010003,0.908191,1.880607,1.880607,0.930364,0.932766,0.930364,0.932766,0.018811,0.018811,88.060731,88.060731,0.017433
1,2,0.02,0.883193,1.814261,1.847443,0.897541,0.89532,0.913956,0.914048,0.018138,0.036949,81.426083,84.744256,0.033543
2,3,0.030003,0.857101,1.772967,1.822613,0.877112,0.869355,0.901673,0.899148,0.017734,0.054684,77.29666,82.261301,0.048845
3,4,0.04,0.843981,1.704494,1.793091,0.843238,0.85024,0.887068,0.886924,0.017041,0.071724,70.449391,79.309079,0.062784
4,5,0.050003,0.833175,1.692236,1.772916,0.837174,0.838502,0.877087,0.877238,0.016927,0.088651,69.223608,77.291572,0.076488
5,6,0.100001,0.776768,1.636642,1.704782,0.80967,0.805529,0.84338,0.841385,0.081829,0.17048,63.664152,70.478211,0.139483
6,7,0.150004,0.729223,1.525084,1.644881,0.754481,0.752677,0.813746,0.811815,0.076259,0.246739,52.508434,64.48808,0.191446
7,8,0.200002,0.686294,1.423158,1.589453,0.704057,0.70682,0.786325,0.785567,0.071155,0.317894,42.315753,58.945283,0.233317
8,9,0.300003,0.616619,1.308791,1.495899,0.647478,0.649515,0.740043,0.740217,0.13088,0.448774,30.879064,49.589876,0.29443
9,10,0.399999,0.553235,1.175301,1.415752,0.581438,0.583826,0.700393,0.70112,0.117525,0.566299,17.530077,41.575234,0.329122







In [25]:
lb = aml.leaderboard

In [26]:
#lb.head()
lb.head(rows=lb.nrows)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
StackedEnsemble_AllModels_AutoML_20210417_205035,0.733331,0.604465,0.72356,0.357732,0.456908,0.208765
StackedEnsemble_BestOfFamily_AutoML_20210417_205035,0.733316,0.604517,0.723493,0.3636,0.456923,0.208779
GBM_grid__1_AutoML_20210417_205035_model_2,0.733041,0.604877,0.723226,0.362723,0.457054,0.208899
XGBoost_grid__1_AutoML_20210417_205035_model_8,0.732924,0.60488,0.722913,0.357504,0.457086,0.208928
GBM_grid__1_AutoML_20210417_205035_model_4,0.732844,0.605096,0.722932,0.357079,0.457145,0.208982
GBM_1_AutoML_20210417_205035,0.732816,0.604879,0.722938,0.365705,0.457098,0.208938
GBM_grid__1_AutoML_20210417_205035_model_8,0.732752,0.605073,0.723066,0.364917,0.457169,0.209004
GBM_2_AutoML_20210417_205035,0.732541,0.605067,0.722799,0.364123,0.457194,0.209026
XGBoost_grid__1_AutoML_20210417_205035_model_2,0.732535,0.605185,0.722306,0.355423,0.457223,0.209052
GBM_grid__1_AutoML_20210417_205035_model_5,0.73243,0.605148,0.722764,0.360686,0.457237,0.209066




## Saving the Model

In [27]:
saved_model = h2o.save_model(aml.leader, path = "", force=True)

In [28]:
#you can directly load the best model by running the load_model command
restored_aml = h2o.load_model(saved_model)

## Check on Validation

In [30]:
perf = aml.leader.model_performance(validation)
#perf = restored_aml.model_performance(validation)

In [31]:
perf.auc()

0.7349595941637329

In [32]:
perf.accuracy() [0][1]

0.6751679462571977

In [33]:
perf.rmse()

0.45644475763051967