In [1]:
import os
import h2o
from h2o.automl import H2OAutoML

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score

import pandas as pd
import numpy as np

In [2]:
# Load pd_train
pd_train = pd.read_parquet("data/training_class.parquet")
pd_train["label"] = pd_train["Class"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_train.shape)

(1241, 16094)


In [3]:
# Load pd_test
pd_test = pd.read_parquet("data/testing_class.parquet")
pd_test["label"] = pd_test["Class"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_test.shape)

(286, 16094)


In [4]:
X_train = pd_train.drop(columns=["Class", "label"])
y_train = pd_train["label"]

X_test = pd_test.drop(columns=["Class", "label"])
y_test = pd_test["label"]

In [2]:
X = pd.DataFrame(np.random.randn(1000, 20), columns=[f"feature_{i}" for i in range(20)])
y = np.random.randint(2, size=1000)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# H2O optimization

In [5]:
# Set the number of cores (e.g., 4 cores)
os.environ['OPENBLAS_NUM_THREADS'] = '4'  # Limits to 4 cores
h2o.init(nthreads=4)  # Use 4 cores in H2O

# Convert y_train and y_test to Pandas Series if they are not already
y_train = pd.Series(y_train, name="label")
y_test = pd.Series(y_test, name="label")

# Convert data to H2O frame
train = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
test = h2o.H2OFrame(pd.concat([X_test, y_test], axis=1))

# Define the AutoML settings
aml = H2OAutoML(
    max_runtime_secs=3600, seed=42
)

# Train the model
aml.train(y="label", training_frame=train)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,1 hour 0 mins
H2O_cluster_timezone:,Asia/Ho_Chi_Minh
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,2 months and 2 days
H2O_cluster_name:,H2O_from_python_m12gbs1_28om3f
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,6.207 Gb
H2O_cluster_total_cores:,24
H2O_cluster_allowed_cores:,4


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
11:03:24.226: _train param, Dropping bad and constant columns: [KRFP301, KRFPC1251, KRFP308, KRFP4308, KRFP4309, KRFP4303, KRFPC2588, KRFP4300, KRFPC1253, KRFPC2589, KRFP4306, KRFPC1259, KRFPC2583, KRFP4307, KRFPC1258, KRFP4304, KRFP4305, KRFPC3, KRFPC4, KRFPC2570, KRFPC5, KRFPC2571, KRFPC6, KRFPC9, KRFPC1249, KRFPC1244, KRFPC1243, KRFPC2577, KRFPC1242, KRFPC2578, KRFPC2579, KRFPC1248, KRFPC2572, KRFPC1247, KRFPC2573, KRFPC1246, KRFPC2574, KRFPC2575, KRFPC3890, KRFPC3891, KRFPC1238, KRFPC2569, KRFPC1233, KRFPC3896, KRFP4325, KRFP4322, KRFPC1231, KRFPC2567, KRFP4323, KRFPC1230, KRFPC1237, KRFPC2562, KRFPC1235, KRFPC2563, KRFP4327, KRFPC1234, KRFP4310, KRFP4319, KRFPC1227, KRFPC2558, KRFPC2559, KRFPC1222, KRFPC2554, KRFP4314, KRFPC2555, KRFPC3886, KRFPC1220, KRFPC2556, KRFP4312, KRF

key,value
Stacking strategy,cross_validation
Number of base models (used / total),3/4
# GBM base models (used / total),1/1
# XGBoost base models (used / total),0/1
# GLM base models (used / total),1/1
# DRF base models (used / total),1/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,313.6166,13.45965,310.30746,334.06183,299.08347,305.94763,318.68268
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mae,0.3969375,0.0129921,0.3900272,0.4189075,0.3892331,0.3878588,0.398661
mean_residual_deviance,0.1987999,0.0136939,0.1946315,0.2209641,0.1837612,0.194984,0.1996586
mse,0.1987999,0.0136939,0.1946315,0.2209641,0.1837612,0.194984,0.1996586
null_deviance,61.476204,0.7866023,61.465958,61.234856,62.339455,60.308876,62.03187
r2,0.1954729,0.0520215,0.206803,0.1146563,0.2597738,0.2009988,0.1951327
residual_deviance,49.31558,2.913567,48.65788,53.915253,46.124058,47.966057,49.91465
rmse,0.4456629,0.0151868,0.4411706,0.4700683,0.4286737,0.4415699,0.4468317
rmsle,0.3123218,0.0122779,0.3058923,0.3329904,0.3016469,0.3078792,0.3132003


In [6]:
# Get the best model and evaluate
lb = aml.leaderboard
print(lb)
best_model = aml.leader
print("Best model performance:", best_model.model_performance(test))

model_id                                                     rmse       mse       mae     rmsle    mean_residual_deviance
StackedEnsemble_BestOfFamily_2_AutoML_2_20241101_110324  0.44575   0.198693  0.396854  0.312419                  0.198693
StackedEnsemble_BestOfFamily_7_AutoML_2_20241101_110324  0.445838  0.198771  0.401854  0.312758                  0.198771
StackedEnsemble_BestOfFamily_4_AutoML_2_20241101_110324  0.44614   0.199041  0.396151  0.312528                  0.199041
StackedEnsemble_BestOfFamily_5_AutoML_2_20241101_110324  0.446337  0.199217  0.3962    0.31257                   0.199217
StackedEnsemble_AllModels_1_AutoML_2_20241101_110324     0.44686   0.199684  0.397454  0.313057                  0.199684
StackedEnsemble_AllModels_4_AutoML_2_20241101_110324     0.446992  0.199802  0.395158  0.313016                  0.199802
StackedEnsemble_AllModels_3_AutoML_2_20241101_110324     0.447153  0.199946  0.39546   0.313199                  0.199946
StackedEnsemble_BestOfFa

In [8]:
from sklearn.metrics import precision_score, recall_score

# Get predictions and calculate additional metrics if needed
preds = aml.leader.predict(test).as_data_frame()
y_pred = (preds['predict'] > 0.5).astype(int)  # Binarize predictions for threshold of 0.5

# Calculate precision, recall, and sensitivity
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)  # Same as sensitivity for positive class
auc = roc_auc_score(y_test, y_pred)
print("Precision:", precision)
print("Recall (Sensitivity):", recall)
print("AUC:", auc)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Precision: 0.9313725490196079
Recall (Sensitivity): 0.8597285067873304
AUC: 0.8221719457013574





# PCA optimization