In [1]:
import h2o
from h2o.automl import H2OAutoML

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score

import pandas as pd
import numpy as np

In [2]:
# Load pd_train
pd_train = pd.read_parquet("data/training_class.parquet")
pd_train["label"] = pd_train["Class"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_train.shape)

(1241, 16094)


In [3]:
# Load pd_test
pd_test = pd.read_parquet("data/testing_class.parquet")
pd_test["label"] = pd_test["Class"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_test.shape)

(286, 16094)


In [4]:
X_train = pd_train.drop(columns=["Class", "label"])
y_train = pd_train["label"]

X_test = pd_test.drop(columns=["Class", "label"])
y_test = pd_test["label"]

In [2]:
X = pd.DataFrame(np.random.randn(1000, 20), columns=[f"feature_{i}" for i in range(20)])
y = np.random.randint(2, size=1000)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# H2O optimization

In [None]:
# Initialize H2O cluster
h2o.init()

# Convert y_train and y_test to Pandas Series if they are not already
y_train = pd.Series(y_train, name="label")
y_test = pd.Series(y_test, name="label")

# Convert data to H2O frame
train = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))
test = h2o.H2OFrame(pd.concat([X_test, y_test], axis=1))

# Define the AutoML settings
aml = H2OAutoML(
    max_runtime_secs=3600, stopping_metric="AUC", seed=42  # Optimize for AUC
)

# Train the model
aml.train(y="label", training_frame=train)

In [5]:
# Get the best model and evaluate
lb = aml.leaderboard
print(lb)
best_model = aml.leader
print("Best model performance:", best_model.model_performance(test))

model_id                                                    rmse       mse       mae     rmsle    mean_residual_deviance
StackedEnsemble_BestOfFamily_7_AutoML_1_20241101_70557  0.499982  0.249982  0.499733  0.35145                   0.249982
StackedEnsemble_BestOfFamily_1_AutoML_1_20241101_70557  0.500009  0.250009  0.499754  0.351473                  0.250009
StackedEnsemble_BestOfFamily_3_AutoML_1_20241101_70557  0.500318  0.250319  0.500042  0.351662                  0.250319
StackedEnsemble_AllModels_6_AutoML_1_20241101_70557     0.50044   0.25044   0.500141  0.351745                  0.25044
StackedEnsemble_BestOfFamily_8_AutoML_1_20241101_70557  0.500544  0.250544  0.500232  0.351862                  0.250544
StackedEnsemble_AllModels_2_AutoML_1_20241101_70557     0.500612  0.250612  0.499885  0.351815                  0.250612
StackedEnsemble_BestOfFamily_2_AutoML_1_20241101_70557  0.500642  0.250642  0.500321  0.351891                  0.250642
GLM_1_AutoML_1_20241101_70557    