In [1]:
# Import necessary libraries
import h2o
from h2o.automl import H2OAutoML
import logging
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Initialize the H2O cluster
h2o.init()

# Setting up the logger
logging.basicConfig(level=logging.INFO, filename='h2o_automl.log', filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

# Create directories for storing graphs
os.makedirs('graphs_h2o_automl', exist_ok=True)

# Load Data
logger.info("Loading datasets...")
train_h2o = h2o.import_file("train_h2o_processed.csv")
test_h2o = h2o.import_file("test_h2o_processed.csv")
logger.info("Datasets loaded successfully.")
logger.info(f"Train dataset shape: {train_h2o.shape}")
logger.info(f"Test dataset shape: {test_h2o.shape}")

# Split data into training and validation sets
train, valid = train_h2o.split_frame(ratios=[0.8], seed=42)

# Define target and features
y = 'Response'
x = train.columns
x.remove(y)

# Run H2O AutoML
logger.info("Starting H2O AutoML...")
aml = H2OAutoML(max_models=20, seed=42, project_name="h2o_automl_project", balance_classes=True)
aml.train(x=x, y=y, training_frame=train, validation_frame=valid)
logger.info("H2O AutoML completed.")

# Get leaderboard and best model
lb = aml.leaderboard
best_model = aml.leader
logger.info(f"Best model: {best_model.model_id}")
logger.info(f"Leaderboard: {lb}")

# Save leaderboard
lb_path = "h2o_automl_leaderboard.csv"
h2o.export_file(lb, path=lb_path, force=True)
logger.info(f"Leaderboard saved to {lb_path}")

# Save the best model
model_path = h2o.save_model(model=best_model, path="best_h2o_automl_model", force=True)
logger.info(f"Best model saved to {model_path}")

# Plot feature importances
plt.figure(figsize=(12, 8))
feature_importance = best_model.varimp(use_pandas=True)
sns.barplot(x='scaled_importance', y='variable', data=feature_importance[:10])
plt.title('H2O AutoML Feature Importances')
plt.savefig('graphs_h2o_automl/h2o_feature_importances.png')
plt.show()
logger.info('H2O feature importances plot saved.')

# Shutdown H2O cluster
h2o.shutdown(prompt=False)


Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 22.0.1+8-16, mixed mode, sharing)
  Starting server from C:\Users\paulo\anaconda3\envs\h2o_env\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\paulo\AppData\Local\Temp\tmpk1itzi3w
  JVM stdout: C:\Users\paulo\AppData\Local\Temp\tmpk1itzi3w\h2o_paulo_started_from_python.out
  JVM stderr: C:\Users\paulo\AppData\Local\Temp\tmpk1itzi3w\h2o_paulo_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Costa_Rica
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.4
H2O_cluster_version_age:,2 days
H2O_cluster_name:,H2O_from_python_paulo_iibyru
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.960 Gb
H2O_cluster_total_cores:,0
H2O_cluster_allowed_cores:,0


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
12:53:32.636: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.
12:53:32.642: AutoML: XGBoost is not available; skipping it.
12:53:32.668: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target column to categorical before training.

████████████████████████████████
12:54:39.487: _response param, We have detected that your response column has only 2 unique values (0/1). If you wish to train a binary model instead of a regression model, convert your target