In [None]:
import os
import pandas as pd
import h2o
from h2o.automl import H2OAutoML

In [81]:
# Date tag of the data snapshot: sub-folder of data_dir and file-name prefix of the train/test CSVs
date = "20250420"
# Root folder of the input data (feature-set CSV and dated train/test folders)
data_dir = '../data/'
# Root folder under which the trained H2O models are saved
output_dir = '../output/'
# CSV inside data_dir that flags which features belong to each model variant
featureset = 'featureset.csv'

### Run H2O AutoML


In [82]:
# Connect to (or start) a local H2O cluster, requesting at most 8G of memory for it
h2o.init(max_mem_size="8G")

Checking whether there is an H2O instance running at http://localhost:54321. connected.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_451"; Java(TM) SE Runtime Environment (build 1.8.0_451-b10); Java HotSpot(TM) 64-Bit Server VM (build 25.451-b10, mixed mode)
  Starting server from /Users/fujimoto/Desktop/MedICU/research-dic-prediction-2025/scripts/myenv/lib/python3.13/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/mw/7366jqyd24db0gsndhcd1cl00000gn/T/tmpkgfozy7z
  JVM stdout: /var/folders/mw/7366jqyd24db0gsndhcd1cl00000gn/T/tmpkgfozy7z/h2o_fujimoto_started_from_python.out
  JVM stderr: /var/folders/mw/7366jqyd24db0gsndhcd1cl00000gn/T/tmpkgfozy7z/h2o_fujimoto_started_from_python.err
  Server is running at http://127.0.0.1:54345
Connecting to H2O server at http://127.0.0.1:54345 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Asia/Tokyo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,25 days
H2O_cluster_name:,H2O_from_python_fujimoto_pp3ux7
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.097 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [83]:
def get_aml_model(train_path: str,
                  test_path: str,
                  vars: list, random_seed: int) -> tuple:
    """Run H2O AutoML on a column subset of the dated train file.

    Both CSV files are read from ``os.path.join(data_dir, date, <file name>)``
    (``data_dir`` and ``date`` are notebook-level settings) and restricted to
    the columns listed in ``vars``. The response column ``'y'`` is converted to
    a factor in both frames so that AutoML treats the task as classification.

    Args:
        train_path: File name of the training CSV inside ``data_dir/date``.
        test_path: File name of the test CSV inside ``data_dir/date``.
        vars: Column names to keep; must include the response column ``'y'``.
        random_seed: Seed passed to ``H2OAutoML``.

    Returns:
        A tuple ``(aml, train, test)``: the trained ``H2OAutoML`` object and the
        prepared train and test frames. The test frame is only loaded and
        prepared here; it is not passed to ``aml.train``.

    Raises:
        ValueError: If ``'y'`` is not listed in ``vars``.
    """
    y = 'y'
    # Fail before any file is parsed instead of deep inside the column handling.
    if y not in vars:
        raise ValueError(f"vars must contain the response column '{y}'")

    train = h2o.import_file(os.path.join(data_dir, date, train_path))
    train = train[vars]
    test = h2o.import_file(os.path.join(data_dir, date, test_path))
    test = test[vars]

    # Predictors are every selected column except the response.
    x = [col for col in train.columns if col != y]

    train[y] = train[y].asfactor()
    test[y] = test[y].asfactor()

    # Up to 20 models, no stacked ensembles; "debug" verbosity prints the AutoML event log.
    aml = H2OAutoML(max_models=20, seed=random_seed, exclude_algos=["StackedEnsemble"], verbosity="debug")
    aml.train(x=x, y=y, training_frame=train)

    return aml, train, test

In [None]:
# Read the feature-set table. Each '<variant> model' column marks with 1 the
# rows whose 'Feature' name belongs to that model variant.
df = pd.read_csv(os.path.join(data_dir, featureset))
full_model_features = df.loc[df['full model'] == 1, 'Feature'].tolist()
compact_model_features = df.loc[df['compact model'] == 1, 'Feature'].tolist()
minimum_model_features = df.loc[df['minimum model'] == 1, 'Feature'].tolist()

In [86]:
# Column selections handed to get_aml_model: the variant's features plus the response column 'y'
vars_minimum = [*minimum_model_features, 'y']
vars_compact = [*compact_model_features, 'y']
vars_full = [*full_model_features, 'y']

#### Run H2O AutoML for full model

In [87]:
# Full model: AutoML on the full feature list.
# NOTE(review): seed is 780 here while the minimum and compact runs use 710 - confirm this is intended.
aml_DIC, train, test = get_aml_model(
    train_path=f'{date}_train_df_model1.csv',
    test_path=f"{date}_test_df_model1.csv",
    vars=vars_full,
    random_seed=780,
)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
19:20:35.13: Project: AutoML_1_20250421_192034
19:20:35.15: 5-fold cross-validation will be used.
19:20:35.30: Setting stopping tolerance adaptively based on the training frame: 0.006505725117543193
19:20:35.30: Build control seed: 780
19:20:35.32: training frame: Frame key: AutoML_1_20250421_192034_training_py_47_sid_8209    cols: 64    rows: 23627  chunks: 32    size: 3465522  checksum: 4604309751270896018
19:20:35.32: validation frame: NULL
19:20:35.32: leaderboard frame: NULL
19:20:35.32: blending frame: NULL
19:20:35.32: response column: y
19:20:35.32: fold column: null
19:20:35.32: weights column: null
19:20:35.51: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (7g, 30w)]}, {GLM : [def_1 (1g, 10w)]}, {DRF 

### Check the Results of H2O AutoML

In [89]:
# Leaderboard of the models trained by the full-model AutoML run
lb_DIC = aml_DIC.leaderboard
# Request every row: head() with rows set to the leaderboard's total row count
lb_DIC.head(rows=lb_DIC.nrows)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
XGBoost_grid_1_AutoML_1_20250421_192034_model_2,0.927267,0.0794427,0.289099,0.312588,0.145423,0.0211479
GBM_grid_1_AutoML_1_20250421_192034_model_1,0.925709,0.0816363,0.266839,0.274278,0.147066,0.0216285
GBM_grid_1_AutoML_1_20250421_192034_model_2,0.92409,0.0829687,0.266857,0.282539,0.14787,0.0218654
GBM_1_AutoML_1_20250421_192034,0.922109,0.0821158,0.253244,0.275591,0.147257,0.0216847
XGBoost_grid_1_AutoML_1_20250421_192034_model_3,0.921641,0.0814537,0.270418,0.276948,0.147114,0.0216426
GBM_2_AutoML_1_20250421_192034,0.920818,0.0867704,0.258711,0.301347,0.149699,0.0224099
XGBoost_grid_1_AutoML_1_20250421_192034_model_1,0.91796,0.0833839,0.275988,0.276215,0.147441,0.021739
XGBoost_3_AutoML_1_20250421_192034,0.917024,0.0844456,0.265842,0.266266,0.147965,0.0218937
GBM_5_AutoML_1_20250421_192034,0.916999,0.0950583,0.220422,0.285814,0.156123,0.0243745
GBM_3_AutoML_1_20250421_192034,0.916676,0.0889034,0.245189,0.292245,0.150384,0.0226154


In [None]:
# Take the top-ranked model of the full-model run from the AutoML object itself.
# Model IDs embed the run timestamp (e.g. "..._AutoML_1_20250421_192034_..."), so a
# pasted ID stops resolving as soon as AutoML is re-run or the cluster is restarted.
# To save a different model, assign its leaderboard ID to model_id and use h2o.get_model(model_id).
best_model = aml_DIC.leader
model_id = best_model.model_id

# Retrieve second and third models for comparison if necessary
#second_model_id = "GBM_grid_1_AutoML_1_20250421_192034_model_1"
#second_model = h2o.get_model(second_model_id)
#third_model_id = "GBM_grid_1_AutoML_1_20250421_192034_model_2"
#third_model = h2o.get_model(third_model_id)

In [None]:
# Save the selected full model under output_dir; force=True allows overwriting an existing save.
# NOTE(review): with date = "20250420" the folder name becomes "h2omodel_2420250420_best";
# confirm the literal "24" prefix is still wanted.
model_path_full = h2o.save_model(
    model=best_model,
    path=os.path.join(output_dir, f'h2omodels/model1/h2omodel_24{date}_best'),
    force=True,
)

#second_model_path_full = h2o.save_model(model=second_model, path=os.path.join(output_dir, f'h2omodels/model1/h2omodel_24{date}_second'), force=True)
#third_model_path_full = h2o.save_model(model=third_model, path=os.path.join(output_dir, f'h2omodels/model1/h2omodel_24{date}_third'), force=True)

In [None]:
# Record the saved full-model path in a text file inside the '<date>/' folder
# (relative to the working directory). open(..., 'w') does not create missing
# folders, so make sure the folder exists first.
os.makedirs(date, exist_ok=True)
with open(f'{date}/model_path_best_latest_model1.txt', 'w') as f:
    f.write(f"best_model: {model_path_full}\n")
    #f.write(f"second_model: {second_model_path_full}\n")
    #f.write(f"third_model: {third_model_path_full}\n")

#### Run H2O AutoML for minimum model


In [65]:
# Minimum model: AutoML on the minimum feature list, same train/test files as the full model
aml_DIC_minimum, train_minimum, test_minimum = get_aml_model(
    train_path=f'{date}_train_df_model1.csv',
    test_path=f"{date}_test_df_model1.csv",
    vars=vars_minimum,
    random_seed=710,
)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
18:35:21.319: Project: AutoML_1_20250421_183521
18:35:21.320: 5-fold cross-validation will be used.
18:35:21.333: Setting stopping tolerance adaptively based on the training frame: 0.006505725117543193
18:35:21.333: Build control seed: 710
18:35:21.335: training frame: Frame key: AutoML_1_20250421_183521_training_py_37_sid_9482    cols: 5    rows: 23627  chunks: 32    size: 222988  checksum: 1451434568359112392
18:35:21.335: validation frame: NULL
18:35:21.336: leaderboard frame: NULL
18:35:21.336: blending frame: NULL
18:35:21.336: response column: y
18:35:21.336: fold column: null
18:35:21.336: weights column: null
18:35:21.358: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (7g, 30w)]}, {GLM : [def_1 (1g, 10w

In [93]:
# Leaderboard of the models trained by the minimum-model AutoML run
lb_DIC_minimum = aml_DIC_minimum.leaderboard
# Request every row: head() with rows set to the leaderboard's total row count
lb_DIC_minimum.head(rows=lb_DIC_minimum.nrows)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GBM_5_AutoML_1_20250421_183521,0.857842,0.0977169,0.154221,0.345948,0.154942,0.0240071
XGBoost_3_AutoML_1_20250421_183521,0.855983,0.0961244,0.160881,0.376658,0.153003,0.0234098
GBM_1_AutoML_1_20250421_183521,0.855981,0.0949462,0.16362,0.360255,0.151984,0.0230992
GBM_2_AutoML_1_20250421_183521,0.855136,0.0970462,0.150052,0.341954,0.153981,0.02371
XGBoost_1_AutoML_1_20250421_183521,0.854178,0.0961085,0.16171,0.365591,0.15277,0.0233387
XGBoost_grid_1_AutoML_1_20250421_183521_model_1,0.85399,0.0965982,0.156171,0.361841,0.153375,0.0235239
GBM_3_AutoML_1_20250421_183521,0.850588,0.0981584,0.145308,0.351008,0.154159,0.023765
XGBoost_grid_1_AutoML_1_20250421_183521_model_2,0.847869,0.0984636,0.147791,0.360278,0.154196,0.0237763
XGBoost_2_AutoML_1_20250421_183521,0.845031,0.0992152,0.145592,0.369879,0.154554,0.0238868
XGBoost_grid_1_AutoML_1_20250421_183521_model_3,0.844786,0.0994074,0.150835,0.377484,0.154408,0.0238417


In [None]:
# Take the top-ranked model of the minimum-model run from the AutoML object itself.
# Model IDs embed the run timestamp (e.g. "..._AutoML_1_20250421_183521"), so a
# pasted ID stops resolving as soon as AutoML is re-run or the cluster is restarted.
# To save a different model, assign its leaderboard ID to model_id and use h2o.get_model(model_id).
best_model = aml_DIC_minimum.leader
model_id = best_model.model_id

# Retrieve second and third models for comparison if necessary
#second_model_id = "XGBoost_3_AutoML_1_20250421_183521"
#second_model = h2o.get_model(second_model_id)
#third_model_id = "GBM_1_AutoML_1_20250421_183521"
#third_model = h2o.get_model(third_model_id)

In [None]:
# Save the selected minimum model under output_dir; force=True allows overwriting an existing save.
# NOTE(review): with date = "20250420" the folder name becomes "h2omodel_2420250420_best";
# confirm the literal "24" prefix is still wanted.
model_path_full = h2o.save_model(
    model=best_model,
    path=os.path.join(output_dir, f'h2omodels/model1_minimum/h2omodel_24{date}_best'),
    force=True,
)
#second_model_path_full = h2o.save_model(model=second_model, path=os.path.join(output_dir, f'h2omodels/model1_minimum/h2omodel_24{date}_second'), force=True)
#third_model_path_full = h2o.save_model(model=third_model, path=os.path.join(output_dir, f'h2omodels/model1_minimum/h2omodel_24{date}_third'), force=True)

In [None]:
# Record the saved minimum-model path in a text file inside the '<date>/' folder
# (relative to the working directory). open(..., 'w') does not create missing
# folders, so make sure the folder exists first.
os.makedirs(date, exist_ok=True)
with open(f'{date}/model_path_best_latest_model1_minimum.txt', 'w') as f:
    f.write(f"best_model: {model_path_full}\n")
    #f.write(f"second_model: {second_model_path_full}\n")
    #f.write(f"third_model: {third_model_path_full}\n")

#### Run H2O AutoML for compact model

In [68]:
# Compact model: AutoML on the compact feature list, same train/test files as the full model
aml_DIC_compact, train_compact, test_compact = get_aml_model(
    train_path=f'{date}_train_df_model1.csv',
    test_path=f"{date}_test_df_model1.csv",
    vars=vars_compact,
    random_seed=710,
)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |
18:48:37.138: Project: AutoML_1_20250421_184837
18:48:37.139: 5-fold cross-validation will be used.
18:48:37.151: Setting stopping tolerance adaptively based on the training frame: 0.006505725117543193
18:48:37.151: Build control seed: 710
18:48:37.155: training frame: Frame key: AutoML_1_20250421_184837_training_py_42_sid_bcfd    cols: 25    rows: 23627  chunks: 32    size: 766581  checksum: 8457199322190453172
18:48:37.155: validation frame: NULL
18:48:37.155: leaderboard frame: NULL
18:48:37.155: blending frame: NULL
18:48:37.155: response column: y
18:48:37.155: fold column: null
18:48:37.155: weights column: null
18:48:37.172: Loading execution steps: [{XGBoost : [def_2 (1g, 10w), def_1 (2g, 10w), def_3 (3g, 10w), grid_1 (4g, 90w), lr_search (7g, 30w)]}, {GLM : [def_1 (1g, 10

In [71]:
# Leaderboard of the models trained by the compact-model AutoML run
lb_DIC_compact = aml_DIC_compact.leaderboard
# Request every row: head() with rows set to the leaderboard's total row count
lb_DIC_compact.head(rows=lb_DIC_compact.nrows)

model_id,auc,logloss,aucpr,mean_per_class_error,rmse,mse
GBM_1_AutoML_1_20250421_184837,0.898484,0.0870794,0.225512,0.347443,0.148932,0.0221808
XGBoost_grid_1_AutoML_1_20250421_184837_model_2,0.898229,0.0896516,0.226978,0.296713,0.15022,0.0225662
GBM_5_AutoML_1_20250421_184837,0.895577,0.0949914,0.203553,0.308611,0.154522,0.0238771
XGBoost_grid_1_AutoML_1_20250421_184837_model_1,0.895514,0.088589,0.227149,0.333352,0.150123,0.0225368
GBM_grid_1_AutoML_1_20250421_184837_model_2,0.894664,0.0876524,0.231902,0.302326,0.148605,0.0220833
XGBoost_1_AutoML_1_20250421_184837,0.893536,0.0896118,0.217453,0.324428,0.150718,0.022716
GBM_2_AutoML_1_20250421_184837,0.893499,0.0904963,0.227381,0.306049,0.150755,0.0227269
XGBoost_3_AutoML_1_20250421_184837,0.891178,0.0884784,0.241161,0.31888,0.148666,0.0221016
GBM_3_AutoML_1_20250421_184837,0.890716,0.0923608,0.206473,0.29932,0.151978,0.0230975
XGBoost_2_AutoML_1_20250421_184837,0.886831,0.0925466,0.219491,0.310191,0.151194,0.0228598


In [None]:
# Take the top-ranked model of the compact-model run from the AutoML object itself.
# Model IDs embed the run timestamp (e.g. "..._AutoML_1_20250421_184837"), so a
# pasted ID stops resolving as soon as AutoML is re-run or the cluster is restarted.
# To save a different model, assign its leaderboard ID to model_id and use h2o.get_model(model_id).
best_model = aml_DIC_compact.leader
model_id = best_model.model_id

# Retrieve second and third models for comparison if necessary
#second_model_id = "XGBoost_grid_1_AutoML_1_20250421_184837_model_2"
#second_model = h2o.get_model(second_model_id)
#third_model_id = "GBM_5_AutoML_1_20250421_184837"
#third_model = h2o.get_model(third_model_id)

In [None]:
# Save the selected compact model under output_dir; force=True allows overwriting an existing save.
# NOTE(review): with date = "20250420" the folder name becomes "h2omodel_2420250420_best";
# confirm the literal "24" prefix is still wanted.
model_path_full = h2o.save_model(
    model=best_model,
    path=os.path.join(output_dir, f'h2omodels/model1_compact/h2omodel_24{date}_best'),
    force=True,
)
#second_model_path_full = h2o.save_model(model=second_model, path=os.path.join(output_dir, f'h2omodels/model1_compact/h2omodel_24{date}_second'), force=True)
#third_model_path_full = h2o.save_model(model=third_model, path=os.path.join(output_dir, f'h2omodels/model1_compact/h2omodel_24{date}_third'), force=True)

In [None]:
# Record the saved compact-model path in a text file inside the '<date>/' folder
# (relative to the working directory). open(..., 'w') does not create missing
# folders, so make sure the folder exists first.
os.makedirs(date, exist_ok=True)
with open(f'{date}/model_path_best_latest_model1_compact.txt', 'w') as f:
    f.write(f"best_model: {model_path_full}\n")
    #f.write(f"second_model: {second_model_path_full}\n")
    #f.write(f"third_model: {third_model_path_full}\n")