In [None]:
import os
import pandas as pd
import h2o
from h2o.automl import H2OAutoML

In [81]:
# Run configuration shared by every cell below.
# `date` names the dated sub-folder under `data_dir` and prefixes the train/test file names.
date = '20250420'
# Inputs are read from `data_dir`; saved models go under `output_dir`.
data_dir = "../data/"
output_dir = "../output/"
# CSV (inside `data_dir`) that flags which features belong to which model variant.
featureset = "featureset.csv"

#### Run H2O AutoML

In [None]:
# Initialise H2O for this notebook, capping its memory at 8G (max_mem_size).
h2o.init(max_mem_size="8G")

In [83]:
def get_aml_model(train_path: str,
                  test_path: str,
                  vars: list, random_seed: int) -> tuple:
    """Train an H2O AutoML classifier on a column subset of the training file.

    Both files are imported from ``os.path.join(data_dir, date, <file name>)``
    (``data_dir`` and ``date`` are notebook-level settings), reduced to the
    columns listed in ``vars``, and their ``'y'`` column is converted to a
    factor so that AutoML treats the task as classification.

    Args:
        train_path: File name of the training CSV inside the dated data folder.
        test_path: File name of the test CSV inside the dated data folder.
        vars: Column names to keep; must contain the target column ``'y'``.
        random_seed: Seed passed to ``H2OAutoML``.

    Returns:
        A ``(aml, train, test)`` tuple: the trained ``H2OAutoML`` object and
        the prepared train and test frames. The test frame is only prepared
        and returned; it is not passed to AutoML here.

    Raises:
        ValueError: If ``'y'`` is not listed in ``vars``.
    """
    target = 'y'
    # Fail before any file is loaded; the original only failed later, at x.remove(y).
    if target not in vars:
        raise ValueError(f"target column {target!r} must be included in vars")

    train = h2o.import_file(os.path.join(data_dir, date, train_path))
    train = train[vars]
    test = h2o.import_file(os.path.join(data_dir, date, test_path))
    test = test[vars]

    # Predictors are every selected column except the target (built without
    # mutating the list returned by `train.columns`).
    predictors = [col for col in train.columns if col != target]

    train[target] = train[target].asfactor()
    test[target] = test[target].asfactor()

    aml = H2OAutoML(max_models=20, seed=random_seed, exclude_algos=["StackedEnsemble"], verbosity="debug")
    aml.train(x=predictors, y=target, training_frame=train)

    return aml, train, test

In [None]:
# Load the feature-set table and, for each model variant, collect the names in
# the 'Feature' column whose variant flag column equals 1.
df = pd.read_csv(os.path.join(data_dir, featureset))
minimum_model_features = df.loc[df['minimum model'] == 1, 'Feature'].tolist()
compact_model_features = df.loc[df['compact model'] == 1, 'Feature'].tolist()
full_model_features = df.loc[df['full model'] == 1, 'Feature'].tolist()

In [86]:
# Column selections handed to get_aml_model: each variant's features followed by the target 'y'.
vars_full = [*full_model_features, 'y']
vars_minimum = [*minimum_model_features, 'y']
vars_compact = [*compact_model_features, 'y']

#### Run H2O AutoML for Full Model

In [None]:
# Train AutoML on the full-model columns (seed 780); keeps the AutoML object
# together with the prepared train and test frames.
aml_DIC, train, test = get_aml_model(f'{date}_train_df_model1.csv', f"{date}_test_df_model1.csv", vars_full, 780)

#### Check the Results of H2O AutoML


In [None]:
# Display the complete leaderboard: head() is asked for as many rows as the frame has.
lb_DIC = aml_DIC.leaderboard
lb_DIC.head(rows=lb_DIC.nrows)

In [None]:
# Fetch the model to be saved, identified by a model ID copied by hand from the leaderboard.
# NOTE(review): the ID contains what looks like a run timestamp (20250421_192034), so it
# presumably exists only in the H2O session that produced it. After re-running AutoML,
# replace it with an ID from the new leaderboard (or use aml_DIC.leader) - TODO confirm.
model_id = "XGBoost_grid_1_AutoML_1_20250421_192034_model_2"
best_model = h2o.get_model(model_id)

# Optionally retrieve the second and third models for comparison, if necessary.
#second_model_id = "GBM_grid_1_AutoML_1_20250421_192034_model_1"
#second_model = h2o.get_model(second_model_id)
#third_model_id = "GBM_grid_1_AutoML_1_20250421_192034_model_2"
#third_model = h2o.get_model(third_model_id)

In [None]:
# Save the selected full-model to disk and keep the path returned by h2o.save_model.
# NOTE(review): the folder name carries a literal "24" in front of {date}, which
# already holds a four-digit year - confirm this prefix is intended.
model_path_full = h2o.save_model(
    model=best_model,
    path=os.path.join(output_dir, f'h2omodels/model1/h2omodel_24{date}_best'),
    force=True,
)
# Optional: save the runner-up models as well (requires second_model / third_model).
#second_model_path_full = h2o.save_model(model=second_model, path=os.path.join(output_dir, f'h2omodels/model1/h2omodel_24{date}_second'), force=True)
#third_model_path_full = h2o.save_model(model=third_model, path=os.path.join(output_dir, f'h2omodels/model1/h2omodel_24{date}_third'), force=True)

In [None]:
# Record the saved full-model path in a small text file inside the `{date}` folder.
# That folder is relative to the working directory and may not exist yet; create it
# first so open(..., 'w') does not fail with FileNotFoundError.
os.makedirs(date, exist_ok=True)
with open(f'{date}/model_path_best_latest_model1.txt', 'w') as f:
    f.write(f"best_model: {model_path_full}\n")
    #f.write(f"second_model: {second_model_path_full}\n")
    #f.write(f"third_model: {third_model_path_full}\n")

#### Run H2O AutoML for Minimum Model

In [None]:
# Initialise H2O again before the minimum-model run (same 8G memory cap).
# NOTE(review): presumably only needed when the kernel/cluster was restarted
# between sections - confirm.
h2o.init(max_mem_size="8G")

In [None]:
# Train AutoML on the minimum-model columns (seed 710), using the same train/test files
# as the full model.
aml_DIC_minimum, train_minimum, test_minimum = get_aml_model(f'{date}_train_df_model1.csv', f"{date}_test_df_model1.csv", vars_minimum, 710)

In [None]:
# Display the complete minimum-model leaderboard (as many rows as the frame has).
lb_DIC_minimum = aml_DIC_minimum.leaderboard
lb_DIC_minimum.head(rows=lb_DIC_minimum.nrows)

In [None]:
# Fetch the minimum-model to be saved, identified by a model ID copied by hand from the leaderboard.
# NOTE(review): the ID contains what looks like a run timestamp (20250421_183521), so it
# presumably exists only in the H2O session that produced it. After re-running AutoML,
# replace it with an ID from the new leaderboard (or use aml_DIC_minimum.leader) - TODO confirm.
# Note that `model_id` and `best_model` are rebound here; they no longer refer to the full model.
model_id = "GBM_5_AutoML_1_20250421_183521"
best_model = h2o.get_model(model_id)
# Optionally retrieve the second and third models for comparison, if necessary.
#second_model_id = "XGBoost_3_AutoML_1_20250421_183521"
#second_model = h2o.get_model(second_model_id)
#third_model_id = "GBM_1_AutoML_1_20250421_183521"
#third_model = h2o.get_model(third_model_id)

In [None]:
# Save the selected minimum-model to disk and keep the path returned by h2o.save_model.
# The name `model_path_full` is reused from the full-model section; from here on it
# holds the minimum-model path.
# NOTE(review): the folder name carries a literal "24" in front of {date}, which
# already holds a four-digit year - confirm this prefix is intended.
model_path_full = h2o.save_model(
    model=best_model,
    path=os.path.join(output_dir, f'h2omodels/model1_minimum/h2omodel_24{date}_best'),
    force=True,
)
# Optional: save the runner-up models as well (requires second_model / third_model).
#second_model_path_full = h2o.save_model(model=second_model, path=os.path.join(output_dir, f'h2omodels/model1_minimum/h2omodel_24{date}_second'), force=True)
#third_model_path_full = h2o.save_model(model=third_model, path=os.path.join(output_dir, f'h2omodels/model1_minimum/h2omodel_24{date}_third'), force=True)

In [None]:
# Record the saved minimum-model path in a small text file inside the `{date}` folder.
# That folder is relative to the working directory and may not exist yet; create it
# first so open(..., 'w') does not fail with FileNotFoundError.
os.makedirs(date, exist_ok=True)
with open(f'{date}/model_path_best_latest_model1_minimum.txt', 'w') as f:
    f.write(f"best_model: {model_path_full}\n")
    #f.write(f"second_model: {second_model_path_full}\n")
    #f.write(f"third_model: {third_model_path_full}\n")

#### Run H2O AutoML for Compact Model

In [None]:
# Initialise H2O again before the compact-model run (same 8G memory cap).
# NOTE(review): presumably only needed when the kernel/cluster was restarted
# between sections - confirm.
h2o.init(max_mem_size="8G")

In [None]:
# Train AutoML on the compact-model columns (seed 710), using the same train/test files
# as the full model.
aml_DIC_compact, train_compact, test_compact = get_aml_model(f'{date}_train_df_model1.csv', f"{date}_test_df_model1.csv", vars_compact, 710)

In [None]:
# Display the complete compact-model leaderboard (as many rows as the frame has).
lb_DIC_compact = aml_DIC_compact.leaderboard
lb_DIC_compact.head(rows=lb_DIC_compact.nrows)

In [None]:
# Fetch the compact-model to be saved, identified by a model ID copied by hand from the leaderboard.
# NOTE(review): the ID contains what looks like a run timestamp (20250421_184837), so it
# presumably exists only in the H2O session that produced it. After re-running AutoML,
# replace it with an ID from the new leaderboard (or use aml_DIC_compact.leader) - TODO confirm.
# Note that `model_id` and `best_model` are rebound here; they no longer refer to earlier models.
model_id = "GBM_1_AutoML_1_20250421_184837"
best_model = h2o.get_model(model_id)
# Optionally retrieve the second and third models for comparison, if necessary
# (fill in the IDs from the leaderboard first).
#second_model_id = ""
#second_model = h2o.get_model(second_model_id)
#third_model_id = ""
#third_model = h2o.get_model(third_model_id)

In [None]:
# Save the selected compact-model to disk and keep the path returned by h2o.save_model.
# The name `model_path_full` is reused from the earlier sections; from here on it
# holds the compact-model path.
# NOTE(review): the folder name carries a literal "24" in front of {date}, which
# already holds a four-digit year - confirm this prefix is intended.
model_path_full = h2o.save_model(
    model=best_model,
    path=os.path.join(output_dir, f'h2omodels/model1_compact/h2omodel_24{date}_best'),
    force=True,
)
# Optional: save the runner-up models as well (requires second_model / third_model).
#second_model_path_full = h2o.save_model(model=second_model, path=os.path.join(output_dir, f'h2omodels/model1_compact/h2omodel_24{date}_second'), force=True)
#third_model_path_full = h2o.save_model(model=third_model, path=os.path.join(output_dir, f'h2omodels/model1_compact/h2omodel_24{date}_third'), force=True)

In [None]:
# Record the saved compact-model path in a small text file inside the `{date}` folder.
# That folder is relative to the working directory and may not exist yet; create it
# first so open(..., 'w') does not fail with FileNotFoundError.
os.makedirs(date, exist_ok=True)
with open(f'{date}/model_path_best_latest_model1_compact.txt', 'w') as f:
    f.write(f"best_model: {model_path_full}\n")
    #f.write(f"second_model: {second_model_path_full}\n")
    #f.write(f"third_model: {third_model_path_full}\n")