### Inspect H2O.ai AutoML models

You can inspect the H2O.ai models you have created and collected, choose the best one, and then load, inspect and re-use it. Handy in case you lose track of all the models you have created ...


<img src="../KNIME_loves_h2o.png" width="600">


##### KNIME workflow
https://hub.knime.com/-/spaces/-/latest/~GABT_OgeoWxWJW9P/


##### GitHub repository
https://github.com/ml-score/knime_meets_python/tree/main/machine_learning


In [1]:
import glob
import json

import numpy as np
import pandas as pd
import pyarrow.parquet as pq

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, auc, average_precision_score, precision_recall_curve

In [3]:
# Point Java-based software such as H2O at KNIME's own bundled Java runtime.
# The path is the value of the variable java.home from KNIME, see
# https://hub.knime.com/-/spaces/-/latest/~SGv1Cosah8BXabfa/

# KNIME Snippets (2): Unearthing Hidden Node Gems — Managing Missing Values, Row Numbers and some Quick Java and Paths
# https://medium.com/p/3c3c7acb019f

import os

# Your own Java path will look different - check the link and the Medium article about how to find it.
# Windows example:
# KNIME_JAVA_HOME = "C:\\Users\\x123456789\\software\\knime_4.6.1\\plugins\\org.knime.binary.jre.win32.x86_64_17.0.3.20220621\\jre"
KNIME_JAVA_HOME = "/Applications/KNIME 4.7.1.app/Contents/Eclipse/plugins/org.knime.binary.jre.macosx.aarch64_17.0.5.20230320/jre/Contents/Home"

# Only override JAVA_HOME when the configured directory really exists on this machine;
# otherwise a JAVA_HOME that is already set correctly would be replaced by a dead path.
if os.path.isdir(KNIME_JAVA_HOME):
    os.environ["JAVA_HOME"] = KNIME_JAVA_HOME
else:
    print("KNIME Java path not found, JAVA_HOME left unchanged:", KNIME_JAVA_HOME)

# .get() because JAVA_HOME may be unset when the KNIME path above was not found
print("setenv JAVA_HOME", os.environ.get("JAVA_HOME"))

setenv JAVA_HOME /Applications/KNIME 4.7.1.app/Contents/Eclipse/plugins/org.knime.binary.jre.macosx.aarch64_17.0.5.20230320/jre/Contents/Home


In [4]:
# Initiate h2o.
# If a cluster is already running, h2o.init() will connect to the running cluster.
# Installation hints:
# pip uninstall h2o
# pip install -f http://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o

# conda install -c h2oai h2o

import h2o

from h2o.automl import H2OAutoML

h2o.init()

# Under Windows there might be a problem with the progress bar - so you could just turn it off:
# https://forum.knime.com/t/python-script-and-h2o-data-frames-error-under-windows/21099/4?u=mlauber71
# h2o.no_progress()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,34 mins 18 secs
H2O_cluster_timezone:,Europe/Berlin
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.4
H2O_cluster_version_age:,1 month and 5 days
H2O_cluster_name:,H2O_from_python_m_lauber_bsubix
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.868 Gb
H2O_cluster_total_cores:,10
H2O_cluster_allowed_cores:,10


In [5]:
# if you do not want to store the files in the working directory
var_path_data = "../"
var_path_model = "../model/"

In [6]:
# Collect the test metrics stored in the JSON summary of every H2O AutoML
# classification model found in `var_path_model`, best Test_AUCPR first.
metric_columns = ["var_model_name_full", "Test_AUCPR", "Test_AUC"]

# you obviously can adapt the search rules
# (os.path.join avoids the double slash that "../model/" + "/H2O..." would produce)
json_pattern = os.path.join(var_path_model, "H2O_AutoML_Classification*.json")

data_list = []

# glob returns the files in arbitrary order; sorting them makes the result
# (and the tie-breaking between equally good models) reproducible.
for filename in sorted(glob.glob(json_pattern)):
    with open(filename, "r") as f:
        data = json.load(f)

    # skip files that are not JSON objects or that carry no test AUCPR
    if not isinstance(data, dict) or "Test_AUCPR" not in data:
        continue

    data_list.append(
        {
            "var_model_name_full": data["var_model_name_full"],
            "Test_AUCPR": float(data["Test_AUCPR"]),
            # a missing Test_AUC becomes NaN instead of aborting the whole scan
            "Test_AUC": float(data.get("Test_AUC", "nan")),
        }
    )

# Explicit columns keep the frame sortable even when no file matched at all.
df = pd.DataFrame(data_list, columns=metric_columns)
# A stable sort keeps the file-name order among models with identical Test_AUCPR.
df = df.sort_values(by="Test_AUCPR", ascending=False, kind="stable")

In [7]:
# show the first ten rows; the frame was sorted by Test_AUCPR in descending order
df.head(10)

Unnamed: 0,var_model_name_full,Test_AUCPR,Test_AUC
0,H2O_AutoML_Classification_20230224_2014h_jupyter,0.8243,0.92864
2,H2O_AutoML_Classification_20230319_1835h_jupyter,0.8243,0.92864
1,H2O_AutoML_Classification_20230603_1116h_jupyter,0.82083,0.92714


In [8]:
# Pick the model with the highest test AUCPR; with ties the first matching row wins.
aucpr_scores = df["Test_AUCPR"]
max_aucpr = aucpr_scores.max()

top_model_names = df.loc[aucpr_scores == max_aucpr, "var_model_name_full"]
var_model_name_full = top_model_names.iloc[0]

print(f"Highest AUCPR: {max_aucpr}, Model name: {var_model_name_full}")

Highest AUCPR: 0.8243, Model name: H2O_AutoML_Classification_20230224_2014h_jupyter


## Apply the H2O.ai model with all the settings

In [None]:
# Import the stored MOJO of the selected model; the file name is built from
# the model folder, the model name and the suffix "_model_stored.zip".
saved_mojo_model = h2o.import_mojo(var_path_model + var_model_name_full + "_model_stored.zip")

In [None]:
# display the imported model
saved_mojo_model

In [None]:
# Extract the variable importance table from the model as a pandas DataFrame (to store later).
# NOTE(review): this reads the private attribute `_model_json`; `varimp(use_pandas=True)`
# looks like the public equivalent - confirm it works for models imported from a MOJO.
feature_imp = saved_mojo_model._model_json['output']['variable_importances'].as_data_frame()

In [None]:
# show the first 25 rows of the variable importance table
feature_imp.head(25)

In [None]:
# Fetch the gains/lift table of the model and convert it to a pandas DataFrame.
# Note: despite its name, `avg_response_rate_df` holds the complete converted table.
gains_lift_table = saved_mojo_model.gains_lift()
avg_response_rate_df = gains_lift_table.as_data_frame()
# show the first 25 rows
avg_response_rate_df.head(25)

In [None]:
# Store the gains/lift table next to the model: once as gzip-compressed Parquet,
# once as Excel without the index column.
gains_lift_file_stem = var_path_model + var_model_name_full
avg_response_rate_df.to_parquet(gains_lift_file_stem + "_gains_lift_table.parquet", compression='gzip')
avg_response_rate_df.to_excel(gains_lift_file_stem + "_gains_lift_table.xlsx", index=False)

In [None]:
# Load the variable list that was stored together with the model.
variable_list_path = var_path_model + var_model_name_full + "_variable_list.json"
with open(variable_list_path, "r") as variable_list_file:
    loaded_dict = json.load(variable_list_file)

# categorical columns and feature columns as stored in the variable list
new_cat_cols = loaded_dict['cat_cols']
new_features = loaded_dict['features']

In [None]:
# read test.parquet from the data folder into a pandas DataFrame
data_to_apply = pq.read_table(var_path_data + "test.parquet").to_pandas()

In [None]:
# Convert a copy of the columns listed in `new_features` into an H2OFrame for scoring.
# NOTE(review): `new_cat_cols` is loaded from the variable list but not used in the cells
# visible here - confirm whether the categorical columns need a conversion before scoring.
df_apply = h2o.H2OFrame(data_to_apply[new_features].copy())

In [None]:
# SHAP summary plot of the model, computed on the frame that is going to be scored
shap_plot = saved_mojo_model.shap_summary_plot(df_apply)

In [None]:
# saved_mojo_model.explain(df_apply)

In [None]:
# exm = saved_mojo_model.explain(df_apply)

In [None]:
# Score the data loaded from test.parquet and fetch the predictions as a pandas DataFrame;
# these predictions can then be brought back to KNIME.
probabilities_df = saved_mojo_model.predict(df_apply).as_data_frame()

In [None]:
# show the first rows of the predictions
probabilities_df.head()

In [None]:
# Put the scored columns and the predictions side by side (column-wise concat).
# NOTE(review): assumes both frames come back with the same row order and index - confirm.
result = pd.concat([df_apply.as_data_frame(), probabilities_df], axis=1)

In [None]:
# show the first rows of the combined result
result.head()

In [None]:
# AUCPR (average precision) of the predicted probability p1 against the actual Target;
# average_precision_score is imported from sklearn.metrics at the top of the notebook.
aucpr = average_precision_score(
    y_true=result['Target'],
    y_score=result['p1'],
    average='weighted',
    pos_label=1,
)
print(f'Test AUCPR: {aucpr:.4f}')

In [None]:
# ROC AUC of the selected model: predicted probability p1 against the actual Target
auc_pred = roc_auc_score(
    y_true=result['Target'],
    y_score=result['p1'],
    average='weighted',
)
print(f'Test AUC: {auc_pred:.4f}')

In [None]:
# Precision-recall curve of the selected model on the scored data;
# the figure is shown and also saved as PNG next to the model.
# (matplotlib is imported here because the import cells at the top do not include it)
import matplotlib.pyplot as plt
from sklearn.metrics import auc, precision_recall_curve

y_true = result["Target"].astype(int).values
y_score = result["p1"].values

precision, recall, thresholds = precision_recall_curve(y_true, y_score)
# precision_recall_curve returns recall in DECREASING order, so a plain
# np.trapz(precision, recall) integrates right-to-left and yields a negative area.
# sklearn's auc() accepts monotonically decreasing x and returns the positive area.
auc_pr = auc(recall, precision)

fig, ax = plt.subplots(figsize=(10.24, 7.68))
ax.step(recall, precision, color='b', alpha=0.2, where='post')
ax.fill_between(recall, precision, step='post', alpha=0.2, color='b')
ax.set_xlabel('Recall - ' + var_model_name_full)
ax.set_ylabel('Precision')
ax.set_ylim([0.0, 1.05])
ax.set_xlim([0.0, 1.0])
ax.set_title('Precision-Recall curve: AUCPR={0:0.4f}'.format(auc_pr))
# tight_layout has to run after the artists and labels exist and the size is set,
# otherwise there is nothing for it to arrange
fig.tight_layout()
fig.savefig(var_path_model + var_model_name_full + "_aucpr_plot.png")
plt.show()

In [None]:
# ROC curve of the selected model on the scored data;
# the figure is shown and also saved as PNG next to the model.
# (matplotlib is imported here because the import cells at the top do not include it)
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

y_true = result["Target"].astype(int).values
y_score = result["p1"].values

fpr, tpr, thresholds = roc_curve(y_true, y_score)
auc_roc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10.24, 7.68))
ax.plot(fpr, tpr, color='b', lw=2, alpha=0.8)
# diagonal = performance of a random classifier, for reference
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', alpha=0.8)
ax.set_xlabel('False Positive Rate - ' + var_model_name_full)
ax.set_ylabel('True Positive Rate')
ax.set_ylim([0.0, 1.05])
ax.set_xlim([0.0, 1.0])
ax.set_title('Receiver Operating Characteristic curve: AUC={0:0.4f}'.format(auc_roc))
# tight_layout has to run after the artists and labels exist and the size is set,
# otherwise there is nothing for it to arrange
fig.tight_layout()
fig.savefig(var_path_model + var_model_name_full + "_auc_plot.png")
plt.show()