# Regression with H2O AutoML

In [None]:
from sklearn.metrics import accuracy_score, mean_squared_error, confusion_matrix, plot_confusion_matrix, ConfusionMatrixDisplay, explained_variance_score, r2_score, mean_absolute_error
from h2o.automl import H2OAutoML
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import pickle
import h2o

# Directory prefix prepended to the CSV file names when the datasets are read.
file_path = 'data/'
# Directory prefix under which the trained model is saved.
save_path = 'models/regression/'
# Appended to `save_path` to form the location passed to h2o.save_model.
model_name = 'automl_h2o_regressor'

In [None]:
# Connect to an H2O server (per the H2O docs, h2o.init() attaches to a running
# local instance or starts a new one); the H2OFrame / AutoML calls below need it.
h2o.init()

In [None]:
# Columns holding stringified numeric arrays, listed in the order in which
# their values are interleaved inside each feature vector.
_FEATURE_COLUMNS = (
    'n_v', 'bins_v',
    'n_xv', 'bins_xv',
    'n_yv', 'bins_yv',
    'n_zv', 'bins_zv',
    'n_a_v', 'bins_a_v',
)


def _parse_float_list(text):
    """Parse a string such as ``'[0.1 0.2 0.3]'`` into a list of floats."""
    return [float(token) for token in text.strip('][ ').split()]


def get_x_y_data(df):
    """Build a flat feature matrix and a target vector from ``df``.

    Each of the ten columns in ``_FEATURE_COLUMNS`` is expected to contain a
    whitespace-separated list of numbers wrapped in square brackets
    (presumably histogram counts and bin edges -- confirm against the code
    that writes the CSV). For every row the ten lists are zipped together
    position by position, so the feature vector is laid out as::

        [n_v[0], bins_v[0], ..., bins_a_v[0], n_v[1], bins_v[1], ...]

    Zipping stops at the shortest of the ten lists, so surplus trailing
    values in the longer lists are dropped.

    Args:
        df: DataFrame with the ten feature columns and an ``'h2s'`` column.

    Returns:
        A tuple ``(X, y)``: ``X`` is a 2-D array of shape
        ``(n_rows, n_positions * 10)`` and ``y`` is a 1-D array with the
        ``'h2s'`` values. For a DataFrame without rows, ``X`` has shape
        ``(0, 0)`` and ``y`` is empty.

    Raises:
        ValueError: if a cell contains a non-numeric token, or if the rows
            do not all yield the same number of positions (the per-row
            blocks then cannot be stacked into one 3-D array).
    """
    X_data = []
    # Walk the ten columns in lockstep instead of using the slower iterrows().
    for cells in zip(*(df[name] for name in _FEATURE_COLUMNS)):
        parsed = [_parse_float_list(cell) for cell in cells]
        # One tuple of ten values per position; zip truncates to the shortest list.
        X_data.append(list(zip(*parsed)))

    y = np.array(df['h2s'].tolist())

    if not X_data:
        # No rows: there is no 3-D shape to unpack, so return empty arrays.
        return np.empty((0, 0)), y

    X = np.array(X_data)
    nsamples, nx, ny = X.shape
    return X.reshape(nsamples, nx * ny), y

# Load the training and test datasets

In [None]:
# Read the training and the test CSV (in that order) from `file_path`.
train_df, test_df = (
    pd.read_csv(file_path + csv_name)
    for csv_name in ('behaviour_data.csv', 'test_behaviour_data.csv')
)

In [None]:
# Training set: parse the rows, then convert both arrays to plain Python
# lists before handing them to H2O.
X_train, y_train = (arr.tolist() for arr in get_x_y_data(train_df))

X_train = h2o.H2OFrame(X_train)
y_train = h2o.H2OFrame(y_train, column_names=['h2s'])

# Put the features and the 'h2s' target column side by side in one frame.
train_data = X_train.cbind(y_train)


In [None]:
# Test set: same preparation as for the training data, but the feature and
# target frames are kept separate.
X_test, y_test = (arr.tolist() for arr in get_x_y_data(test_df))

X_test_model, y_test_model = (
    h2o.H2OFrame(X_test),
    h2o.H2OFrame(y_test, column_names=['h2s']),
)

# Train H2O AutoML models

In [None]:
# AutoML run limited via max_models=20, with a fixed seed.
aml = H2OAutoML(
    max_models=20,
    seed=1,
)
# 'h2s' is the response column of the combined training frame.
aml.train(training_frame=train_data, y='h2s')

In [None]:
# Bare expression: in a notebook this displays the leader model of the AutoML run.
aml.leader

# Test model

In [None]:
# Module-qualified metric calls keep this cell re-runnable: the name
# `r2_score` is rebound to the numeric score below (the plotting cell reads it),
# so calling the bare imported function a second time would fail.
from sklearn import metrics as sk_metrics

# Predict from the test *features*; the target frame (`y_test_model`) must not
# be used as model input.
preds = aml.predict(X_test_model)
predictions = h2o.as_list(preds)
y_test = h2o.as_list(y_test_model)

r2_score = sk_metrics.r2_score(y_test, predictions)
MAE = sk_metrics.mean_absolute_error(y_test, predictions)
# Root of the MSE; avoids relying on the `squared=False` flag.
RMSE = np.sqrt(sk_metrics.mean_squared_error(y_test, predictions))
RSME = RMSE  # previous (misspelled) name, kept in case other cells use it

print("R2 Score          : ", r2_score)
print("Mean_abs_error    : ", MAE)
print("RMSE              : ", RMSE)

print(y_test, predictions)

# Regression plot

In [None]:
# Scatter of true vs. predicted values with a fitted regression line.
fig = plt.figure(figsize=(9, 9))
sns.regplot(x=y_test, y=predictions)

# Raw strings: `\mu` is mathtext markup, not a Python escape. In a normal
# string literal `\m` is an invalid escape sequence and triggers a warning.
plt.xlabel(r'True $H_2S$ ($\mu g/L$)', fontsize=18)
plt.ylabel(r'Predicted $H_2S$ ($\mu g/L$)', fontsize=18)
# `r2_score` is expected to hold the numeric score computed in the test cell.
plt.title(
    'H2OAutoML Regression' + '\n$(R²=' + str(round(r2_score, 3)) + ')$',
    fontweight='bold',
    fontsize=22,
)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.grid()
plt.show()

# Saving model

In [None]:
# Saving model
# h2o.save_model returns the full path of the file it wrote. Keep it in `path`
# so that the later h2o.load_model(path) call has a defined location to read.
path = h2o.save_model(model=aml.leader, path=save_path + model_name)

# Load model

In [None]:
# Load model
# `path` is not assigned in this cell: it must already hold the location of a
# saved model file (h2o.save_model returns such a path).
loaded_model = h2o.load_model(path)

# Print a summary of the restored model.
loaded_model.show()