In [None]:
import h2o
from h2o.automl import H2OAutoML

# Connect to a local H2O cluster (starting one if none is running) with the
# JVM memory cap set to 12 GB, then report the client library version.
h2o_init_options = {'max_mem_size': '12G'}
h2o.init(**h2o_init_options)
print(h2o.__version__)

In [None]:
%%time

import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and convergence warnings from
# pandas / H2O / scikit-learn — narrow the filter if those matter.
warnings.simplefilter('ignore')

In [None]:
%%time

import os
import pandas as pd
from pipelines import preprocessing_pipeline

# Load the raw training CSV (first column used as the index), fit the project
# preprocessing pipeline on it, and upload the transformed frame to H2O.
train_data_file = os.path.join(*(['..'] * 4), 'data', 'raw', 'train.csv')
train_df = preprocessing_pipeline.fit_transform(
    pd.read_csv(train_data_file, index_col=0, low_memory=False)
)
train_h2o = h2o.H2OFrame(train_df)

# Preview the first rows of the uploaded frame as the cell output.
train_h2o.head()

In [None]:
%%time
# Load the raw test CSV (first column used as the index) and push it through
# the preprocessing pipeline with transform() only — the pipeline is not
# refitted here — then upload the result to H2O.
test_data_file = os.path.join(*(['..'] * 4), 'data', 'raw', 'test.csv')
test_df = preprocessing_pipeline.transform(
    pd.read_csv(test_data_file, index_col=0, low_memory=False)
)
test_h2o = h2o.H2OFrame(test_df)

# Preview the first rows of the uploaded frame as the cell output.
test_h2o.head()

In [None]:
%%time

# y is the response column; x is every other column of the training frame,
# in the frame's own column order.
y = 'bg+1:00'
x = [name for name in train_h2o.columns if not name == y]

# Echo the selected predictors and the target, one per line.
print(x, y, sep='\n')

In [None]:
%%time

# AutoML budget: the run stops at whichever limit is reached first, the model
# count or the wall-clock time (in seconds).
# NOTE(review): per the H2O AutoML docs, a time-based limit makes runs
# hardware-dependent, so the seed alone does not guarantee identical results.
automl_settings = {
    'max_models': 500,
    'max_runtime_secs': 3600,
    'seed': 1000,
}
aml = H2OAutoML(**automl_settings)

# Fit on the preprocessed training frame using the predictors/target chosen above.
aml.train(training_frame=train_h2o, y=y, x=x)

In [None]:
%%time

# Display the full AutoML leaderboard: ask head() for as many rows as the
# leaderboard contains instead of its default preview length.
lb = aml.leaderboard
leaderboard_row_count = lb.nrows
lb.head(rows=leaderboard_row_count)

In [None]:
%%time
# Score the preprocessed test frame (test_h2o) built earlier in the notebook.
# The model was trained on the output of preprocessing_pipeline, so the raw
# test.csv must not be scored directly: its columns do not match the
# training schema.
test_data = test_h2o  # alias kept so code referring to test_data still works

# H2O frame of predictions from the AutoML leader, one row per test row.
y_pred = aml.predict(test_data=test_data)

In [None]:
import pandas as pd

# Use the sample submission as a template (its first column becomes the index).
submission_template_path = os.path.join(
    '..', '..', '..', '..', 'data', 'raw', 'sample_submission.csv'
)
sample_submission = pd.read_csv(submission_template_path, index_col=0)

# Overwrite the target column with the values of y_pred, the H2O prediction
# frame produced by the prediction cell, converted to pandas first.
predicted_values = y_pred.as_data_frame().values
sample_submission['bg+1:00'] = predicted_values

# Write the file to the working directory, named after that directory.
submission_filename = f'submission-{os.path.basename(os.getcwd())}.csv'
sample_submission.to_csv(submission_filename)


In [None]:
from sklearn.metrics import root_mean_squared_error, r2_score, PredictionErrorDisplay
import matplotlib.pyplot as plt

# Evaluate the AutoML leader on the same frame it was trained on. These are
# in-sample predictions, so the scores are optimistic compared with held-out
# performance.
#
# The predictions get their own name (y_train_pred) so the test-set
# predictions in y_pred, which the submission cell reads, are not overwritten.
# Both arrays are flattened to 1-D, the (n_samples,) shape the scikit-learn
# metric and display APIs document.
y_train = train_h2o[y].as_data_frame().values.ravel()
y_train_pred = aml.leader.predict(train_h2o).as_data_frame().values.ravel()

print(f'RMSE: {root_mean_squared_error(y_true=y_train, y_pred=y_train_pred)}')
print(f'R2: {r2_score(y_true=y_train, y_pred=y_train_pred)}')

# One panel per diagnostic; both draw the same fixed-seed subsample of 100 points.
fig, axs = plt.subplots(ncols=2, figsize=(8, 4))
panels = [
    ("actual_vs_predicted", "Actual vs. Predicted values"),
    ("residual_vs_predicted", "Residuals vs. Predicted Values"),
]
for ax, (kind, title) in zip(axs, panels):
    PredictionErrorDisplay.from_predictions(
        y_true=y_train,
        y_pred=y_train_pred,
        kind=kind,
        subsample=100,
        ax=ax,
        random_state=0,
    )
    ax.set_title(title)

# Title reflects what is plotted: training-set predictions, not cross-validated ones.
fig.suptitle("Leader predictions on the training set (in-sample)")
plt.tight_layout()
plt.show()