In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from itertools import product
import warnings

In [2]:
import sys, os

sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2

from modules.config import *
from modules.storage import get_results_df, get_availability_model_data
from modules.neural_network import execute_stage, get_first_stage_hyperparameters,get_second_stage_hyperparameters, get_third_stage_hyperparameters

# Neural Network Training
In this notebook we train neural networks to predict availability. We first find the best combination of hyperparameters and then fit models with these hyperparameters for all resolutions.
We use a multi-stage grid search, in which the best hyperparameters found in one stage are fixed and reused in the subsequent stages.
While this approach is not guaranteed to find the best hyperparameters, we expect it to be a good approximation, as a full-scale grid search is computationally intractable.  

To reduce training time, avoid overfitting, and ensure that the model is sufficiently trained, we use a high number of epochs and stop training when the validation loss does not decrease any further.
As we found that the validation loss fluctuates considerably from epoch to epoch, we use a patience of 50 epochs.
This means that early stopping is triggered once the validation loss has not decreased for 50 consecutive epochs.
In addition, we restore the weights from the last epoch in which the validation loss decreased.  

In [3]:
def get_model_data(h3_res, time_interval_length):
    """Fetch the availability model data for one resolution.

    Thin adapter around ``get_availability_model_data`` so that it can be
    handed to ``execute_stage`` as the data-loading callable.

    Parameters
    ----------
    h3_res
        H3 resolution, forwarded unchanged.
    time_interval_length
        Time interval length, forwarded unchanged.

    Returns
    -------
    tuple
        ``(train, test)`` - the two objects returned by
        ``get_availability_model_data``, in that order.
    """
    train_split, test_split = get_availability_model_data(h3_res, time_interval_length)
    return train_split, test_split


In the first stage, we find the best batch size by training simple single-layer neural networks, where the number of nodes is equal to the number of input features. The exact batch sizes that we tried can be found in the `neural_network` module.

In [4]:
# Stage 1: run the first-stage hyperparameter candidates at the tuning
# resolution (TUNE_H3_RESOLUTION / TUNE_TIME_INTERVAL_LENGTH).
# NOTE(review): the "# already trained" lines in the cell output suggest that
# configurations already present in the results are skipped - confirm in
# modules.neural_network.
execute_stage(
    get_model_data,
    NN_FIRST_STAGE_AVAILABILITY_RESULTS_PATH,
    get_first_stage_hyperparameters,
    TUNE_H3_RESOLUTION,
    TUNE_TIME_INTERVAL_LENGTH,
)

  0%|          | 0/5 [00:00<?, ?it/s]

[13:26:37] batch_size: 512 - nodes_per_feature: 1 - n_layers: 1 - activation: relu - dropout: -1 # already trained
[13:26:37] batch_size: 256 - nodes_per_feature: 1 - n_layers: 1 - activation: relu - dropout: -1 # already trained
[13:26:37] batch_size: 128 - nodes_per_feature: 1 - n_layers: 1 - activation: relu - dropout: -1 # already trained
[13:26:37] batch_size: 64 - nodes_per_feature: 1 - n_layers: 1 - activation: relu - dropout: -1 # already trained
[13:26:37] batch_size: 32 - nodes_per_feature: 1 - n_layers: 1 - activation: relu - dropout: -1 # already trained


In [5]:
results = get_results_df(NN_FIRST_STAGE_AVAILABILITY_RESULTS_PATH)

# Restrict to the runs of the tuning resolution, rank them by validation MSE
# (lowest first) and read the batch size of the top-ranked run.
best_batch_size = (
    results.loc[
        (results["h3_res"] == TUNE_H3_RESOLUTION)
        & (results["time_interval_length"] == TUNE_TIME_INTERVAL_LENGTH)
    ]
    .sort_values("val_mse")["batch_size"]
    .iloc[0]
)

# Range of batch sizes covered by the first-stage grid, reported for context.
first_stage_hyperparameters = get_first_stage_hyperparameters()
batch_sizes = [candidate["batch_size"] for candidate in first_stage_hyperparameters]
min_batch_size, max_batch_size = min(batch_sizes), max(batch_sizes)

print(f"best batch_size: **{best_batch_size}** - min: {min_batch_size} - max: {max_batch_size}")


best batch_size: **64** - min: 32 - max: 512


We find that the best batch size when predicting availability is **64**.

In the second stage, we find the best architecture of the model by varying the number of layers, the number of nodes per layer, and the hidden activation function, for which we compare two common choices: the rectified linear unit (ReLU) and the hyperbolic tangent (tanh).

In [6]:
def get_hyperparameters():
    """Return the second-stage hyperparameter candidates.

    ``best_batch_size`` is looked up when the function is called, so the
    candidates always use the batch size currently selected in the notebook.
    """
    return get_second_stage_hyperparameters(best_batch_size)


# Stage 2: run the architecture candidates at the tuning resolution.
execute_stage(
    get_model_data,
    NN_SECOND_STAGE_AVAILABILITY_RESULTS_PATH,
    get_hyperparameters,
    TUNE_H3_RESOLUTION,
    TUNE_TIME_INTERVAL_LENGTH,
)

  0%|          | 0/18 [00:00<?, ?it/s]

[13:26:38] batch_size: 64 - nodes_per_feature: 0.5 - n_layers: 1 - activation: relu - dropout: -1 # already trained
[13:26:38] batch_size: 64 - nodes_per_feature: 0.5 - n_layers: 1 - activation: tanh - dropout: -1 # already trained
[13:26:38] batch_size: 64 - nodes_per_feature: 0.5 - n_layers: 2 - activation: relu - dropout: -1 # already trained
[13:26:38] batch_size: 64 - nodes_per_feature: 0.5 - n_layers: 2 - activation: tanh - dropout: -1 # already trained
[13:26:38] batch_size: 64 - nodes_per_feature: 0.5 - n_layers: 3 - activation: relu - dropout: -1 # already trained
[13:26:38] batch_size: 64 - nodes_per_feature: 0.5 - n_layers: 3 - activation: tanh - dropout: -1 # already trained
[13:26:38] batch_size: 64 - nodes_per_feature: 1 - n_layers: 1 - activation: relu - dropout: -1 # already trained
[13:26:38] batch_size: 64 - nodes_per_feature: 1 - n_layers: 1 - activation: tanh - dropout: -1 # already trained
[13:26:38] batch_size: 64 - nodes_per_feature: 1 - n_layers: 2 - activation:

In [7]:
results = get_results_df(NN_SECOND_STAGE_AVAILABILITY_RESULTS_PATH)

# Row of the second-stage run with the lowest validation MSE at the tuning
# resolution.
best_model = (
    results.loc[
        (results["h3_res"] == TUNE_H3_RESOLUTION)
        & (results["time_interval_length"] == TUNE_TIME_INTERVAL_LENGTH)
    ]
    .sort_values("val_mse")
    .iloc[0]
)

# Collect the hyperparameters of that run into a plain dict.
best_config = {
    name: best_model[name]
    for name in ("batch_size", "nodes_per_feature", "n_layers", "activation")
}

best_config


{'batch_size': 64,
 'nodes_per_feature': 1.5,
 'n_layers': 3,
 'activation': 'tanh'}

In the third stage, we improve the generalizability by adding a dropout layer after every hidden layer and varying the dropout rate between 0 (no dropout) and 0.5 (each node is dropped with a probability of 0.5 during training).

In [8]:
def get_hyperparameters():
    """Return the third-stage hyperparameter candidates.

    The batch size, nodes per feature, number of layers and activation are
    read from ``best_model`` when the function is called and forwarded to
    ``get_third_stage_hyperparameters``.
    """
    return get_third_stage_hyperparameters(
        best_batch_size=best_model["batch_size"],
        best_nodes_per_feature=best_model["nodes_per_feature"],
        best_n_layers=best_model["n_layers"],
        best_activation=best_model["activation"],
    )


# Stage 3: run the dropout candidates at the tuning resolution.
execute_stage(
    get_model_data,
    NN_THIRD_STAGE_AVAILABILITY_RESULTS_PATH,
    get_hyperparameters,
    TUNE_H3_RESOLUTION,
    TUNE_TIME_INTERVAL_LENGTH,
)


  0%|          | 0/5 [00:00<?, ?it/s]

[13:26:38] batch_size: 64 - nodes_per_feature: 1.5 - n_layers: 3 - activation: tanh - dropout: 0 # already trained
[13:26:38] batch_size: 64 - nodes_per_feature: 1.5 - n_layers: 3 - activation: tanh - dropout: 0.05 # already trained
[13:26:38] batch_size: 64 - nodes_per_feature: 1.5 - n_layers: 3 - activation: tanh - dropout: 0.1 # already trained
[13:26:38] batch_size: 64 - nodes_per_feature: 1.5 - n_layers: 3 - activation: tanh - dropout: 0.2 # already trained
[13:26:38] batch_size: 64 - nodes_per_feature: 1.5 - n_layers: 3 - activation: tanh - dropout: 0.5 # already trained


In [9]:
results = get_results_df(NN_THIRD_STAGE_AVAILABILITY_RESULTS_PATH)

# Dropout rate of the third-stage run with the lowest validation MSE at the
# tuning resolution.
best_dropout = (
    results.loc[
        (results["h3_res"] == TUNE_H3_RESOLUTION)
        & (results["time_interval_length"] == TUNE_TIME_INTERVAL_LENGTH)
    ]
    .sort_values("val_mse")["dropout"]
    .iloc[0]
)

# Extend the configuration found so far with the selected dropout rate.
best_config = dict(best_config, dropout=best_dropout)
best_config
		

{'batch_size': 64,
 'nodes_per_feature': 1.5,
 'n_layers': 3,
 'activation': 'tanh',
 'dropout': 0.1}

Finally, we will use these best hyperparameters to train one model for each H3 and time resolution.

In [10]:
# Run the tuned configuration once per (H3 resolution, time interval length)
# pair: the full cross product plus the explicitly listed additional pairs.
for h3_res, time_interval_length in tqdm(
    [
        *product(PREDICTIVE_H3_RESOLUTIONS, CALC_TIME_INTERVAL_LENGTHS),
        *ADDITIONAL_PREDICTIVE_RESOLUTIONS,
    ]
):
    # Status line ends with a carriage return - presumably so that the "done"
    # message written below replaces it in the output.
    tqdm.write(f"h3_res: {h3_res}, time_interval_length: {time_interval_length}", end="\r")
    execute_stage(
        get_model_data,
        NN_FOURTH_STAGE_AVAILABILITY_RESULTS_PATH,
        lambda: [best_config],  # single candidate: the configuration selected above
        h3_res,
        time_interval_length,
        test_phase=True,
        silent=True,
    )
    tqdm.write(f"h3_res: {h3_res}, time_interval_length: {time_interval_length} done")


  0%|          | 0/9 [00:00<?, ?it/s]

h3_res: 7, time_interval_length: 1 done
h3_res: 7, time_interval_length: 2 done
h3_res: 7, time_interval_length: 6 done
h3_res: 7, time_interval_length: 24 done
h3_res: 8, time_interval_length: 1 done
h3_res: 8, time_interval_length: 2 done
h3_res: 8, time_interval_length: 6 done
h3_res: 8, time_interval_length: 24 done
h3_res: 9, time_interval_length: 24 done


In [11]:
# Load the stored results of the final (fourth-stage) runs and show them as
# the cell output.
results = get_results_df(NN_FOURTH_STAGE_AVAILABILITY_RESULTS_PATH)
results

Unnamed: 0,h3_res,time_interval_length,batch_size,nodes_per_feature,n_layers,activation,dropout,train_duration,test_mse,test_rmse,test_mae,test_non_zero_mape,test_zero_accuracy
0,7,1,64,1.5,3,tanh,0.1,855.901519,45.103438,6.715909,2.734877,0.450544,0.855141
1,7,2,64,1.5,3,tanh,0.1,560.716685,47.166135,6.867761,2.816211,0.44267,0.853768
2,7,6,64,1.5,3,tanh,0.1,236.145673,53.590551,7.320557,3.066465,0.427757,0.846844
3,7,24,64,1.5,3,tanh,0.1,94.846507,93.34804,9.661679,3.883434,0.341454,0.849619
4,8,1,64,1.5,3,tanh,0.1,3703.94564,5.674175,2.382053,0.983003,0.59628,0.865919
5,8,2,64,1.5,3,tanh,0.1,1830.123308,5.697243,2.38689,0.961181,0.589726,0.862052
6,8,6,64,1.5,3,tanh,0.1,641.114363,6.773018,2.602502,1.063654,0.559425,0.853029
7,8,24,64,1.5,3,tanh,0.1,230.024005,9.372776,3.061499,1.280754,0.477692,0.889425
8,9,24,64,1.5,3,tanh,0.1,498.783041,1.81226,1.346202,0.646167,0.566795,0.822044
