In [1]:
import pandas as pd
from tqdm.notebook import tqdm
from itertools import product

In [2]:
import sys, os

sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2

from modules.config import *
from modules.storage import get_results_df, get_demand_model_data, get_demand_orig_dest_model_data
from modules.neural_network import execute_stage, get_first_stage_hyperparameters,get_second_stage_hyperparameters, get_third_stage_hyperparameters

# Neural Network Training
In this notebook we train neural networks to predict demand. We first find the best set of hyperparameters and then fit models with these hyperparameters for all resolutions.
We use a multi-staged grid search, where we determine a set of best hyperparameters in one stage and then use it in the consecutive stages.
While this approach is not guaranteed to find the best hyperparameters, we expect it to be a good approximation, as a full-scale grid search is computationally intractable.  

To reduce training time, avoid overfitting and ensure that the model is sufficiently trained we use a high number of epochs and stop the model when the validation loss does not decrease any further.
As we found that the validation loss changes in an unstable manner, we use a patience of 50 epochs.
This means that early stopping is triggered once the validation loss has not decreased for 50 consecutive epochs.
In addition, we restore the weights from the last epoch in which the validation loss decreased.  

In [3]:
def get_model_data(h3_res, time_interval_length):
    """Load the demand model data for one spatio-temporal resolution.

    Thin adapter around ``get_demand_model_data`` so that the loader can be
    handed to ``execute_stage`` as a callable.

    Args:
        h3_res: H3 resolution, forwarded unchanged.
        time_interval_length: Time interval length, forwarded unchanged.

    Returns:
        A 2-tuple ``(train, test)`` holding the two values returned by
        ``get_demand_model_data``, in the same order.
    """
    train_split, test_split = get_demand_model_data(h3_res, time_interval_length)
    return train_split, test_split


In the first stage, we find the best batch size by training simple single-layer neural networks in which the number of nodes is equal to the number of input features. The exact batch sizes that we tried can be found in the neural network module (`modules.neural_network`).

In [4]:
# Stage 1: evaluate the first-stage hyperparameter grid at the tuning resolution.
# The results are read back from NN_FIRST_STAGE_DEMAND_RESULTS_PATH in the next cell.
# NOTE(review): the recorded output marks every configuration as "already trained",
# so execute_stage presumably skips configurations that already have a stored
# result — confirm in modules.neural_network.
execute_stage(
    get_model_data,
    NN_FIRST_STAGE_DEMAND_RESULTS_PATH,
    get_first_stage_hyperparameters,
    TUNE_H3_RESOLUTION,
    TUNE_TIME_INTERVAL_LENGTH,
)

  0%|          | 0/5 [00:00<?, ?it/s]

[19:21:57] batch_size: 512 - nodes_per_feature: 1 - n_layers: 1 - activation: relu - dropout: -1 # already trained
[19:21:57] batch_size: 256 - nodes_per_feature: 1 - n_layers: 1 - activation: relu - dropout: -1 # already trained
[19:21:57] batch_size: 128 - nodes_per_feature: 1 - n_layers: 1 - activation: relu - dropout: -1 # already trained
[19:21:57] batch_size: 64 - nodes_per_feature: 1 - n_layers: 1 - activation: relu - dropout: -1 # already trained
[19:21:57] batch_size: 32 - nodes_per_feature: 1 - n_layers: 1 - activation: relu - dropout: -1 # already trained


In [5]:
# Reload everything that was recorded for the first stage.
results = get_results_df(NN_FIRST_STAGE_DEMAND_RESULTS_PATH)

# Keep only the runs at the tuning resolution, rank them by validation MSE
# (lowest first) and take the batch size of the top-ranked run.
best_batch_size = (
    results.loc[
        lambda runs: (runs["h3_res"] == TUNE_H3_RESOLUTION)
        & (runs["time_interval_length"] == TUNE_TIME_INTERVAL_LENGTH)
    ]
    .sort_values("val_mse")["batch_size"]
    .iloc[0]
)

# Report the range of batch sizes that was searched, so it is visible whether
# the winner lies on the edge of the grid.
first_stage_hyperparameters = get_first_stage_hyperparameters()
batch_sizes = [params["batch_size"] for params in first_stage_hyperparameters]
max_batch_size, min_batch_size = max(batch_sizes), min(batch_sizes)

print(f"best batch_size: **{best_batch_size}** - min: {min_batch_size} - max: {max_batch_size}")


best batch_size: **256** - min: 32 - max: 512


We find that the best batch size when predicting demand is **256**.

In the second stage, we find the best architecture of the model by varying the number of layers, the number of nodes and the hidden activation function, for which we try two common choices, namely the rectified linear unit (ReLU) and the hyperbolic tangent (tanh).

In [6]:
def get_hyperparameters():
    """Return the second-stage hyperparameter grid for the best batch size.

    ``best_batch_size`` is looked up when this function is called, so the cell
    that selects the batch size has to be run before this one.
    """
    return get_second_stage_hyperparameters(best_batch_size)


# Stage 2: evaluate the second-stage grid at the tuning resolution with the
# batch size fixed to the first-stage winner.
execute_stage(
    get_model_data,
    NN_SECOND_STAGE_DEMAND_RESULTS_PATH,
    get_hyperparameters,
    TUNE_H3_RESOLUTION,
    TUNE_TIME_INTERVAL_LENGTH,
)

  0%|          | 0/18 [00:00<?, ?it/s]

[19:21:58] batch_size: 256 - nodes_per_feature: 0.5 - n_layers: 1 - activation: relu - dropout: -1 # already trained
[19:21:58] batch_size: 256 - nodes_per_feature: 0.5 - n_layers: 1 - activation: tanh - dropout: -1 # already trained
[19:21:58] batch_size: 256 - nodes_per_feature: 0.5 - n_layers: 2 - activation: relu - dropout: -1 # already trained
[19:21:58] batch_size: 256 - nodes_per_feature: 0.5 - n_layers: 2 - activation: tanh - dropout: -1 # already trained
[19:21:58] batch_size: 256 - nodes_per_feature: 0.5 - n_layers: 3 - activation: relu - dropout: -1 # already trained
[19:21:58] batch_size: 256 - nodes_per_feature: 0.5 - n_layers: 3 - activation: tanh - dropout: -1 # already trained
[19:21:58] batch_size: 256 - nodes_per_feature: 1 - n_layers: 1 - activation: relu - dropout: -1 # already trained
[19:21:58] batch_size: 256 - nodes_per_feature: 1 - n_layers: 1 - activation: tanh - dropout: -1 # already trained
[19:21:58] batch_size: 256 - nodes_per_feature: 1 - n_layers: 2 - ac

In [7]:
results = get_results_df(NN_SECOND_STAGE_DEMAND_RESULTS_PATH)

# Row of the second-stage run with the lowest validation MSE at the tuning
# resolution.
best_model = (
    results.loc[
        lambda runs: (runs["h3_res"] == TUNE_H3_RESOLUTION)
        & (runs["time_interval_length"] == TUNE_TIME_INTERVAL_LENGTH)
    ]
    .sort_values("val_mse")
    .iloc[0]
)

# Keep only the hyperparameters that have been tuned so far.
best_config = {
    name: best_model[name]
    for name in ("batch_size", "nodes_per_feature", "n_layers", "activation")
}

best_config


{'batch_size': 256,
 'nodes_per_feature': 1.0,
 'n_layers': 2,
 'activation': 'relu'}

In the third stage, we improve generalizability by adding a dropout layer after every hidden layer and varying the dropout rate between 0 (no dropout) and 0.5 (each node is dropped with a probability of 50%).

In [8]:
def get_hyperparameters():
    """Return the third-stage hyperparameter grid for the best architecture.

    The values are read from ``best_model`` when this function is called, so
    the cell that selects the best second-stage run has to be run first.
    """
    return get_third_stage_hyperparameters(
        best_batch_size=best_model["batch_size"],
        best_nodes_per_feature=best_model["nodes_per_feature"],
        best_n_layers=best_model["n_layers"],
        best_activation=best_model["activation"],
    )


# Stage 3: evaluate the third-stage grid at the tuning resolution with batch
# size, width, depth and activation fixed to the second-stage winner.
execute_stage(
    get_model_data,
    NN_THIRD_STAGE_DEMAND_RESULTS_PATH,
    get_hyperparameters,
    TUNE_H3_RESOLUTION,
    TUNE_TIME_INTERVAL_LENGTH,
)


  0%|          | 0/5 [00:00<?, ?it/s]

[19:21:58] batch_size: 256 - nodes_per_feature: 1.0 - n_layers: 2 - activation: relu - dropout: 0 # already trained
[19:21:58] batch_size: 256 - nodes_per_feature: 1.0 - n_layers: 2 - activation: relu - dropout: 0.05 # already trained
[19:21:58] batch_size: 256 - nodes_per_feature: 1.0 - n_layers: 2 - activation: relu - dropout: 0.1 # already trained
[19:21:58] batch_size: 256 - nodes_per_feature: 1.0 - n_layers: 2 - activation: relu - dropout: 0.2 # already trained
[19:21:58] batch_size: 256 - nodes_per_feature: 1.0 - n_layers: 2 - activation: relu - dropout: 0.5 # already trained


In [9]:
results = get_results_df(NN_THIRD_STAGE_DEMAND_RESULTS_PATH)

# Dropout rate of the third-stage run with the lowest validation MSE at the
# tuning resolution.
# NOTE(review): in the recorded run the selected rate (0.5) is the largest
# value that was tried — consider extending the grid upwards.
best_dropout = (
    results.loc[
        lambda runs: (runs["h3_res"] == TUNE_H3_RESOLUTION)
        & (runs["time_interval_length"] == TUNE_TIME_INTERVAL_LENGTH)
    ]
    .sort_values("val_mse")["dropout"]
    .iloc[0]
)

# Extend the previously selected configuration with the dropout rate.
best_config = dict(best_config, dropout=best_dropout)
best_config
		

{'batch_size': 256,
 'nodes_per_feature': 1.0,
 'n_layers': 2,
 'activation': 'relu',
 'dropout': 0.5}

Finally, we will use these best hyperparameters to train one model for each H3 and time resolution.

In [12]:
# Every (h3_res, time_interval_length) combination to train: the full cross
# product plus the extra combinations from ADDITIONAL_PREDICTIVE_RESOLUTIONS.
resolution_pairs = list(product(PREDICTIVE_H3_RESOLUTIONS, CALC_TIME_INTERVAL_LENGTHS))
resolution_pairs = resolution_pairs + ADDITIONAL_PREDICTIVE_RESOLUTIONS

for h3_res, time_interval_length in tqdm(resolution_pairs):
    # The status line ends with "\r" so that the "done" line written after the
    # run can take its place.
    tqdm.write(f"executing h3_res: {h3_res}, time_interval_length: {time_interval_length}", end="\r")
    # Train exactly one model per resolution, using the tuned configuration.
    execute_stage(
        get_model_data,
        NN_FOURTH_STAGE_DEMAND_RESULTS_PATH,
        lambda: [best_config],
        h3_res,
        time_interval_length,
        silent=True,
        test_phase=True,
    )
    tqdm.write(f"executing h3_res: {h3_res}, time_interval_length: {time_interval_length} done")

  0%|          | 0/9 [00:00<?, ?it/s]

executing h3_res: 7, time_interval_length: 1 done
executing h3_res: 7, time_interval_length: 2 done
executing h3_res: 7, time_interval_length: 6 done
executing h3_res: 7, time_interval_length: 24 done
executing h3_res: 8, time_interval_length: 1 done
executing h3_res: 8, time_interval_length: 2 done
executing h3_res: 8, time_interval_length: 6 done
executing h3_res: 8, time_interval_length: 24 done
executing h3_res: 9, time_interval_length: 24 done


In addition, we train a model that predicts demand for origin-destination pairs instead of origins only. As the dimensionality of the data increases drastically when using origin-destination pairs, we only use a low H3 resolution (7) and a large time interval (24h).

In [13]:
# Origin-destination model: a single run with the tuned configuration, stored
# in the same results file as the per-resolution models.
execute_stage(
    # The origin-destination loader takes no arguments, so the two resolution
    # arguments passed to the data callback are ignored.
    lambda _h3_res, _time_interval_length: get_demand_orig_dest_model_data(),
    NN_FOURTH_STAGE_DEMAND_RESULTS_PATH,
    lambda: [best_config],
    # The resolution is written twice and parsed as one integer, which gives
    # this run its own h3_res value (77 in the recorded results).
    int(f"{ORIGIN_DESTINATION_H3_RESOLUTION}{ORIGIN_DESTINATION_H3_RESOLUTION}"),
    ORIGIN_DESTINATION_TIME_INTERVAL_LENGTH,
    silent=True,
    test_phase=True,
)

2022-07-07 19:22:04.598087: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-07 19:22:04.601683: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-07 19:22:04.601855: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-07 19:22:04.602365: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropri

In [19]:
# Load and display the fourth-stage results of all trained models.
# The row with h3_res 77 appears to be the origin-destination model
# (its resolution is stored as the H3 resolution written twice).
# NOTE(review): in the recorded output the run with h3_res=8 and
# time_interval_length=1 reports a test_non_zero_mape of exactly 1.0 — worth
# verifying.
results = get_results_df(NN_FOURTH_STAGE_DEMAND_RESULTS_PATH)
results

Unnamed: 0,h3_res,time_interval_length,batch_size,nodes_per_feature,n_layers,activation,dropout,train_duration,test_mse,test_rmse,test_mae,test_non_zero_mape,test_zero_accuracy
0,7,1,256,1.0,2,relu,0.5,186.338808,2.112337,1.453388,0.354515,0.415404,0.952907
1,7,2,256,1.0,2,relu,0.5,118.628595,17.64597,4.200711,1.260489,0.358513,0.917816
2,7,6,256,1.0,2,relu,0.5,84.846213,121.57857,11.026267,3.922868,0.269336,0.239784
3,7,24,256,1.0,2,relu,0.5,51.822149,385.978276,19.64633,8.396101,0.180705,0.426461
4,8,1,256,1.0,2,relu,0.5,827.665968,0.907417,0.952584,0.161567,1.0,0.939471
5,8,2,256,1.0,2,relu,0.5,439.822626,2.682283,1.637768,0.471627,0.519434,0.911342
6,8,6,256,1.0,2,relu,0.5,203.167135,14.918592,3.862459,1.138881,0.415263,0.922531
7,8,24,256,1.0,2,relu,0.5,58.10199,44.829146,6.695457,2.68533,0.267945,0.34455
8,9,24,256,1.0,2,relu,0.5,139.595377,8.76821,2.961116,1.323776,0.407589,0.888043
9,77,24,256,1.0,2,relu,0.5,492.197467,7.997945,2.828064,0.346599,0.496868,0.958253
