In [19]:
import pandas as pd
from tqdm.notebook import tqdm
from itertools import product

# for neural networks
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping

# for evaluation & preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
)
import sys, os
sys.path.append(os.path.abspath(os.path.join("..")))

In [20]:
import sys, os

sys.path.append(os.path.abspath('..'))
%load_ext autoreload
%autoreload 2
from modules.config import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
def get_nn_results_columns():
    """Return the ordered column names of the neural-network results table.

    The list starts with the two data-set identifiers, continues with the
    five model hyperparameters and ends with the four error metrics for the
    validation split followed by the same four metrics for the test split.
    """
    identifier_columns = ["h3_res", "time_interval_length"]
    hyperparameter_columns = ["batch_size", "n_nodes", "n_layers", "activation", "dropout"]
    metric_columns = [
        f"{split}_{metric}"
        for split in ("val", "test")
        for metric in ("mse", "mae", "mape", "rmse")
    ]
    return identifier_columns + hyperparameter_columns + metric_columns

def get_results_df(path):
    """Load the results table stored at ``path``, creating it when missing.

    If no file exists at ``path``, an empty table with the columns from
    ``get_nn_results_columns()`` is written there as parquet and returned.
    Otherwise the parquet file at ``path`` is read and returned.
    """
    if not os.path.isfile(path):
        empty_results = pd.DataFrame(columns=get_nn_results_columns())
        empty_results.to_parquet(path)
        return empty_results

    return pd.read_parquet(path)


def store_results(new_results, path):
    """Append ``new_results`` to the parquet results table stored at ``path``.

    The stored table is read, the new rows are concatenated below it with a
    fresh running index, and the combined table is written back to ``path``.
    The file at ``path`` must already exist.
    """
    pd.concat(
        [pd.read_parquet(path), new_results],
        ignore_index=True,
    ).to_parquet(path)

In [22]:
def get_model_data(h3_res, time_interval_length):
    """Load the model data for one H3 resolution and time interval length.

    The data is read from the feather file named
    ``<h3_res>_<time_interval_length>.feather`` inside ``MODEL_DATA_DIR_PATH``.
    """
    file_name = f"{h3_res}_{time_interval_length}.feather"
    return pd.read_feather(os.path.join(MODEL_DATA_DIR_PATH, file_name))

In [23]:
def split_and_scale_data(model_data):
    """Split ``model_data`` into train/validation/test sets and standardise the features.

    The ``demand`` column is the target; every other column is a feature.
    30 % of the rows are held out as the test set, and the remaining 70 % are
    split 70/30 again into training and validation sets (both splits use
    ``random_state=42``). The scaler is fitted on the training features only
    and then applied to all three feature sets.

    Returns
    -------
    tuple
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)`` where the
        ``X_*`` entries are the scaled feature arrays and the ``y_*`` entries
        are the corresponding slices of the ``demand`` column.
    """
    target = model_data["demand"]
    features = model_data.drop(columns=["demand"])

    # First split: training pool vs. test set.
    features_pool, features_test, target_pool, target_test = train_test_split(
        features, target, train_size=0.7, random_state=42
    )
    # Second split: the pool becomes the final training and validation sets.
    features_train, features_valid, target_train, target_valid = train_test_split(
        features_pool, target_pool, train_size=0.7, random_state=42
    )

    # Fit on the training features only so no validation/test statistics leak in.
    scaler = StandardScaler().fit(features_train)
    scaled_train, scaled_valid, scaled_test = (
        scaler.transform(part)
        for part in (features_train, features_valid, features_test)
    )
    return scaled_train, scaled_valid, scaled_test, target_train, target_valid, target_test

In [34]:
def train_model(X_train, y_train, batch_size, n_nodes, n_layers, activation, dropout):
    """Build, compile and fit a fully connected regression network.

    Architecture: one Dense layer of ``n_nodes`` units on the inputs, then
    ``n_layers`` further Dense layers of ``n_nodes`` units (so the network
    has ``n_layers + 1`` Dense layers before the output), and a single-unit
    output layer with ReLU activation. When ``dropout`` is >= 0 a Dropout
    layer with that rate follows each of the ``n_layers`` layers; a negative
    value disables dropout.

    The model is compiled with the Adam optimizer, mean-squared-error loss
    and the MAE metric. It is trained for at most 10 epochs, with 25 % of the
    training data held out through ``validation_split`` and early stopping
    (patience 5, min_delta 0.001).

    Returns the fitted model.
    """
    n_features = X_train.shape[1]

    layers = [Dense(n_nodes, activation=activation, input_shape=(n_features,))]
    for _ in range(n_layers):
        layers.append(Dense(n_nodes, activation=activation))
        if dropout >= 0:
            layers.append(Dropout(dropout))
    layers.append(Dense(1, activation="relu"))

    network = Sequential(layers)
    network.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

    stop_when_stalled = EarlyStopping(patience=5, min_delta=0.001)
    network.fit(
        X_train,
        y_train,
        epochs=10,
        batch_size=batch_size,
        validation_split=0.25,
        callbacks=[stop_when_stalled],
    )
    return network

In [25]:
def mean_average_percentage_error(y_true, y_pred):
    """Return the mean absolute error divided by the mean of ``y_true``.

    Note that this is the MAE normalised by the average true value, not the
    mean of the per-sample percentage errors. ``y_true`` must provide a
    ``.mean()`` method.
    """
    absolute_error = mean_absolute_error(y_true, y_pred)
    average_true_value = y_true.mean()
    return absolute_error / average_true_value


def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared error of the predictions."""
    squared_error = mean_squared_error(y_true, y_pred)
    return squared_error ** 0.5

def get_evaluation_metrics(y_true, y_pred, prefix):
    """Compute the four error metrics and key them as ``<prefix>_<metric>``.

    The returned dictionary holds, in this order, the entries
    ``<prefix>_mse``, ``<prefix>_mae``, ``<prefix>_mape`` and
    ``<prefix>_rmse``.
    """
    metric_functions = (
        ('_mse', mean_squared_error),
        ('_mae', mean_absolute_error),
        ('_mape', mean_average_percentage_error),
        ('_rmse', root_mean_squared_error),
    )
    return {
        prefix + suffix: metric(y_true, y_pred)
        for suffix, metric in metric_functions
    }

In [36]:
def get_model_meta_as_dict(model_meta):
    """Convert a positional hyperparameter sequence into a keyword dictionary.

    ``model_meta`` is indexed at positions 0-4, which are interpreted as
    batch size, node count, layer count, activation and dropout, in that
    order. Any further entries are ignored.
    """
    field_names = ('batch_size', 'n_nodes', 'n_layers', 'activation', 'dropout')
    return {name: model_meta[position] for position, name in enumerate(field_names)}


def get_first_stage_hyperparameters(n_features):
    """Return the first-stage search grid, which varies only the batch size.

    Every configuration uses ``n_features`` nodes, one layer, ReLU activation
    and the dropout sentinel -1; the batch sizes tried are 128, 256 and 512.
    """
    return [
        get_model_meta_as_dict((batch_size, n_features, 1, 'relu', -1))
        for batch_size in (128, 256, 512)
    ]


def get_second_stage_hyperparameters(n_features, best_batch_size):
    """Return the second-stage search grid over network shape and activation.

    The batch size is fixed to ``best_batch_size`` and the dropout sentinel
    stays at -1. The grid is the Cartesian product of three node counts
    (half, equal to, and one and a half times ``n_features``), three layer
    counts (1-3) and three activations.
    """
    search_space = {
        'batch_size': [best_batch_size],
        'n_nodes': [round(n_features * 0.5), n_features, round(n_features * 1.5)],
        'n_layers': [1, 2, 3],
        'activation': ['relu', 'sigmoid', 'tanh'],
        'dropout': [-1],
    }
    combinations = product(*search_space.values())
    return list(map(get_model_meta_as_dict, combinations))


def get_third_stage_hyperparameters(best_batch_size, best_n_nodes, best_n_layers, best_activation):
    """Return the third-stage search grid, which varies only the dropout rate.

    Batch size, node count, layer count and activation are fixed to the given
    values; the dropout rates tried are 0, 0.05, 0.1 and 0.2. Unlike the
    first- and second-stage helpers, this function takes no ``n_features``
    argument.
    """
    return [
        get_model_meta_as_dict(
            (best_batch_size, best_n_nodes, best_n_layers, best_activation, dropout_rate)
        )
        for dropout_rate in (0, 0.05, 0.1, 0.2)
    ]

In [37]:
def model_was_already_trained(results_path, h3_res, time_interval_length, model_params):
    """Look up this data set / model configuration in the stored results.

    NOTE: the return value is inverted relative to the function name. The
    function returns True when NO stored row matches the given configuration
    (the model still has to be trained) and False when at least one matching
    row exists. ``execute_stage`` depends on this: it skips a configuration
    when the result is falsy.
    """
    stored_results = get_results_df(results_path)

    expected_values = {
        'h3_res': h3_res,
        'time_interval_length': time_interval_length,
        'batch_size': model_params['batch_size'],
        'n_nodes': model_params['n_nodes'],
        'n_layers': model_params['n_layers'],
        'activation': model_params['activation'],
        'dropout': model_params['dropout'],
    }
    conditions = [
        stored_results[column] == value
        for column, value in expected_values.items()
    ]

    # A row matches only if every identifier and hyperparameter agrees.
    matches = conditions[0]
    for condition in conditions[1:]:
        matches = matches & condition

    return stored_results[matches]['val_mape'].empty

In [38]:
def _resolve_stage_setting(name, value):
    """Return ``value``, or the notebook-level variable called ``name`` when ``value`` is None."""
    if value is not None:
        return value
    try:
        return globals()[name]
    except KeyError:
        raise NameError(
            f"'{name}' was not passed to execute_stage() and no notebook-level "
            f"variable of that name is defined"
        ) from None


def execute_stage(results_path, get_hyperparameters, h3_res=None, time_interval_length=None, max_rows=10000):
    """Train, evaluate and store every not-yet-stored model configuration of one stage.

    Parameters
    ----------
    results_path
        Path of the parquet results table of this stage.
    get_hyperparameters
        Callable that receives the number of features and returns an iterable
        of dictionaries with the keys ``batch_size``, ``n_nodes``,
        ``n_layers``, ``activation`` and ``dropout``.
    h3_res, time_interval_length
        Data set to train on. When omitted (``None``) the values are taken
        from the notebook-level variables of the same names, which is how the
        existing stage loops provide them.
    max_rows
        Only the first ``max_rows`` rows of the model data are used. The
        default of 10000 matches the previously hard-coded limit; pass
        ``None`` to use all rows.

    Raises
    ------
    NameError
        If ``h3_res`` or ``time_interval_length`` is neither passed nor
        defined at notebook level.
    """
    h3_res = _resolve_stage_setting("h3_res", h3_res)
    time_interval_length = _resolve_stage_setting("time_interval_length", time_interval_length)

    model_data = get_model_data(h3_res, time_interval_length)
    if max_rows is not None:
        model_data = model_data.iloc[:max_rows]
    X_train, X_valid, X_test, y_train, y_valid, y_test = split_and_scale_data(model_data)

    for model_params in get_hyperparameters(X_train.shape[1]):
        # Despite its name, model_was_already_trained() returns True when NO
        # stored result matches, so a falsy value means "already stored: skip".
        if not model_was_already_trained(results_path, h3_res, time_interval_length, model_params):
            continue

        model = train_model(
            X_train,
            y_train,
            model_params['batch_size'],
            model_params['n_nodes'],
            model_params['n_layers'],
            model_params['activation'],
            model_params['dropout'],
        )
        y_pred_for_validation = model.predict(X_valid)
        y_pred_for_test = model.predict(X_test)

        results = {
            'h3_res': h3_res,
            'time_interval_length': time_interval_length,
            'batch_size': model_params['batch_size'],
            'n_nodes': model_params['n_nodes'],
            'n_layers': model_params['n_layers'],
            'activation': model_params['activation'],
            'dropout': model_params['dropout'],

            **get_evaluation_metrics(y_valid, y_pred_for_validation, 'val'),
            **get_evaluation_metrics(y_test, y_pred_for_test, 'test'),
        }
        # Persist after every model so an interrupted run keeps its finished results.
        store_results(pd.DataFrame(data=results, index=[0]), results_path)

In [39]:
# Stage 1: search over the batch size for every data-set configuration.
# The loop variables must keep the names `h3_res` and `time_interval_length`,
# because execute_stage() reads them as notebook-level variables.
for h3_res, time_interval_length in product(PREDICTIVE_H3_RESOLUTIONS, CALC_TIME_INTERVAL_LENGTHS):
    get_hyperparameters = get_first_stage_hyperparameters
    execute_stage(NN_FIRST_STAGE_RESULTS_PATH, get_hyperparameters)

In [40]:
# First-stage results table, loaded once when this cell runs.
# get_best_batch_size() reads it through the global name `results`.
results = get_results_df(NN_FIRST_STAGE_RESULTS_PATH)

def get_best_batch_size(h3_res, time_interval_length, results_df=None):
    """Return the batch size with the lowest validation MAPE for one data set.

    Parameters
    ----------
    h3_res, time_interval_length
        Identify the data set whose stored results are ranked.
    results_df
        Results table to search. Defaults to the notebook-level ``results``
        table.

    Raises
    ------
    ValueError
        If the table holds no row for the requested data set.
    """
    table = results if results_df is None else results_df
    belongs_to_data_set = (
        (table['h3_res'] == h3_res)
        & (table['time_interval_length'] == time_interval_length)
    )
    ranked = table[belongs_to_data_set].sort_values(by="val_mape", ascending=True)
    if ranked.empty:
        raise ValueError(
            f"No stored results for h3_res={h3_res!r}, "
            f"time_interval_length={time_interval_length!r}"
        )
    # sort_values() keeps the original index labels, so the best row has to be
    # taken by position; a label lookup of 0 would return the first stored row.
    return ranked['batch_size'].iloc[0]

In [41]:
# Stage 2: search over network shape and activation, using the best
# first-stage batch size of each data-set configuration.
# The loop variables must keep the names `h3_res` and `time_interval_length`,
# because execute_stage() reads them as notebook-level variables.
for h3_res, time_interval_length in product(PREDICTIVE_H3_RESOLUTIONS, CALC_TIME_INTERVAL_LENGTHS):
    best_batch_size = get_best_batch_size(h3_res, time_interval_length)

    def get_hyperparameters(n_features):
        return get_second_stage_hyperparameters(n_features, best_batch_size=best_batch_size)

    execute_stage(NN_SECOND_STAGE_RESULTS_PATH, get_hyperparameters)

<class 'int'>
207 104
Train on 3675 samples, validate on 1225 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
<class 'int'>
207 104
Train on 3675 samples, validate on 1225 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
<class 'int'>
207 104
Train on 3675 samples, validate on 1225 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
<class 'int'>
207 104
Train on 3675 samples, validate on 1225 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
<class 'int'>
207 104
Train on 3675 samples, validate on 1225 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
<class 'int'>
207 104
Train on 3675 samples, validate on 1225 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoc

KeyboardInterrupt: 

In [None]:
# Rebind the global `results` table to the second-stage results so that
# get_best_model() ranks second-stage models. (The previous spelling
# `aesults` left `results` pointing at the first-stage table.)
# Note: get_best_batch_size() reads the same global, so it also sees this
# table if it is called again after this cell.
results = get_results_df(NN_SECOND_STAGE_RESULTS_PATH)

def get_best_model(h3_res, time_interval_length, results_df=None):
    """Return the stored results of one data set, best validation MAPE first.

    Parameters
    ----------
    h3_res, time_interval_length
        Identify the data set whose stored results are ranked.
    results_df
        Results table to search. Defaults to the notebook-level ``results``
        table.

    Returns
    -------
    pandas.DataFrame
        The matching rows sorted by ascending ``val_mape`` and re-indexed
        from 0, so that both position 0 and index label 0 refer to the best
        model. The frame is empty when no row matches.
    """
    table = results if results_df is None else results_df
    belongs_to_data_set = (
        (table['h3_res'] == h3_res)
        & (table['time_interval_length'] == time_interval_length)
    )
    ranked = table[belongs_to_data_set].sort_values(by="val_mape", ascending=True)
    # sort_values() keeps the original index labels; renumber so that a
    # label-based lookup such as `.get(0)` really returns the best row.
    return ranked.reset_index(drop=True)

In [None]:
# Stage 3: search over the dropout rate, starting from the best model that
# get_best_model() returns for each data-set configuration.
# The loop variables must keep the names `h3_res` and `time_interval_length`,
# because execute_stage() reads them as notebook-level variables.
for h3_res in PREDICTIVE_H3_RESOLUTIONS:
    for time_interval_length in CALC_TIME_INTERVAL_LENGTHS:
        ranked_models = get_best_model(h3_res, time_interval_length)
        if ranked_models.empty:
            raise ValueError(
                f"No stored results for h3_res={h3_res!r}, "
                f"time_interval_length={time_interval_length!r}; "
                f"run the previous stage first."
            )
        print(ranked_models.head(1))

        # Select the best row by position: the ranked frame may still carry
        # its original index labels, so a label lookup of 0 is not reliable.
        best_batch_size = ranked_models['batch_size'].iloc[0]
        best_n_nodes = ranked_models['n_nodes'].iloc[0]
        best_n_layers = ranked_models['n_layers'].iloc[0]
        best_activation = ranked_models['activation'].iloc[0]

        # execute_stage() calls this with the feature count, but
        # get_third_stage_hyperparameters() takes no n_features argument, so
        # it must not be forwarded.
        get_hyperparameters = lambda n_features: get_third_stage_hyperparameters(
            best_batch_size=best_batch_size,
            best_n_nodes=best_n_nodes,
            best_n_layers=best_n_layers,
            best_activation=best_activation,
        )
        execute_stage(NN_THIRD_STAGE_RESULTS_PATH, get_hyperparameters)