In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from datetime import datetime
import warnings
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import json
from tqdm.notebook import tqdm
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Matplotlib defaults: 8x6 figures with the axes grid switched off.
mpl.rcParams['figure.figsize'] = (8, 6)
mpl.rcParams['axes.grid'] = False
# Fix TensorFlow's global random seed.
# NOTE(review): no NumPy / Python seed is set in the visible cells - confirm
# whether that matters for reproducibility.
tf.random.set_seed(13)

In [2]:
# Epoch count constant.
# NOTE(review): no cell visible here reads this value (the grid search passes
# its own `epochs`) - confirm whether it is still needed.
global_epoch_number = 100

## Reading data

In [3]:
# Load the three demand CSVs; the first column of each file becomes the index.
evo_data, modo_data, c2g_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'data/demand_datasets/evo_demand.csv',
        'data/demand_datasets/modo_demand.csv',
        'data/demand_datasets/c2g_demand.csv',
    )
)

In [4]:
# Display the column labels of the Evo demand frame.
evo_data.columns

Index(['travels', 'tempC', 'precipMM', 'FeelsLikeC', 'uvIndex', 'visibility',
       'windspeedMiles', 'Blizzard', 'Clear', 'Cloudy', 'Fog', 'Heavy rain',
       'Heavy rain at times', 'Heavy snow', 'Light drizzle', 'Light rain',
       'Light rain shower', 'Light sleet', 'Light sleet showers', 'Light snow',
       'Mist', 'Moderate or heavy freezing rain',
       'Moderate or heavy rain shower', 'Moderate or heavy rain with thunder',
       'Moderate or heavy sleet', 'Moderate or heavy snow showers',
       'Moderate or heavy snow with thunder', 'Moderate rain',
       'Moderate rain at times', 'Moderate snow', 'Overcast', 'Partly cloudy',
       'Patchy heavy snow', 'Patchy light drizzle', 'Patchy light rain',
       'Patchy light rain with thunder', 'Patchy light snow',
       'Patchy moderate snow', 'Patchy rain possible', 'Patchy sleet possible',
       'Patchy snow possible', 'Sunny', 'Thundery outbreaks possible',
       'Torrential rain shower', 'Monday', 'Tuesday', 'Wednesday'

In [5]:
# Names of the columns to remove: 'hour_0' ... 'hour_23'. Built once instead of
# spelling the same 24-item list out for every dataset.
hour_columns = ['hour_{}'.format(hour) for hour in range(24)]

# Drop in place so the existing frame objects are the ones modified.
# Running this cell a second time raises KeyError, because the columns are
# already gone by then.
for demand_frame in (evo_data, modo_data, c2g_data):
    demand_frame.drop(columns=hour_columns, inplace=True)

In [6]:
# Univariate counterparts of each dataset: one-column frames holding only `travels`.
unievo_data = pd.DataFrame(evo_data['travels'])
unimodo_data = pd.DataFrame(modo_data['travels'])
unic2g_data = pd.DataFrame(c2g_data['travels'])

In [7]:
# Month-day bounds of the study window; the year is prepended per dataset below.
init_period = '06-23'
end_period = '07-12'


def _restrict_to_window(frame, lower, upper):
    """Return the rows of `frame` whose index label satisfies lower <= label <= upper."""
    return frame[(frame.index >= lower) & (frame.index <= upper)]


# Evo and Modo are windowed in 2018, c2g in 2017. The bounds are plain strings,
# so the result depends on how the index labels compare against them.
# NOTE(review): if the index holds full timestamps as strings, labels on the end
# day sort after '<year>-07-12' and are excluded - confirm this is intended.
evo_data = _restrict_to_window(evo_data, '2018-'+init_period, '2018-'+end_period)
modo_data = _restrict_to_window(modo_data, '2018-'+init_period, '2018-'+end_period)
c2g_data = _restrict_to_window(c2g_data, '2017-'+init_period, '2017-'+end_period)

unievo_data = _restrict_to_window(unievo_data, '2018-'+init_period, '2018-'+end_period)
unimodo_data = _restrict_to_window(unimodo_data, '2018-'+init_period, '2018-'+end_period)
unic2g_data = _restrict_to_window(unic2g_data, '2017-'+init_period, '2017-'+end_period)

## LSTM Data Preparation

In [8]:
def sup_learning_formatter(data, past_lags, future_steps, train_split):
    """Turn a time-ordered frame into sliding (input window, target window) pairs.

    For every start row ``n`` the input is rows ``n .. n + past_lags - 1`` of all
    columns, and the target is the ``travels`` column for the following
    ``future_steps`` rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``travels`` column; every column is used as an input feature.
    past_lags : int
        Number of consecutive rows in each input window.
    future_steps : int
        Number of consecutive ``travels`` values in each target window.
    train_split : float
        Unused. Kept only so existing callers keep working.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with one ``(past_lags, n_columns)`` window per sample and ``y`` with
        one ``(future_steps,)`` target per sample. Both are empty when ``data``
        is too short to hold a single complete window.
    """
    feature_values = data.values
    # Looked up once here rather than on every loop iteration.
    target_values = data.travels.values

    # The last valid start satisfies start + past_lags + future_steps == len(data),
    # hence the "+ 1": without it the final complete window is silently dropped.
    n_windows = len(data) - past_lags - future_steps + 1

    X = []
    y = []
    for start in range(n_windows):
        split = start + past_lags
        X.append(feature_values[start:split])
        y.append(target_values[split:split + future_steps])
    return np.array(X), np.array(y)

In [9]:
def train_val_test_splitter(data, splits):
    """Cut `data` into consecutive train / validation / test parts.

    `splits` holds fractions of ``len(data)``: the first marks where the
    training part ends, the second where the validation part ends. The fourth
    returned item is the shape of the first element of `data`.
    """
    total = len(data)
    cut_points = []
    for fraction in splits:
        cut_points.append(int(total * fraction))

    train_end, val_end = cut_points[0], cut_points[1]
    train_part = data[:train_end]
    val_part = data[train_end:val_end]
    test_part = data[val_end:]
    return train_part, val_part, test_part, data[0].shape

In [10]:
def eval_model(y, y_hat):
    """Score predictions `y_hat` against the targets `y`.

    Returns a dict with the keys "RMSE" (square root of the mean squared
    error), "MAE" (mean absolute error) and "R2" (r2_score).
    """
    mse = mean_squared_error(y, y_hat)
    return {
        "RMSE": np.sqrt(mse),
        "MAE": mean_absolute_error(y, y_hat),
        "R2": r2_score(y, y_hat),
    }

In [11]:
def persistance_model(X, timesteps):
    """Baseline forecast that repeats the last observed value of each window.

    For every window in `X` the value at ``window[-1][0]`` (last row, first
    entry) is repeated `timesteps` times.
    NOTE(review): assumes the first entry of each row is the target series -
    confirm against the column order of the input frame.
    """
    forecasts = [
        np.array([window[-1][0] for _ in range(timesteps)])
        for window in X
    ]
    return np.array(forecasts)

# Training Models

In [12]:
def plot_train_history(history, title, save_file=False):
    """Plot every metric stored in a training history, one line per metric.

    Parameters
    ----------
    history : object with a ``.history`` mapping
        The mapping is turned into a DataFrame and each column is plotted.
    title : str
        Used only to build the output file name (spaces become underscores).
    save_file : bool, default False
        When true, the figure is also written to ``plots/<title>.png``.
    """
    history_frame = pd.DataFrame(history.history)

    history_frame.plot(figsize=(8, 5))
    plt.grid(True)
    if save_file:
        # Build the path with os.path.join: the previous hard-coded "plots\\"
        # prefix only denoted a directory on Windows; elsewhere it produced a
        # file literally named "plots\<title>.png" in the working directory.
        os.makedirs("plots", exist_ok=True)
        target_path = os.path.join("plots", title.replace(" ", "_") + ".png")
        plt.savefig(target_path, bbox_inches='tight')
    plt.show()

Next, a model is generated for each dataset.

## Grid Search

In [15]:
class GridSearchLSTM:
    """Exhaustive grid search over the hyper-parameters of a stacked-LSTM forecaster.

    Every combination of the values in a parameter grid is trained with
    `run_lstm`; combinations are ranked by the validation loss of their final
    epoch.
    """

    def __init__(self):
        # [params, final-epoch metrics] pairs of the last search, lowest val_loss
        # first. Starts as an empty list so the attribute has the same type
        # before and after search().
        self.evaluations = []
        # Parameter dict of the best-ranked combination of the last search.
        self.best_estimator = None

    def search(self, feature_dict, data, verbose=1, past_lags=24, future_steps=12,
               splits=(0.6, 0.8)):
        """Train one model per grid combination and rank them by final ``val_loss``.

        Parameters
        ----------
        feature_dict : dict
            Maps a `run_lstm` keyword name to the list of values to try.
        data : pandas.DataFrame
            Passed unchanged to `run_lstm`.
        verbose : int
            Forwarded to `run_lstm` (Keras fit verbosity).
        past_lags, future_steps, splits
            Window length, forecast horizon and train/validation cut fractions
            forwarded to `run_lstm`. The defaults are the values that used to be
            hard-coded here.

        Results are stored in ``self.evaluations`` and ``self.best_estimator``.
        """
        candidates = self._create_feature_dict(feature_dict)
        ranked = []
        for params in tqdm(candidates):
            _, hist, _, _ = self.run_lstm(data, past_lags, future_steps, splits,
                                          verbose=verbose, **params)
            # Keep only the value of the last epoch for every tracked metric.
            final_metrics = {key: values[-1] for key, values in hist.history.items()}
            ranked.append([params, final_metrics])

        ranked.sort(key=lambda entry: entry[1]["val_loss"])
        self.evaluations = ranked
        self.best_estimator = ranked[0][0]

    def _create_feature_dict(self, feature_dict):
        """Expand ``{name: [values...]}`` into a list with one dict per combination."""
        return self._create_feature_dict_recurse({}, feature_dict, list(feature_dict.keys()))

    def _create_feature_dict_recurse(self, start_dict, feature_dict, remaining_keys):
        """Extend `start_dict` with every combination of values for `remaining_keys`.

        The first remaining key varies slowest in the returned list.
        """
        if len(remaining_keys) == 0:
            return [start_dict]
        key, rest = remaining_keys[0], remaining_keys[1:]
        combos = []
        for value in feature_dict[key]:
            # Copy so sibling branches never share (and overwrite) one dict.
            extended = dict(start_dict)
            extended[key] = value
            combos += self._create_feature_dict_recurse(extended, feature_dict, rest)
        return combos

    def _build_model(self, input_shape, future_steps, node_number, dropout, layer_count):
        """Assemble the (uncompiled) LSTM stack ending in a Dense forecast layer."""
        model = tf.keras.models.Sequential()

        if(layer_count == 1):
            model.add(tf.keras.layers.LSTM(node_number,
                                    input_shape=input_shape))
            model.add(tf.keras.layers.Dropout(dropout))
        else:
            model.add(tf.keras.layers.LSTM(node_number, return_sequences=True,
                                    input_shape=input_shape))
            model.add(tf.keras.layers.Dropout(dropout/2))

            for _ in range(layer_count - 2):
                model.add(tf.keras.layers.LSTM(node_number, return_sequences=True, activation='relu'))

            model.add(tf.keras.layers.LSTM(node_number, activation='relu'))

        # One output unit per forecast step. This used to be a fixed 12, which
        # only matched the targets when future_steps happened to be 12.
        model.add(tf.keras.layers.Dense(future_steps))
        return model

    def run_lstm(self, data, past_lags, future_steps, splits, node_number=50,
                 epochs=10, batch_size=10000, loss='mae', dropout=0.5, layer_count=2, verbose=1):
        """Window the data, train one LSTM model and score it on the test part.

        Parameters
        ----------
        data : pandas.DataFrame
            Passed to `sup_learning_formatter` to build the input/target windows.
        past_lags, future_steps : int
            Input window length and number of forecast steps.
        splits : sequence of two floats
            Fractions passed to `train_val_test_splitter`.
        node_number : int
            Units in every LSTM layer.
        epochs : int
            Training epochs; each epoch runs 50 training and 50 validation
            steps over the repeated datasets.
        batch_size : int
            Batch size, also used as the shuffle buffer size.
        loss : str
            Keras loss identifier.
        dropout : float
            Dropout rate after the first LSTM layer (halved when
            ``layer_count != 1``).
        layer_count : int
            1 builds a single LSTM layer; any other value builds a first and a
            last LSTM layer with ``layer_count - 2`` layers in between (so
            values below 2 other than 1 still give two LSTM layers).
        verbose : int
            Keras fit verbosity.

        Returns
        -------
        (model, history, (X_test, y_test), evaluation)
            `evaluation` is the result of `eval_model` on the test part.
        """
        X, y = sup_learning_formatter(data, past_lags, future_steps, splits[0])
        X_train, X_val, X_test, X_shape = train_val_test_splitter(X, splits)
        y_train, y_val, y_test, _ = train_val_test_splitter(y, splits)

        train = tf.data.Dataset.from_tensor_slices((X_train, y_train))
        train = train.cache().shuffle(batch_size).batch(batch_size).repeat()

        val = tf.data.Dataset.from_tensor_slices((X_val, y_val))
        val = val.batch(batch_size).repeat()

        model = self._build_model(X_shape, future_steps, node_number, dropout, layer_count)

        # Keep the name `rmse`: it is reported under the history keys
        # 'rmse' / 'val_rmse'.
        def rmse(y_true, y_pred):
            return tf.sqrt(tf.reduce_mean((y_true - y_pred)**2))

        model.compile(optimizer=tf.keras.optimizers.RMSprop(clipvalue=1.0), loss=loss, metrics=[rmse])

        history = model.fit(train, epochs=epochs, steps_per_epoch=50,
                            validation_data=val, validation_steps=50, verbose=verbose
                            )
        y_hat_test = model.predict(X_test)
        evaluation = eval_model(y_test, y_hat_test)

        return model, history, (X_test, y_test), evaluation


In [16]:
# One search object shared by the cells below; every search() call overwrites
# its `evaluations` and `best_estimator`.
grid_search = GridSearchLSTM()

In [None]:
# Hyper-parameter grid: 2 * 3 * 3 * 2 = 36 combinations, each one training a new model.
feature_dict = {"epochs":[30, 50], "layer_count":[3, 4, 5], "node_number":[60, 80, 100], "dropout":[0.5, 0.7]}

# Search on the Evo frame with all of its remaining columns (verbose is left at its default of 1).
grid_search.search(feature_dict, evo_data)

In [17]:
# [params, final-epoch metrics] pairs of the last search, sorted by ascending val_loss.
grid_search.evaluations

[[{'epochs': 30, 'layer_count': 5, 'node_number': 100, 'dropout': 0.7},
  {'loss': 61.98115303039551,
   'rmse': 123.183075,
   'val_loss': 88.98165130615234,
   'val_rmse': 113.064804}],
 [{'epochs': 30, 'layer_count': 4, 'node_number': 60, 'dropout': 0.7},
  {'loss': 48.69632125854492,
   'rmse': 113.06194,
   'val_loss': 90.21286010742188,
   'val_rmse': 113.73388}],
 [{'epochs': 30, 'layer_count': 3, 'node_number': 60, 'dropout': 0.7},
  {'loss': 50.08780746459961,
   'rmse': 114.24129,
   'val_loss': 93.33883666992188,
   'val_rmse': 116.08402}],
 [{'epochs': 30, 'layer_count': 5, 'node_number': 60, 'dropout': 0.7},
  {'loss': 49.519894714355466,
   'rmse': 113.607605,
   'val_loss': 94.20116424560547,
   'val_rmse': 115.09389}],
 [{'epochs': 50, 'layer_count': 3, 'node_number': 100, 'dropout': 0.7},
  {'loss': 36.9069128036499,
   'rmse': 99.26817,
   'val_loss': 96.38170623779297,
   'val_rmse': 117.6753}],
 [{'epochs': 50, 'layer_count': 5, 'node_number': 80, 'dropout': 0.7},
 

Evaluations on the univariate Evo dataset (`unievo_data`, the `travels` column only)

In [17]:
# Same grid as the search on evo_data (36 combinations).
feature_dict = {"epochs":[30, 50], "layer_count":[3, 4, 5], "node_number":[60, 80, 100], "dropout":[0.5, 0.7]}

# Search on the single-column `travels` frame; verbose=0 is forwarded to the Keras fit call.
grid_search.search(feature_dict, unievo_data, verbose=0)

HBox(children=(FloatProgress(value=0.0, max=36.0), HTML(value='')))

[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]
[136, 181]



In [18]:
# Ranked results of the search on unievo_data (ascending val_loss).
grid_search.evaluations

[[{'epochs': 30, 'layer_count': 5, 'node_number': 60, 'dropout': 0.5},
  {'loss': 49.16347648620606,
   'rmse': 111.650566,
   'val_loss': 91.99232482910156,
   'val_rmse': 114.38819}],
 [{'epochs': 30, 'layer_count': 5, 'node_number': 80, 'dropout': 0.7},
  {'loss': 60.79417556762695,
   'rmse': 121.694855,
   'val_loss': 92.54352569580078,
   'val_rmse': 113.08544}],
 [{'epochs': 30, 'layer_count': 3, 'node_number': 100, 'dropout': 0.7},
  {'loss': 49.68187171936035,
   'rmse': 106.38354,
   'val_loss': 96.49043273925781,
   'val_rmse': 119.60009}],
 [{'epochs': 30, 'layer_count': 4, 'node_number': 80, 'dropout': 0.7},
  {'loss': 48.58590560913086,
   'rmse': 108.54301,
   'val_loss': 98.93069458007812,
   'val_rmse': 121.50648}],
 [{'epochs': 50, 'layer_count': 3, 'node_number': 60, 'dropout': 0.7},
  {'loss': 41.57910179138184,
   'rmse': 103.59932,
   'val_loss': 99.25018310546875,
   'val_rmse': 122.40756}],
 [{'epochs': 30, 'layer_count': 5, 'node_number': 80, 'dropout': 0.5},
 

In [18]:
# NOTE(review): the output stored under this cell has 'RMSE' / 'MAE' / 'R2' keys,
# which the search() defined above does not produce - presumably left over from
# an earlier version of the class; confirm and re-run or remove.
grid_search.evaluations

[[{'epochs': 50, 'layer_count': 4, 'node_number': 80, 'dropout': 0.7},
  {'RMSE': 162.35465398209146,
   'MAE': 98.66833072296089,
   'R2': 0.12723739996712005}],
 [{'epochs': 50, 'layer_count': 5, 'node_number': 80, 'dropout': 0.7},
  {'RMSE': 167.96424522934947,
   'MAE': 111.04613152448682,
   'R2': 0.0717189666230026}],
 [{'epochs': 30, 'layer_count': 4, 'node_number': 80, 'dropout': 0.7},
  {'RMSE': 169.5624757862915,
   'MAE': 114.56481393091921,
   'R2': 0.0519735506769354}],
 [{'epochs': 30, 'layer_count': 4, 'node_number': 80, 'dropout': 0.5},
  {'RMSE': 178.52453876247571,
   'MAE': 114.95373132738514,
   'R2': -0.05765273791179789}],
 [{'epochs': 50, 'layer_count': 4, 'node_number': 80, 'dropout': 0.5},
  {'RMSE': 169.96668573362294,
   'MAE': 115.84041734709257,
   'R2': 0.04534592232713058}],
 [{'epochs': 50, 'layer_count': 5, 'node_number': 80, 'dropout': 0.5},
  {'RMSE': 170.70319108707923,
   'MAE': 116.51601840102155,
   'R2': 0.04077082728880005}],
 [{'epochs': 30, 'l

In [16]:
# NOTE(review): the output stored under this cell lists values (e.g. dropout 0.3,
# layer_count 2, node_number 50) that are in neither grid defined above, and
# uses 'RMSE' / 'MAE' / 'R2' keys - presumably from an earlier run; confirm and
# re-run or remove.
grid_search.evaluations

[[{'epochs': 30, 'layer_count': 3, 'node_number': 80, 'dropout': 0.5},
  {'RMSE': 156.89216176717633,
   'MAE': 90.44141034889911,
   'R2': 0.18229860225635006}],
 [{'epochs': 50, 'layer_count': 3, 'node_number': 80, 'dropout': 0.7},
  {'RMSE': 160.75687936313233,
   'MAE': 93.47899978402732,
   'R2': 0.14363741376468986}],
 [{'epochs': 30, 'layer_count': 3, 'node_number': 80, 'dropout': 0.3},
  {'RMSE': 161.24735397338654,
   'MAE': 95.41294302629387,
   'R2': 0.1369881539715284}],
 [{'epochs': 30, 'layer_count': 3, 'node_number': 50, 'dropout': 0.5},
  {'RMSE': 160.3776318433857,
   'MAE': 96.25941219191621,
   'R2': 0.1456387522665716}],
 [{'epochs': 30, 'layer_count': 2, 'node_number': 50, 'dropout': 0.3},
  {'RMSE': 161.92682414714625,
   'MAE': 98.33593439796694,
   'R2': 0.13022077235481136}],
 [{'epochs': 50, 'layer_count': 2, 'node_number': 50, 'dropout': 0.7},
  {'RMSE': 164.11180620426185,
   'MAE': 98.4271030322365,
   'R2': 0.10759922701138487}],
 [{'epochs': 50, 'layer_co