In [1]:
# If the libraries are not yet installed, they can be installed in this notebook using commands similar to the below
# %conda install numpy
# %conda install pandas
# %conda install matplotlib
# %conda install scikit-learn
# %conda install -c conda-forge lightgbm 
# %conda install -c conda-forge swifter
# %conda install -c conda-forge bayesian-optimization 
# %conda install -c conda-forge scipy
# %conda install joblib
# %conda install tqdm

# Something like the following may also work if the above does not
# import sys
# !conda install --yes --prefix {sys.prefix} numpy
# !conda install --yes --prefix {sys.prefix} pandas
# !conda install --yes --prefix {sys.prefix} scikit-learn
# !conda install -c conda-forge --yes --prefix {sys.prefix} lightgbm
# !conda install -c conda-forge --yes --prefix {sys.prefix} swifter
# !conda install -c conda-forge --yes --prefix {sys.prefix} bayesian-optimization 
# !conda install -c conda-forge --yes --prefix {sys.prefix} scipy 
# !conda install --yes --prefix {sys.prefix} joblib
# !conda install --yes --prefix {sys.prefix} tqdm

# To install a specific version, add the version to the install command
# E.g., %conda install numpy=1.20.3

# If all else fails, use pip or follow additional advice such as found at
# https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/

# If you plan to use pip (especially if you are not working within a specified conda environment), 
# the pip commands might look like:
# pip install numpy
# pip install pandas
# pip install scikit-learn
# pip install lightgbm
# pip install swifter
# pip install bayesian-optimization 
# pip install scipy
# pip install joblib
# pip install tqdm

# To install a specific version, add the version to the pip install command
# E.g., pip install numpy==1.20.3

In [2]:
# Load libraries
import time
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import itertools
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import glob
from lightgbm import LGBMRegressor
import random
from sklearn.model_selection import ParameterSampler
import scipy
import gc
from joblib import Parallel, delayed
import contextlib
import joblib
from tqdm import tqdm
from bayes_opt import BayesianOptimization
import os

In [3]:
# Seed NumPy's global generator and Python's `random` module with the same value
# so that the stochastic steps in this notebook repeat across runs
for seed_fn in (np.random.seed, random.seed):
    seed_fn(54321)

# Read in Data and Prepare for Modeling

In [4]:
# Load every processed sensor CSV, in alphabetical order of file name, into a list of
# data frames, each sorted by its "timestamp" column
total_df_list = []

for csv_path in sorted(glob.glob("Data/Unseen Sensor/Processed/*.csv")):
    print("Reading {}".format(csv_path))
    total_df_list.append(pd.read_csv(csv_path).sort_values(by="timestamp"))

Reading Data/Unseen Sensor/Processed/A19-9336-1_Northbound_2019_Processed.csv
Reading Data/Unseen Sensor/Processed/A66-9521-1_Westbound_Processed.csv
Reading Data/Unseen Sensor/Processed/M40-7048-2_Southbound_Processed.csv
Reading Data/Unseen Sensor/Processed/M62-2056A_Eastbound_Processed.csv


In [5]:
# Read in the start and end points csv. Only "start" has 1 subtracted to deal with the
# index differences between R and python; "end" is kept unchanged (it is presumably
# used as an exclusive slice stop later on — TODO confirm)
start_end = pd.read_csv("start_end_points_unseen.csv").assign(
    start=lambda points: points["start"] - 1
)

In [6]:
# Container for the subset data frames (those with only 12 weeks of data per highway)
subset_df_list = []

In [7]:
# Restrict every series to its row window from start_end and label each row as
# train / val / test by its position inside that window.
# The list is (re)created in this cell so that re-running the cell cannot append
# a second copy of every series.
subset_df_list = []

for idx, df in enumerate(total_df_list):

    # The first two columns of start_end are read by position as the slice start and the
    # (exclusive) slice stop — presumably "start" and "end"; TODO confirm the column order
    window_start, window_stop = start_end.iloc[idx, 0], start_end.iloc[idx, 1]

    # "rn" is the 0-based row number inside the selected window
    subset_df = (
        df.iloc[window_start:window_stop]
        .reset_index(drop=True)
        .reset_index(drop=False)
        .rename(columns={"index": "rn"})
    )

    # Rows with rn < 96*7*8 are "train", the following rows with rn < 96*7*10 are "val" and
    # all remaining rows are "test" (96*7 rows is presumably one week of data — TODO confirm)
    subset_df["train_val_test"] = np.where(subset_df["rn"] < (96*7*8),
                                           "train",
                                           np.where(subset_df["rn"] < (96*7*10),
                                                    "val",
                                                    "test"
                                                   )
                                          )

    # Append to list
    subset_df_list.append(subset_df)

In [8]:
# Reduce each subset frame to the three fields the models need, renaming the
# timestamp to "start" and the volume to "target"
model_columns = ['timestamp', 'total_volume', "train_val_test"]

model_df_list = [
    subset[model_columns].rename(columns={'timestamp':'start', 'total_volume':'target'})
    for subset in subset_df_list
]

## Create Lag Embedded Matrices for each TS

In [9]:
# Largest lag used for the lag embedding: lagged target columns are created for
# lags 1 through lag_n, and lag_n is passed to the modeling functions as the
# number of lag features
lag_n = 840

In [10]:
# Lag embed the data frames and save to a list.
# All lag columns of a series are built first and attached with a single concat:
# inserting the columns one at a time fragments the frame (pandas raises a
# PerformanceWarning) and also modified the frames stored in model_df_list in place.
lag_embed_df_list = list()

for df in model_df_list:
    # One column per lag level 1..lag_n, named "target-n", holding the target shifted by n rows
    lag_columns = pd.DataFrame(
        {f"target-{n}": df['target'].shift(n) for n in range(1, (lag_n+1))},
        index=df.index,
    )
    # Original columns first, followed by the lag columns in increasing lag order
    lag_embed_df_list.append(pd.concat([df, lag_columns], axis=1))

  df[name] = df['target'].shift(n)


In [11]:
# Partition every lag embedded series into its train, val, and test rows

# One output list per split
train_df_list, val_df_list, test_df_list = [], [], []

for series_pos, lagged in enumerate(lag_embed_df_list):
    # Work on a copy so that the lag embedded frame itself is left untouched
    df = lagged.copy()

    # 1-based series identifier, used to join with clustering data from R
    df['ts_index'] = series_pos + 1

    # Send the rows to the list matching their train_val_test label
    for label, bucket in (("train", train_df_list),
                          ("val", val_df_list),
                          ("test", test_df_list)):
        bucket.append(df[df["train_val_test"] == label].copy())

In [12]:
# Stack the per-series frames of each split into one full train, val, and test df
train_df_full, val_df_full, test_df_full = (
    pd.concat(frames) for frames in (train_df_list, val_df_list, test_df_list)
)

In [13]:
# Remove the timestamp ('start') and split label ('train_val_test') columns from each
# full frame, in place
for full_frame in (train_df_full, val_df_full, test_df_full):
    full_frame.drop(columns=['start', 'train_val_test'], inplace=True)

In [14]:
# Stack the training and validation data together for later use.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
train_val_df_full = pd.concat([train_df_full, val_df_full])

# Model Each Time Series Using Default LightGBM Parameters

In [15]:
# Fit a LightGBM model with default parameters for a single time series
def train_local_lgbm(m, data, n):
    """Train a GOSS LightGBM regressor for one time series.

    m    -- value of ts_index selecting the series
    data -- training data frame; column 0 is the target and columns 1..n are the lag features
    n    -- lag embedding size; the first n rows of the series are skipped and
            n lag columns are used as features

    Returns the fitted LGBMRegressor.
    """

    # Rows of series m without the first n rows (presumably those whose lag columns
    # are incomplete — TODO confirm)
    series_rows = data.query("ts_index==@m").iloc[n:]
    target = series_rows.iloc[:, 0]
    lag_features = series_rows.iloc[:, 1:(n+1)]

    # Fixed random_state keeps the fit reproducible
    fitted_model = LGBMRegressor(boosting_type='goss', random_state=54321)
    fitted_model.fit(lag_features, target)

    return fitted_model

In [16]:
# Progress bar support for joblib, taken from:
# https://stackoverflow.com/questions/24983493/tracking-progress-of-joblib-parallel-execution
# It lets a tqdm bar advance while parallel loops run through joblib

@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Context manager which temporarily patches joblib so that every completed batch
    advances the tqdm progress bar passed as argument"""
    # Remember joblib's own callback class so it can be restored on exit
    original_callback = joblib.parallel.BatchCompletionCallBack

    class ProgressReportingCallback(original_callback):
        def __call__(self, *args, **kwargs):
            # Advance the bar by the size of the batch that just finished,
            # then run joblib's normal completion handling
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    joblib.parallel.BatchCompletionCallBack = ProgressReportingCallback
    try:
        yield tqdm_object
    finally:
        # Undo the patch and close the bar, even if the body raised
        joblib.parallel.BatchCompletionCallBack = original_callback
        tqdm_object.close()

In [18]:
# Train one default-parameter LGBM model per time series (ts_index 1 to 4) in parallel
# and keep the fitted models in a list
default_fit_jobs = (delayed(train_local_lgbm)(i, train_val_df_full, lag_n) for i in range(1, 5))

with tqdm_joblib(tqdm(desc="Local Models - Default Params", total=4)) as progress_bar:
    results = Parallel(n_jobs=4)(default_fit_jobs)

Local Models - Default Params: 100%|██████████████| 4/4 [00:06<00:00,  1.67s/it]


In [19]:
# Function to make predictions on the test data
def predict_test_lgbm(model, m, data, n):
    """Evaluate a trained model on the test rows of one time series.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    m : value of ``ts_index`` identifying the series to evaluate
    data : test data frame with a ``ts_index`` column; column 0 is the target and
        columns 1..n are the lag features
    n : lag embedding size, i.e. the number of lag columns passed to the model

    Returns
    -------
    dict with keys ``"mean"`` (mean of the true test values), ``"mae"`` and ``"rmse"``
    """

    # Select the rows of series m once and split them into target and lag features
    series_rows = data.query("ts_index==@m")
    y_test_sub = series_rows.iloc[:, 0]
    X_test_sub = series_rows.iloc[:, 1:(n+1)]

    # Predict on the test data
    test_preds_sub = model.predict(X_test_sub)

    # Mean of the true test data, used later to scale the error metrics
    test_mean = np.mean(y_test_sub)
    test_mae = mean_absolute_error(y_test_sub, test_preds_sub)
    # RMSE as the square root of the MSE: the `squared=False` shortcut of
    # mean_squared_error was deprecated and then removed from scikit-learn
    test_rmse = np.sqrt(mean_squared_error(y_test_sub, test_preds_sub))

    # Return the mean, mae, and rmse as a dictionary
    return {"mean": test_mean, "mae": test_mae, "rmse": test_rmse}

In [20]:
# Evaluate every default-parameter model on the test rows of its own series (the model
# at list position i belongs to ts_index i+1) and collect the performance dictionaries
default_eval_jobs = (delayed(predict_test_lgbm)(results[i], i+1, test_df_full, lag_n) for i in range(4))

with tqdm_joblib(tqdm(desc="Local Models - Test Preds", total=4)) as progress_bar:
    test_results = Parallel(n_jobs=4)(default_eval_jobs)

Local Models - Test Preds: 100%|██████████████████| 4/4 [00:00<00:00,  8.58it/s]


In [21]:
# Turn the list of per-series performance dictionaries into a data frame
# (one row per entry of test_results, one column per dictionary key)
local_model_test_perf = pd.DataFrame(test_results)

In [23]:
# Add scaled metrics: each error metric divided by the mean of the true test values
for scaled_col, error_col in (('nrmse', 'rmse'), ('smae', 'mae')):
    local_model_test_perf[scaled_col] = local_model_test_perf[error_col]/local_model_test_perf['mean']

In [24]:
# Display the mean of every column (raw and scaled metrics), averaged over the rows,
# i.e. an unweighted average across the time series
local_model_test_perf.mean()

mean     377.058594
mae       25.965267
rmse      38.541254
nrmse      0.123419
smae       0.083644
dtype: float64

In [26]:
# Residuals of each default-parameter model on its own train+val rows, keyed by ts index
base_mod_resiudal_dict = dict()

for i in range(1, 5):
    # Target and lag columns of series i, keeping only rows without missing values
    series_rows = train_val_df_full[train_val_df_full["ts_index"] == i]
    complete_rows = series_rows.iloc[:, 0:(lag_n+1)].dropna()

    # Column 0 is the observed target, the remaining columns are the lag features
    observed = complete_rows.iloc[:, 0]
    fitted = results[i-1].predict(complete_rows.iloc[:, 1:])

    # Residual = observed - fitted, stored as a plain list
    base_mod_resiudal_dict[i] = (observed - fitted).tolist()

In [27]:
# Test set predictions of each default-parameter model, keyed by ts index:
# the lag columns (columns 1..lag_n) of the series' test rows are passed to the
# model stored at list position i-1
base_mod_test_preds = {
    i: results[i-1].predict(
        test_df_full[test_df_full["ts_index"] == i].iloc[:, 1:(lag_n+1)]
    )
    for i in range(1, 5)
}

In [28]:
# Number of bootstrap draws taken from the residuals for every test observation
n_boot = 1000

# Set random seed for reproducibility
np.random.seed(54321)
random.seed(54321)

# The per-series interval frames are collected in a list and concatenated once at the
# end: DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call
pred_int_frames = []

# Loop through all ts indexes
for i in range(1, 5):

    # Print i to monitor progress
    print(i)

    # True/target values of the test data for this ts
    y_test_sub = test_df_full.query("ts_index==@i").iloc[:, 0]

    # Test predictions and residual pool of the matching model; the pool is converted
    # to an array once instead of on every draw
    test_preds = base_mod_test_preds[i]
    residual_pool = np.asarray(base_mod_resiudal_dict[i])

    # One row of bounds per test observation: [2.5th, 97.5th, 10th, 90th] percentile.
    # Every prediction is used (rather than a hard-coded count of 1344) and the draws
    # happen in the original order, one call per observation.
    bounds = []
    for pred in test_preds:
        # Sample n_boot residuals with replacement and add them to the prediction
        resid_preds = pred + np.random.choice(residual_pool, size=n_boot, replace=True)
        bounds.append(np.percentile(resid_preds, [2.5, 97.5, 10, 90]))
    bounds = np.array(bounds).reshape(-1, 4)

    # Frame with the ts_index, true values, and PI bounds (the index comes from y_test_sub)
    pred_int_frames.append(pd.DataFrame({"ts_index": i,
                                         "actual": y_test_sub,
                                         "pct_95_lo": bounds[:, 0],
                                         "pct_95_hi": bounds[:, 1],
                                         "pct_80_lo": bounds[:, 2],
                                         "pct_80_hi": bounds[:, 3]
                                        }))

# Combine the per-series frames into the full data frame
pred_int_df = pd.concat(pred_int_frames)

1
2
3
4


In [29]:
# Display the first rows as a sanity check: ts_index, the true value, and the bounds
# of the 95% and 80% prediction intervals
pred_int_df.head()

Unnamed: 0,ts_index,actual,pct_95_lo,pct_95_hi,pct_80_lo,pct_80_hi
6720,1,221.0,233.558565,276.405711,240.604273,268.331318
6721,1,323.0,223.017549,265.066333,230.213948,259.488536
6722,1,298.0,279.402182,321.410465,287.489184,315.676443
6723,1,305.0,256.025554,298.615415,264.264211,291.793449
6724,1,238.0,267.300907,309.638195,274.520606,303.200311


In [32]:
# Create a function to compute the interval score
def interval_score(true_values, lower, upper, interval_range):
    """Compute the interval score of each prediction interval.

    With ``alpha = 1 - interval_range`` the score of an observation ``y`` is

        (upper - lower)
        + (2/alpha) * (lower - y) * [y < lower]
        + (2/alpha) * (y - upper) * [y > upper]

    i.e. the interval width plus a penalty whenever the true value falls outside
    the interval. Formula: https://epiforecasts.io/scoringutils/reference/interval_score.html

    Parameters
    ----------
    true_values : array-like of observed values
    lower, upper : array-likes with the lower and upper interval bounds
    interval_range : nominal coverage of the interval as a fraction strictly between
        0 and 1 (e.g. 0.9 for a 90% interval, not 90)

    Returns
    -------
    numpy.ndarray with one score per observation

    Raises
    ------
    ValueError
        If ``interval_range`` is not strictly between 0 and 1.
    """

    # Guard against percent-style input (e.g. 95) and against alpha == 0, which would
    # otherwise give silently wrong scores or a division by zero
    coverage = np.asarray(interval_range)
    if np.any((coverage <= 0) | (coverage >= 1)):
        raise ValueError("interval_range must be strictly between 0 and 1, got {}".format(interval_range))

    # Compute alpha from the interval range
    alpha = 1-interval_range

    # Save the upper, lower, and true_values as numpy arrays for computation purposes
    upper = np.array(upper)
    lower = np.array(lower)
    true_values = np.array(true_values)

    # Width of the interval plus the penalties for true values below / above the interval;
    # the boolean comparisons act as 0/1 indicators
    scores = (upper-lower) + (2/alpha)*(lower-true_values)*(lower > true_values) + (2/alpha)*(true_values-upper)*(true_values > upper)

    # Return the scores array
    return scores

In [33]:
# Score the 80% and then the 95% prediction intervals with interval_score,
# storing one score column per interval level
for score_col, lo_col, hi_col, level in (
    ('int_80_score', 'pct_80_lo', 'pct_80_hi', 0.8),
    ('int_95_score', 'pct_95_lo', 'pct_95_hi', 0.95),
):
    pred_int_df[score_col] = interval_score(pred_int_df['actual'],
                                            pred_int_df[lo_col],
                                            pred_int_df[hi_col],
                                            level
                                           )

In [34]:
# Display the mean of every column over all rows of pred_int_df: the true values,
# the interval bounds, and the 80% / 95% interval scores
pred_int_df.mean()

ts_index          2.500000
actual          377.058594
pct_95_lo       343.454157
pct_95_hi       414.783086
pct_80_lo       356.117900
pct_80_hi       402.027249
int_80_score    158.110497
int_95_score    357.412391
dtype: float64

In [35]:
# Per time series: mean 95% and 80% interval score and mean of the true data
pred_int_df_grouped = pred_int_df.groupby("ts_index", as_index=False).agg(
    {"int_95_score":"mean", "int_80_score":"mean", "actual":"mean"}
)

# Scaled interval score: the mean interval score divided by the mean of the true data
for scaled_col, score_col in (
    ('int_95_score_scaled', 'int_95_score'),
    ('int_80_score_scaled', 'int_80_score'),
):
    pred_int_df_grouped[scaled_col] = pred_int_df_grouped[score_col]/pred_int_df_grouped['actual']

In [36]:
# Display the scaled 80% and 95% interval scores averaged over the time series
# (each series contributes one row, so every series has equal weight)
pred_int_df_grouped[['int_80_score_scaled', 'int_95_score_scaled']].mean()

int_80_score_scaled    0.506256
int_95_score_scaled    1.175555
dtype: float64

# Local Models with Bayesian Optimization

In [37]:
# Define function to perform Bayesian optimization
def optimize_lgbm_w_bayes(i, lag_n, train_df, val_df):
    """Tune the LightGBM parameters of one time series with Bayesian optimization.

    Parameters
    ----------
    i : value of ``ts_index`` identifying the series
    lag_n : lag embedding size; columns 1..lag_n of the frames are the lag features
    train_df : training data frame (column 0 is the target) with a ``ts_index`` column
    val_df : validation data frame with the same layout

    Returns
    -------
    dict with the best parameter values found, as reported by the optimizer
    (``optimizer.max['params']``); values are not rounded or cast here
    """

    # Rows of series i, limited to target + lag columns, without rows holding missing values
    train_rows = train_df.query("ts_index==@i").iloc[:, 0:(lag_n+1)].dropna()
    y_train_bayes = train_rows.iloc[:, 0].copy()
    X_train_bayes = train_rows.iloc[:, 1:].copy()

    # Get the validation data for the provided ts_index i
    val_df_bayes = val_df.query("ts_index==@i").copy()

    # Set up space of lgbm params to explore, as (lower bound, upper bound) pairs
    bayes_param_ss = {
    "n_estimators": (100, 1000),
    "max_depth": (2, 25),
    "lambda_l1": (0, 1),
    "lambda_l2": (0, 1),
    "num_leaves": (10, 150),
    "colsample_bytree": (0.1, 1),
    "learning_rate": (0.00001, 0.5)
    }


    # Define a function to compute validation set predictions
    def val_predict(model, val_rows):
        """Function takes in a trained model and validation data and returns normalized rmse for preds on the
        validation data"""

        # Subset the validation data frame into y and X data frames
        y_val_sub = val_rows.iloc[:, 0]
        X_val_sub = val_rows.iloc[:, 1:(lag_n+1)]
        # Compute the mean of the true y values
        val_mean_sub = np.mean(y_val_sub)

        # Make model predictions
        val_preds_sub = model.predict(X_val_sub)

        # RMSE as the square root of the MSE (the `squared=False` shortcut of
        # mean_squared_error was deprecated and then removed from scikit-learn),
        # divided by the mean to get nrmse
        val_rmse_sub = np.sqrt(mean_squared_error(y_val_sub, val_preds_sub))
        val_nrmse_sub = val_rmse_sub/val_mean_sub

        # Return normalized rmse
        return val_nrmse_sub


    # Define the objective function handed to the bayesian optimizer
    def lgbm_eval_for_bayes(n_estimators,
                        max_depth,
                        lambda_l1,
                        lambda_l2,
                        num_leaves,
                        colsample_bytree,
                        learning_rate
                       ):
        """Function which takes in parameter values as inputs and returns a value to be maximized by the
        Bayesian optimizer. In this case, we return -1*validation_nrmse as this allows us to minimize the
        validation nrmse"""

        # Set the proper boosting type
        params = {"boosting_type": "goss"
                 }

        # Set the params dictionary to include all input params
        # For n_estimators, max_depth, and num_leaves, round and cast as int - this is what the lgbm model requires
        params["n_estimators"] = int(round(n_estimators))
        params["max_depth"] = int(round(max_depth))
        # The L1 / L2 penalties are passed under their scikit-learn style names and clipped at 0
        params["reg_alpha"] = max(lambda_l1, 0)
        params["reg_lambda"] = max(lambda_l2, 0)
        params["num_leaves"] = int(round(num_leaves))
        params["colsample_bytree"] = colsample_bytree
        params["learning_rate"] = learning_rate

        # Create the model object, setting a constant random_state for reproducibility, and fit the model
        mod = LGBMRegressor(**params, random_state=54321)
        mod.fit(X_train_bayes, y_train_bayes)

        # Return the negative validation nrmse (val_predict already returns a single number)
        return -1*val_predict(mod, val_df_bayes)

    # Create an optimizer object, again setting random_state
    optimizer = BayesianOptimization(lgbm_eval_for_bayes,
                                     bayes_param_ss,
                                     random_state=54321)
    # Run the optimizer with 5 random initialization points and 25 further iterations
    optimizer.maximize(init_points=5, n_iter=25)

    # Return the best params found by the optimizer
    return optimizer.max['params']

In [38]:
# Run the Bayesian optimizer for every time series (ts_index 1 to 4) in parallel,
# using the train data for fitting and the val data for scoring.
# The best params of each series are saved to a list
bayes_search_jobs = (
    delayed(optimize_lgbm_w_bayes)(i, lag_n, train_df_full, val_df_full)
    for i in range(1, 5)
)

with tqdm_joblib(tqdm(desc="Local Models - Bayesian Optim", total=4)) as progress_bar:
    local_bayes_results = Parallel(n_jobs=4)(bayes_search_jobs)

Local Models - Bayesian Optim:  50%|██████▌      | 2/4 [05:17<04:42, 141.12s/it]

|   iter    |  target   | colsam... | lambda_l1 | lambda_l2 | learni... | max_depth | n_esti... | num_le... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.1897  [0m | [0m 0.9205  [0m | [0m 0.6238  [0m | [0m 0.7918  [0m | [0m 0.2149  [0m | [0m 14.49   [0m | [0m 472.1   [0m | [0m 21.99   [0m |
| [0m 2       [0m | [0m-0.2078  [0m | [0m 0.7999  [0m | [0m 0.4889  [0m | [0m 0.05053 [0m | [0m 0.2692  [0m | [0m 2.954   [0m | [0m 827.3   [0m | [0m 149.3   [0m |
| [0m 3       [0m | [0m-0.2213  [0m | [0m 0.6748  [0m | [0m 0.9459  [0m | [0m 0.7367  [0m | [0m 0.411   [0m | [0m 4.633   [0m | [0m 761.5   [0m | [0m 125.0   [0m |
| [0m 4       [0m | [0m-0.2431  [0m | [0m 0.1665  [0m | [0m 0.2079  [0m | [0m 0.8057  [0m | [0m 0.4572  [0m | [0m 5.424   [0m | [0m 962.3   [0m | [0m 62.04   [0m |
| [0m 5       [0m | [0m-0.2064  [0m | [0m 0.746

Local Models - Bayesian Optim:  75%|██████████▌   | 3/4 [05:22<01:18, 78.81s/it]

|   iter    |  target   | colsam... | lambda_l1 | lambda_l2 | learni... | max_depth | n_esti... | num_le... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.2058  [0m | [0m 0.9205  [0m | [0m 0.6238  [0m | [0m 0.7918  [0m | [0m 0.2149  [0m | [0m 14.49   [0m | [0m 472.1   [0m | [0m 21.99   [0m |
| [0m 2       [0m | [0m-0.2081  [0m | [0m 0.7999  [0m | [0m 0.4889  [0m | [0m 0.05053 [0m | [0m 0.2692  [0m | [0m 2.954   [0m | [0m 827.3   [0m | [0m 149.3   [0m |
| [0m 3       [0m | [0m-0.234   [0m | [0m 0.6748  [0m | [0m 0.9459  [0m | [0m 0.7367  [0m | [0m 0.411   [0m | [0m 4.633   [0m | [0m 761.5   [0m | [0m 125.0   [0m |
| [0m 4       [0m | [0m-0.2523  [0m | [0m 0.1665  [0m | [0m 0.2079  [0m | [0m 0.8057  [0m | [0m 0.4572  [0m | [0m 5.424   [0m | [0m 962.3   [0m | [0m 62.04   [0m |
| [0m 5       [0m | [0m-0.2069  [0m | [0m 0.746

Local Models - Bayesian Optim: 100%|█████████████| 4/4 [07:20<00:00, 110.15s/it]


In [39]:
# Define function to train a final model
def train_best_lgbm_local(m, data, param_ls, lag_n):
    """Fit the final LightGBM model of one time series with its tuned parameters.

    Parameters
    ----------
    m : ts index of the series; its parameters are read from ``param_ls[m-1]``
    data : full training + validation data frame (column 0 is the target, columns
        1..lag_n are the lag features) with a ``ts_index`` column
    param_ls : list of parameter dictionaries, one per time series; not modified
    lag_n : lag embedding of the data

    Returns
    -------
    The fitted LGBMRegressor.
    """

    # Rows of series m, limited to target + lag columns, without rows holding missing values
    train_rows = data.query("ts_index==@m").iloc[:, 0:(lag_n+1)].dropna()
    y_train_sub = train_rows.iloc[:, 0].copy()
    X_train_sub = train_rows.iloc[:, 1:].copy()

    # Work on a copy of the params for this model so that the caller's dictionary
    # is not changed by the rounding below
    params = dict(param_ls[m-1])
    # Round and cast to int the params which must be integers
    for int_param in ('n_estimators', 'num_leaves', 'max_depth'):
        params[int_param] = int(round(params[int_param]))

    # Create and fit the model object
    mod_sub = LGBMRegressor(boosting_type="goss", **params, random_state=54321)
    mod_sub.fit(X_train_sub, y_train_sub)

    # Return the fitted model
    return mod_sub

In [40]:
# Train the final model of every time series (ts_index 1 to 4) in parallel on the
# train+val data, using the params found by the Bayesian optimization.
# The fitted models are saved to a list
final_fit_jobs = (
    delayed(train_best_lgbm_local)(i, train_val_df_full, local_bayes_results, lag_n)
    for i in range(1, 5)
)

with tqdm_joblib(tqdm(desc="Local Models - Bayesian Optim", total=4)) as progress_bar:
    final_local_bayes_models = Parallel(n_jobs=4)(final_fit_jobs)

Local Models - Bayesian Optim: 100%|██████████████| 4/4 [00:15<00:00,  3.98s/it]


In [41]:
# Evaluate every tuned model on the test rows of its own series in parallel (the model
# at list position i belongs to ts_index i+1) and collect the prediction performance
bayes_eval_jobs = (
    delayed(predict_test_lgbm)(final_local_bayes_models[i], i+1, test_df_full, lag_n)
    for i in range(4)
)

with tqdm_joblib(tqdm(desc="Local Models Bayes - Test Preds", total=4)) as progress_bar:
    test_results_bayes = Parallel(n_jobs=4)(bayes_eval_jobs)

Local Models Bayes - Test Preds: 100%|████████████| 4/4 [00:00<00:00,  5.71it/s]


In [42]:
# Turn the list of per-series test performance results of the tuned models into a
# data frame (one row per entry of test_results_bayes)
test_results_bayes_df = pd.DataFrame(test_results_bayes)

In [43]:
# Add normalized metrics: each error metric divided by the mean of the true test values
for scaled_col, error_col in (('nrmse', 'rmse'), ('smae', 'mae')):
    test_results_bayes_df[scaled_col] = test_results_bayes_df[error_col]/test_results_bayes_df['mean']

In [44]:
# Display the mean of every column (raw and normalized metrics), averaged over the
# rows, i.e. an unweighted average across the time series
test_results_bayes_df.mean()

mean     377.058594
mae       25.299451
rmse      37.795336
nrmse      0.120738
smae       0.081754
dtype: float64

In [45]:
# Residuals of each tuned model on its own train+val rows, keyed by ts index
bayes_mod_resiudal_dict = dict()

for i in range(1, 5):
    # Target and lag columns of series i, keeping only rows without missing values
    series_rows = train_val_df_full[train_val_df_full["ts_index"] == i]
    complete_rows = series_rows.iloc[:, 0:(lag_n+1)].dropna()

    # Column 0 is the observed target, the remaining columns are the lag features
    observed = complete_rows.iloc[:, 0]
    fitted = final_local_bayes_models[i-1].predict(complete_rows.iloc[:, 1:])

    # Residual = observed - fitted, stored as a plain list
    bayes_mod_resiudal_dict[i] = (observed - fitted).tolist()

In [46]:
# Test set predictions of each tuned model, keyed by ts index:
# the lag columns (columns 1..lag_n) of the series' test rows are passed to the
# model stored at list position i-1
bayes_mod_test_preds = {
    i: final_local_bayes_models[i-1].predict(
        test_df_full[test_df_full["ts_index"] == i].iloc[:, 1:(lag_n+1)]
    )
    for i in range(1, 5)
}

In [47]:
# Define n_boot, the number of bootstrap samples to use for PI computation
n_boot = 1000

# Set random seed for reproducibility
np.random.seed(54321)
random.seed(54321)

# The per-series interval frames are collected in a list and concatenated once at the
# end: DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call
bayes_pred_int_frames = []

# Loop through the ts indexes in our list
for i in range(1, 5):

    # Grab the true data y for that index
    y_test_sub = test_df_full.query("ts_index==@i").iloc[:, 0]

    # Test predictions and residual pool of the matching tuned model; the pool is
    # converted to an array once instead of on every draw
    test_preds = bayes_mod_test_preds[i]
    residual_pool = np.asarray(bayes_mod_resiudal_dict[i])

    # One row of bounds per test observation: [2.5th, 97.5th, 10th, 90th] percentile.
    # Every prediction is used (rather than a hard-coded count of 1344) and the draws
    # happen in the original order, one call per observation.
    bounds = []
    for pred in test_preds:
        # Draw a bootstrap sample of residuals and add it to the prediction
        resid_preds = pred + np.random.choice(residual_pool, size=n_boot, replace=True)
        bounds.append(np.percentile(resid_preds, [2.5, 97.5, 10, 90]))
    bounds = np.array(bounds).reshape(-1, 4)

    # Frame with the ts_index, true values, and PI bounds (the index comes from y_test_sub)
    bayes_pred_int_frames.append(pd.DataFrame({"ts_index": i,
                                               "actual": y_test_sub,
                                               "pct_95_lo": bounds[:, 0],
                                               "pct_95_hi": bounds[:, 1],
                                               "pct_80_lo": bounds[:, 2],
                                               "pct_80_hi": bounds[:, 3]
                                              }))

# Combine the per-series frames into the full df
bayes_pred_int_df = pd.concat(bayes_pred_int_frames)

In [48]:
# Score the 80% and then the 95% prediction intervals of the tuned models with
# interval_score, storing one score column per interval level
for score_col, lo_col, hi_col, level in (
    ('int_80_score', 'pct_80_lo', 'pct_80_hi', 0.8),
    ('int_95_score', 'pct_95_lo', 'pct_95_hi', 0.95),
):
    bayes_pred_int_df[score_col] = interval_score(bayes_pred_int_df['actual'],
                                                  bayes_pred_int_df[lo_col],
                                                  bayes_pred_int_df[hi_col],
                                                  level
                                                 )

In [49]:
# Display the mean of every column over all rows of bayes_pred_int_df: the true values,
# the interval bounds, and the 80% / 95% interval scores
bayes_pred_int_df.mean()

ts_index          2.500000
actual          377.058594
pct_95_lo       335.003617
pct_95_hi       424.157675
pct_80_lo       353.138723
pct_80_hi       405.628392
int_80_score    149.897297
int_95_score    313.767610
dtype: float64

In [50]:
# Per time series: mean 95% and 80% interval score and mean of the true data
bayes_pred_int_df_grouped = bayes_pred_int_df.groupby("ts_index", as_index=False).agg(
    {"int_95_score":"mean", "int_80_score":"mean", "actual":"mean"}
)

# Scaled interval score: the mean interval score divided by the mean of the true data
for scaled_col, score_col in (
    ('int_95_score_scaled', 'int_95_score'),
    ('int_80_score_scaled', 'int_80_score'),
):
    bayes_pred_int_df_grouped[scaled_col] = bayes_pred_int_df_grouped[score_col]/bayes_pred_int_df_grouped['actual']

In [51]:
# Display the scaled 80% and 95% interval scores of the tuned models averaged over the
# time series (each series contributes one row, so every series has equal weight)
bayes_pred_int_df_grouped[['int_80_score_scaled', 'int_95_score_scaled']].mean()

int_80_score_scaled    0.520346
int_95_score_scaled    1.294142
dtype: float64