In [1]:
# If the libraries are not yet installed, they can be installed in this notebook using commands similar to the below
# %conda install numpy
# %conda install pandas
# %conda install matplotlib
# %conda install scikit-learn
# %conda install -c conda-forge lightgbm 
# %conda install -c conda-forge swifter
# %conda install -c conda-forge bayesian-optimization 
# %conda install -c conda-forge scipy
# %conda install joblib
# %conda install tqdm

# Something like the following may also work if the above does not
# import sys
# !conda install --yes --prefix {sys.prefix} numpy
# !conda install --yes --prefix {sys.prefix} pandas
# !conda install --yes --prefix {sys.prefix} scikit-learn
# !conda install -c conda-forge --yes --prefix {sys.prefix} lightgbm
# !conda install -c conda-forge --yes --prefix {sys.prefix} swifter
# !conda install -c conda-forge --yes --prefix {sys.prefix} bayesian-optimization 
# !conda install -c conda-forge --yes --prefix {sys.prefix} scipy 
# !conda install --yes --prefix {sys.prefix} joblib
# !conda install --yes --prefix {sys.prefix} tqdm

# To install a specific version, add the version to the install command
# E.g., %conda install numpy=1.20.3

# If all else fails, use pip or follow additional advice such as found at
# https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/

# If you plan to use pip (especially if you are not working within a specified conda environment), 
# the pip commands might look like:
# pip install numpy
# pip install pandas
# pip install scikit-learn
# pip install lightgbm
# pip install swifter
# pip install bayesian-optimization 
# pip install scipy
# pip install joblib
# pip install tqdm

# To install a specific version, add the version to the pip install command
# E.g., pip install numpy==1.20.3

In [2]:
import time
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import itertools
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import glob
from lightgbm import LGBMRegressor
import random
from sklearn.model_selection import ParameterSampler
import scipy
import gc
from joblib import Parallel, delayed
import contextlib
import joblib
from tqdm import tqdm
from bayes_opt import BayesianOptimization
import swifter
import os

In [3]:
# Set seeds for reproducibility: NumPy's legacy global RNG and Python's built-in `random` module
np.random.seed(54321)
random.seed(54321)

In [4]:
# Create directories to save results
# os.makedirs (unlike os.mkdir) also creates any missing parent directories such as
# "Results/Global", and exist_ok=True makes the cell safe to re-run

results_root = "Results/Global/LightGBM Bayes"

# One sub-directory per clustering approach evaluated in this notebook
for results_sub_dir in ("Full",
                        "Random Cluster",
                        "Highway System",
                        "Catch22 KMeans",
                        "TSFeat KMeans",
                        "DTW"):
    os.makedirs(results_root + "/" + results_sub_dir, exist_ok=True)

# Read in Data and Prepare for Modeling

In [5]:
# Collect one dataframe per Highways England csv file
england_df_list = []

# Visit the files in alphabetical order; load each one, order its rows by timestamp,
# and add it to the list
for csv_path in sorted(glob.glob("Data/Processed/Highways_England/*.csv")):
    print("Reading {}".format(csv_path))
    england_df_list.append(pd.read_csv(csv_path).sort_values(by="timestamp"))

Reading Data/Processed/Highways_England/A11-6310-1_Southbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A11-6312-2_Northbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A14-1107A_Eastbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A14-1144B_Westbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A1M-9842B_Southbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A1M-9847a_Northbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A46-7636-1_Northbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A46-7636-2_Southbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A47-6337-1_Westbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A47-6337-2_Eastbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A5-6847-2_Southbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A5-7572-1-Northbound_2019_Processed.csv
Reading Data/Processed/Highways_Englan

In [6]:
# Same loading procedure as for the Highways England files, applied to the Portland files
# (the next cell does the same for Utah)
portland_df_list = []

for csv_path in sorted(glob.glob("Data/Processed/Portland/*.csv")):
    print("Reading {}".format(csv_path))
    portland_df_list.append(pd.read_csv(csv_path).sort_values(by="timestamp"))

Reading Data/Processed/Portland/I205-101068_Northbound_2019_Processed.csv
Reading Data/Processed/Portland/I205-101073_Southbound_2019_Processed.csv
Reading Data/Processed/Portland/I405-100395_Southbound_2019_Processed.csv
Reading Data/Processed/Portland/I405-100527_Northbound_2019_Processed.csv
Reading Data/Processed/Portland/I5-100688_Southbound_2019_Processed.csv
Reading Data/Processed/Portland/I5-100703_Northbound_2019_Processed.csv
Reading Data/Processed/Portland/I84-101108_Eastbound_2019_Processed.csv
Reading Data/Processed/Portland/I84-101161_Westbound_2019_Processed.csv
Reading Data/Processed/Portland/OR217-100300_Southbound_2019_Processed.csv
Reading Data/Processed/Portland/OR217-100314_Northbound_2019_Processed.csv
Reading Data/Processed/Portland/R2 Delta Hwy-101745_Northbound_2019_Processed.csv
Reading Data/Processed/Portland/R2 OR18-102111_Westbound_2019_Processed.csv
Reading Data/Processed/Portland/R2 OR18-102113_Eastbound_2019_Processed.csv
Reading Data/Processed/Portland/

In [7]:
# Load the Utah files in alphabetical order, each sorted by timestamp
utah_df_list = []

for csv_path in sorted(glob.glob("Data/Processed/Utah/*.csv")):
    print("Reading {}".format(csv_path))
    utah_df_list.append(pd.read_csv(csv_path).sort_values(by="timestamp"))

Reading Data/Processed/Utah/I15-3103178_Southbound_2019_Processed.csv
Reading Data/Processed/Utah/I15-749_Northbound_2019_Processed.csv
Reading Data/Processed/Utah/I215-134_Counterclockwise_2019_Processed.csv
Reading Data/Processed/Utah/I215-31_Clockwise_2019_Processed.csv
Reading Data/Processed/Utah/I70-3103400_Westbound_2019_Processed.csv
Reading Data/Processed/Utah/I70-3103401_Eastbound_2019_Processed.csv
Reading Data/Processed/Utah/I80-600_Eastbound_2019_Processed.csv
Reading Data/Processed/Utah/I80-667_Westbound_2019_Processed.csv
Reading Data/Processed/Utah/I84-451_Eastbound_2019_Processed.csv
Reading Data/Processed/Utah/I84-482_Westbound_2019_Processed.csv
Reading Data/Processed/Utah/LegacyParkway-810_Northbound_2019_Processed.csv
Reading Data/Processed/Utah/LegacyParkway-890_Southbound_2019_Processed.csv
Reading Data/Processed/Utah/US189-260_Westbound_2019_Processed.csv
Reading Data/Processed/Utah/US189-470_Eastbound_2019_Processed.csv
Reading Data/Processed/Utah/US40-634_Westb

In [8]:
# Append all df lists together into one
# The order (England, Portland, Utah) matters: a later cell pairs each dataframe with a row
# of start_end by its position in this list
total_df_list = england_df_list + portland_df_list + utah_df_list

In [9]:
# Read in the start and end points csv, and subtract 1 from "start" to deal with index differences between R and python
start_end = pd.read_csv("start_end_points.csv")
start_end["start"] = start_end["start"] - 1
# "end" is left unchanged (this assignment is a no-op) - presumably because the value is later
# used as an exclusive slice bound; confirm against how start_end_points.csv was produced
start_end["end"] = start_end["end"]

In [10]:
# Container for the subset data frames (those with only 12 weeks of data per highway)
subset_df_list = []

In [11]:
# Row-number cut-offs for the splits: rn below the first is "train", below the second is "val",
# everything else is "test" (presumably 96 observations per day x 7 days x 8 / 10 weeks - confirm)
train_end_rn = 96*7*8
val_end_rn = 96*7*10

# Walk through the original dataframes together with their position in the list
for idx, df in enumerate(total_df_list):

    # The first two columns of start_end give the row window to keep for this dataframe
    row_start = start_end.iloc[idx, 0]
    row_stop = start_end.iloc[idx, 1]

    # Keep only that window and expose the fresh 0-based row number as column "rn"
    subset_df = (
        df.iloc[row_start:row_stop]
        .reset_index(drop=True)
        .reset_index(drop=False)
        .rename(columns={"index": "rn"})
    )

    # Label every row with the split it belongs to
    in_train = subset_df["rn"] < train_end_rn
    in_train_or_val = subset_df["rn"] < val_end_rn
    subset_df["train_val_test"] = np.where(in_train,
                                           "train",
                                           np.where(in_train_or_val, "val", "test"))

    # Store the labelled subset
    subset_df_list.append(subset_df)

In [12]:
# Build a list of slimmed-down data frames holding only the fields needed for modeling:
# the timestamp (renamed to "start"), the volume (renamed to "target"),
# and the train_val_test assignment
model_df_list = [
    subset[['timestamp', 'total_volume', "train_val_test"]]
    .rename(columns={'timestamp':'start', 'total_volume':'target'})
    for subset in subset_df_list
]

# Helper Function

In [13]:
# Code for progress bar:
# https://stackoverflow.com/questions/24983493/tracking-progress-of-joblib-parallel-execution
# This allows us to print a progress bar while running parallel loops using joblib 

@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Context manager to patch joblib to report into tqdm progress bar given as argument.

    While the context is active, joblib.parallel.BatchCompletionCallBack is replaced by a
    subclass that advances `tqdm_object` by the batch size each time it is called. On exit
    (also when the body raises) the original callback class is restored and the bar is closed.

    Yields:
        The same `tqdm_object` that was passed in.
    """
    class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
        def __call__(self, *args, **kwargs):
            # Advance the bar first, then defer to joblib's own callback
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    # Remember the original class so it can be restored in the finally block
    old_batch_callback = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
    try:
        yield tqdm_object
    finally:
        joblib.parallel.BatchCompletionCallBack = old_batch_callback
        tqdm_object.close()

## Create Lag Embedded Matrices for each TS

In [14]:
# Set the final lag value to be used for all lag embedding
# (the number of lagged target columns created per series, and the width of every X matrix below)
lag_n = 840

In [15]:
# Create an empty list to save lag embedded data into
lag_embed_df_list = list()

# For each data frame
for df in model_df_list:
    # Build every lag column (target-1 ... target-lag_n) up front and attach them in one concat.
    # Inserting the columns one at a time fragments the frame and triggers a pandas
    # PerformanceWarning; the resulting column names and order are the same either way.
    lag_cols = pd.concat(
        [df['target'].shift(n).rename(f"target-{n}") for n in range(1, (lag_n+1))],
        axis=1
    )
    # Append the lag embedded df (original columns followed by the lag columns) to the list
    lag_embed_df_list.append(pd.concat([df, lag_cols], axis=1))

  df[name] = df['target'].shift(n)


In [16]:
# Split every lag embedded data frame into its train, val, and test portions

# One list per split
train_df_list, val_df_list, test_df_list = [], [], []

# ts_index starts at 1 (not 0) because it is joined with cluster data later, and that data
# uses R-style indexing
for ts_index, embedded_df in enumerate(lag_embed_df_list, start=1):

    # Work on a copy so the data frame stored in the list is left untouched
    df = embedded_df.copy()
    df['ts_index'] = ts_index

    # Carve out each split using the train_val_test field
    train_df = df.query("train_val_test == 'train'").copy()
    val_df = df.query("train_val_test=='val'").copy()
    test_df = df.query("train_val_test=='test'").copy()

    # Store each piece in its list
    train_df_list.append(train_df)
    val_df_list.append(val_df)
    test_df_list.append(test_df)

In [17]:
# Concat all dfs from the lists together to create one full train, val, and test df
# The original row indexes are kept (no ignore_index), so index labels repeat across series
train_df_full = pd.concat(train_df_list)
val_df_full = pd.concat(val_df_list)
test_df_full = pd.concat(test_df_list)

In [18]:
# Remove the columns that are not model inputs; afterwards column 0 of each frame is the
# target, followed by the lag columns and ts_index
for split_df in (train_df_full, val_df_full, test_df_full):
    split_df.drop(columns=['start', 'train_val_test'], inplace=True)

In [19]:
# Append the training and validation data together for later use
# pd.concat replaces DataFrame.append, which was deprecated and then removed in pandas 2.0;
# the result (train rows followed by val rows, indexes kept) is the same
train_val_df_full = pd.concat([train_df_full, val_df_full])

In [20]:
# Drop the references to intermediate lists that are no longer needed, to free up memory
del (
    train_df_list,
    val_df_list,
    test_df_list,
    lag_embed_df_list,
    model_df_list,
    subset_df_list,
    total_df_list,
    england_df_list,
    portland_df_list,
    utah_df_list,
)

In [21]:
# Garbage collect - run a full collection now that the intermediate lists have been deleted
gc.collect()

0

# Full Data Set

In [22]:
# Build the X and y frames for training and validation
# Column 0 is the target (y) and the following lag_n columns are the lagged values (X)
# For training, rows containing nulls (the first lag_n rows of each series, which have no
# complete lag history) are removed with dropna. Validation needs no such step as it has no nulls
train_complete_rows = train_df_full.iloc[:, 0:(lag_n+1)].dropna()
X_train_full = train_complete_rows.iloc[:, 1:]
y_train_full = train_complete_rows.iloc[:, 0]
# The temporary name is not needed once X and y have been taken from it
del train_complete_rows

X_val_full = val_df_full.iloc[:, 1:(lag_n+1)]
y_val_full = val_df_full.iloc[:, 0]

In [23]:
# Define a function to optimize a light gbm model using Bayesian optimization

def optimize_lgbm_w_bayes(X_train, y_train, X_val, y_val):
    """Tune a 'goss' LightGBM regressor with Bayesian optimization.

    Each candidate parameter set is fit on (X_train, y_train) and scored by the scaled RMSE
    (RMSE divided by the mean of y_val) of its predictions on (X_val, y_val).

    Parameters:
        X_train, y_train: training features and target
        X_val, y_val: validation features and target

    Returns:
        dict with the best parameter set found by the optimizer, keyed by the names in the
        search space below (including "lambda_l1" / "lambda_l2"). All values are returned as
        the optimizer produced them, so n_estimators, max_depth, and num_leaves still have to
        be rounded and cast to int by the caller before building a model.
    """

    # Set up the min and max of the parameter space to explore for each parameter
    bayes_param_ss = {
    "n_estimators": (100, 1000),
    "max_depth": (2, 25),
    "lambda_l1": (0, 1),
    "lambda_l2": (0, 1),
    "num_leaves": (10, 150),
    "colsample_bytree": (0.1, 1),
    "learning_rate": (0.00001, 0.5)
    }


    # Define a function to compute validation set predictions
    def val_predict(model, X_val, y_val):
        """Function which takes a trained model and X and y for validation set 
        and returns the scaled rmse for the validation set predictions"""

        # Compute the mean of the target values
        val_mean = np.mean(y_val)

        # Compute predictions with the validation X data frame
        val_preds = model.predict(X_val)

        # Compute validation rmse as sqrt(mse). The `squared=False` argument of
        # mean_squared_error is deprecated in recent scikit-learn releases, while taking
        # the square root works in every version
        val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))

        # Return the rmse scaled by the mean of the target
        return val_rmse/val_mean


    # Define the objective function handed to the Bayesian optimizer
    def lgbm_eval_for_bayes(n_estimators,
                        max_depth,
                        lambda_l1, 
                        lambda_l2,
                        num_leaves,
                        colsample_bytree,
                        learning_rate
                       ):

        """Function which takes in parameter values as inputs and returns a value to be maximized by the
        Bayesian optimizer. In this case, we return -1*validation_nrmse as this allows us to minimize the
        validation nrmse"""

        # Set the proper boosting type
        params = {"boosting_type": "goss"
                 }

        # Set the params dictionary to include all input params
        # For n_estimators, max_depth, and num_leaves, round and cast as int - this is what the lgbm model requires
        params["n_estimators"] = int(round(n_estimators))
        params["max_depth"] = int(round(max_depth))
        # The regularization terms are clipped at 0 so they can never be negative
        params["reg_alpha"] = max(lambda_l1, 0)
        params["reg_lambda"] = max(lambda_l2, 0)
        params["num_leaves"] = int(round(num_leaves))
        params["colsample_bytree"] = colsample_bytree
        params["learning_rate"] = learning_rate

        # Create the model given these params
        mod = LGBMRegressor(**params, random_state=54321)
        # Fit the model to the X and y training data passed to the enclosing function
        mod.fit(X_train, y_train)

        # Score on the validation data passed to the enclosing function. The optimizer
        # maximizes its objective, so the validation nrmse is negated
        return -1*val_predict(mod, X_val, y_val)

    # Create an optimizer object    
    optimizer = BayesianOptimization(lgbm_eval_for_bayes,
                                     bayes_param_ss,
                                     random_state=54321)
    # Maximize the optimizer with 5 random initialization points and 25 further iterations
    optimizer.maximize(init_points=5, n_iter=25)

    # Return the best param set found by the optimizer
    return optimizer.max['params']

In [24]:
# Call the optimizer defined above on the full (un-clustered) training and validation data
# The result is the dict of best params, not a fitted model
bayes_full_model = optimize_lgbm_w_bayes(X_train_full,
                                         y_train_full,
                                         X_val_full,
                                         y_val_full
                                        )

|   iter    |  target   | colsam... | lambda_l1 | lambda_l2 | learni... | max_depth | n_esti... | num_le... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.1318  [0m | [0m 0.9205  [0m | [0m 0.6238  [0m | [0m 0.7918  [0m | [0m 0.2149  [0m | [0m 14.49   [0m | [0m 472.1   [0m | [0m 21.99   [0m |
| [0m 2       [0m | [0m-0.1319  [0m | [0m 0.7999  [0m | [0m 0.4889  [0m | [0m 0.05053 [0m | [0m 0.2692  [0m | [0m 2.954   [0m | [0m 827.3   [0m | [0m 149.3   [0m |
| [0m 3       [0m | [0m-0.1428  [0m | [0m 0.6748  [0m | [0m 0.9459  [0m | [0m 0.7367  [0m | [0m 0.411   [0m | [0m 4.633   [0m | [0m 761.5   [0m | [0m 125.0   [0m |
| [0m 4       [0m | [0m-0.1558  [0m | [0m 0.1665  [0m | [0m 0.2079  [0m | [0m 0.8057  [0m | [0m 0.4572  [0m | [0m 5.424   [0m | [0m 962.3   [0m | [0m 62.04   [0m |
| [0m 5       [0m | [0m-0.1387  [0m | [0m 0.746

In [25]:
# Inspect the params found by the optimizer
bayes_full_model

{'colsample_bytree': 0.5726978820897191,
 'lambda_l1': 0.514735973118135,
 'lambda_l2': 0.34808264544051104,
 'learning_rate': 0.017905362936323412,
 'max_depth': 6.203116045410376,
 'n_estimators': 946.4600324511604,
 'num_leaves': 100.383333212035}

In [26]:
# These params must be integers for the model, so round each one to the nearest int in place
for int_param in ("max_depth", "n_estimators", "num_leaves"):
    bayes_full_model[int_param] = int(round(bayes_full_model[int_param]))

In [27]:
# Create a model using the params found by the optimizer
# The dict is unpacked as keyword arguments, so lambda_l1 / lambda_l2 are passed under those names
lgbm_full_model_bayes = LGBMRegressor(boosting_type="goss", **bayes_full_model, random_state=54321)  

In [28]:
# Build X and y from the combined training and validation data; the final model is fit on these
# Rows containing nulls in the target or lag columns are removed first
train_val_complete_rows = train_val_df_full.iloc[:, 0:(lag_n+1)].dropna()
X_train_val_full = train_val_complete_rows.iloc[:, 1:]
y_train_val_full = train_val_complete_rows.iloc[:, 0]
# The temporary name is not needed once X and y have been taken from it
del train_val_complete_rows

In [29]:
# Fit the model on the combined training + validation data
lgbm_full_model_bayes.fit(X_train_val_full, y_train_val_full)



LGBMRegressor(boosting_type='goss', colsample_bytree=0.5726978820897191,
              lambda_l1=0.514735973118135, lambda_l2=0.34808264544051104,
              learning_rate=0.017905362936323412, max_depth=6, n_estimators=946,
              num_leaves=100, random_state=54321)

In [30]:
# Save model to file to use later (joblib.dump returns the list of file names written)
filename = 'Results/Global/LightGBM Bayes/Full/model'
joblib.dump(lgbm_full_model_bayes, filename)

['Results/Global/LightGBM Bayes/Full/model']

In [31]:
# Load the model from file using joblib.load
# NOTE: joblib.load unpickles the file, so it should only be used on files from a trusted source
lgbm_full_model_bayes = joblib.load("Results/Global/LightGBM Bayes/Full/model")

In [32]:
# Define a function to compute model residuals
def compute_lgbm_residuals(mod, X, y):
    """Return the residuals (y minus the model's predictions for X) as a plain Python list.

    mod is a fitted model exposing .predict, X is the feature data to predict from, and y is
    the matching target (a pandas object, since .to_list() is called on the difference)."""

    # Predictions for the supplied features
    fitted_values = mod.predict(X)

    # Residual = observed - predicted, handed back as a list
    return (y - fitted_values).to_list()

In [33]:
# Compute model residuals using above function
# These are in-sample residuals: the model was fit on this same train + validation data
lgbm_full_model_bayes_residuals = compute_lgbm_residuals(lgbm_full_model_bayes, 
                                                         X_train_val_full,
                                                         y_train_val_full
                                                        )

In [34]:
# Wrap the list of residuals in a single-column data frame so it can be written to csv
full_mod_resid_df = pd.DataFrame({"residual": lgbm_full_model_bayes_residuals})

In [35]:
# Save the residuals to file (without the index column)
full_mod_resid_df.to_csv("Results/Global/LightGBM Bayes/Full/residual.csv", index=False)

In [36]:
# Function to compute test preds
def compute_lgbm_test_preds(mod, data, lag_n):
    """Compute predictions for every time series in a test data frame.

    Parameters:
        mod: trained model exposing .predict
        data: data frame whose first column is the target, followed by the lag columns, and
            which has a ts_index column identifying the time series each row belongs to
        lag_n: number of lag columns used for lag embedding

    Returns:
        Data frame with columns "ts_index" and "test_preds", with the predictions for each
        time series stacked in order of first appearance of its ts_index. Each series keeps
        its own 0-based index. If data has no rows, an empty frame with these two columns
        is returned.
    """

    # Collect one small data frame per time series and concatenate once at the end.
    # (DataFrame.append, used previously, was removed in pandas 2.0 and copied the
    # accumulated frame on every iteration.)
    pred_frames = []

    # Loop through each time series index in the data set
    for ts_idx in data.ts_index.unique():
        # For each time series index, grab X by eliminating the first column and any columns past (lag_n+1)
        X = data.query("ts_index==@ts_idx").iloc[:,1:(lag_n+1)].copy()
        # Compute model preds from X
        preds = mod.predict(X)

        # Save the preds, along with the time series index, to a temp data frame
        pred_frames.append(pd.DataFrame({"ts_index": ts_idx, "test_preds": preds}))

    # pd.concat raises on an empty list, so handle the no-data case explicitly
    if not pred_frames:
        return pd.DataFrame(columns=["ts_index", "test_preds"])

    # Return the full data frame of test set predictions
    return pd.concat(pred_frames)

In [37]:
# Compute test set predictions (one row per test observation) using the above function
lgbm_full_model_bayes_test_preds = compute_lgbm_test_preds(lgbm_full_model_bayes,
                                                           test_df_full,
                                                           lag_n
                                                          )

In [38]:
# Function to compute test prediction performance metrics
def compute_lgbm_test_perf(preds, data):
    """Compute per-time-series performance of test set predictions.

    Parameters:
        preds: data frame of predictions with columns "ts_index" and "test_preds"
        data: test data frame whose first column is the true target and which has a
            ts_index column

    Returns:
        Data frame with one row per ts_index found in data (in order of first appearance)
        and columns "rmse", "mae", and "mean" (the mean of the true target values).
    """

    # Create an empty list to store performance data
    perf_ls = list()

    # Loop through the time series indexes in our data
    for ts_idx in data.ts_index.unique():
        # For each time series index
        # Extract the true target value (first column of the data frame)
        y_sub = data.query("ts_index==@ts_idx").iloc[:,0]
        # Extract the preds for that ts_idx
        preds_sub = preds.query("ts_index==@ts_idx").test_preds

        # Compute rmse as sqrt(mse): the `squared=False` argument of mean_squared_error is
        # deprecated in recent scikit-learn releases, while the square root works everywhere
        rmse_sub = np.sqrt(mean_squared_error(y_sub, preds_sub))
        mae_sub = mean_absolute_error(y_sub, preds_sub)
        mean_sub = np.mean(y_sub)

        # Append a dictionary holding these metrics to the list
        perf_ls.append({"rmse": rmse_sub, "mae": mae_sub, "mean": mean_sub})

    # Build the performance df from the list of dictionaries. Naming the columns explicitly
    # keeps them present even when there are no time series to score
    return pd.DataFrame(perf_ls, columns=["rmse", "mae", "mean"])

In [39]:
# Compute test set performance (one row of rmse / mae / mean per time series)
lgbm_full_model_bayes_test_perf_df = compute_lgbm_test_perf(lgbm_full_model_bayes_test_preds, test_df_full)

In [40]:
# Compute normalized/scaled performance metrics as well
# Each metric is divided by the mean of that time series' true target values
lgbm_full_model_bayes_test_perf_df['nrmse'] = lgbm_full_model_bayes_test_perf_df['rmse']/lgbm_full_model_bayes_test_perf_df['mean']
lgbm_full_model_bayes_test_perf_df['smae'] = lgbm_full_model_bayes_test_perf_df['mae']/lgbm_full_model_bayes_test_perf_df['mean']

In [41]:
# Display the means of the performance metrics, averaged across time series
lgbm_full_model_bayes_test_perf_df.mean()

rmse      30.603133
mae       20.185643
mean     265.435072
nrmse      0.138456
smae       0.093232
dtype: float64

In [42]:
# Function to compute bootstrap pred intervals
def compute_lgbm_boostrap_int(preds, resid, n_boot):
    """Attach bootstrap prediction intervals to a data frame of predictions.

    preds is a data frame with a 'test_preds' column, resid is a list of model residuals, and
    n_boot is the number of bootstrap resamples drawn per prediction. A copy of preds with a
    fresh 0-based index is returned, extended with the columns lo_95, hi_95, lo_80, and hi_80
    holding the bounds of the 95% and 80% PIs."""

    # Re-seed both global random number generators on every call
    random.seed(54321)
    np.random.seed(54321)

    def percentile_sample(row):
        """Bootstrap sample the residuals, add them to the row's predicted value, and return
        the (lo_95, hi_95, lo_80, hi_80) percentiles. Written to be applied to the rows of
        the preds data frame"""

        # Draw n_boot residuals with replacement
        resampled_resid = np.random.choice(resid, size=n_boot, replace=True)

        # Shift the resampled residuals by the predicted value
        simulated = row['test_preds'] + resampled_resid

        # 2.5 / 97.5 bound the 95% PI and 10 / 90 bound the 80% PI
        return tuple(np.percentile(simulated, pct) for pct in (2.5, 97.5, 10, 90))

    # A clean 0..n-1 index is needed for swifter apply to work properly
    preds = preds.reset_index(drop=True)

    # One tuple of bounds per row, unzipped into the four new columns
    bounds_per_row = preds.swifter.apply(percentile_sample, axis=1)
    preds['lo_95'], preds['hi_95'], preds['lo_80'], preds['hi_80'] = zip(*bounds_per_row)

    # Hand back the extended data frame
    return preds

In [43]:
# Set n_boot (the number of bootstrap resamples drawn per prediction) to 1000
n_boot = 1000

In [44]:
# Compute the prediction intervals
lgbm_full_model_bayes_test_pred_int = compute_lgbm_boostrap_int(lgbm_full_model_bayes_test_preds,
                                                                lgbm_full_model_bayes_residuals,
                                                                n_boot)

Pandas Apply:   0%|          | 0/102144 [00:00<?, ?it/s]

In [45]:
# Sanity check shape of output: ts_index, test_preds, and the four PI bound columns
lgbm_full_model_bayes_test_pred_int.shape

(102144, 6)

In [46]:
# Add the true target values as a column to the PI data frame
# The values are assigned by position, so this assumes the PI rows are in the same order as
# the rows of test_df_full - TODO confirm
lgbm_full_model_bayes_test_pred_int['actual'] = test_df_full.iloc[:,0].to_list()

In [47]:
# Display the first rows as a sanity check
lgbm_full_model_bayes_test_pred_int.head()

Unnamed: 0,ts_index,test_preds,lo_95,hi_95,lo_80,hi_80,actual
0,1,329.603418,276.491873,387.566262,301.935275,358.900685,320.0
1,1,327.388077,266.698468,396.614923,296.641381,357.60619,339.0
2,1,341.182653,280.88633,397.902035,312.706347,367.273697,349.0
3,1,351.206502,285.292677,416.952564,322.111895,381.659577,343.0
4,1,342.69946,282.471138,400.051979,314.355361,372.743081,343.0


In [48]:
# List the distinct time series indexes present in the PI data frame
lgbm_full_model_bayes_test_pred_int.ts_index.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       69, 70, 71, 72, 73, 74, 75, 76])

In [49]:
# Create a function to compute the interval score
def interval_score(true_values, lower, upper, interval_range):
    """Compute the interval score of each prediction interval.

    Parameters:
        true_values: the observed values
        lower: lower bounds of the PIs
        upper: upper bounds of the PIs
        interval_range: the PI level as a fraction (e.g., 0.95 for a 95% PI)

    Returns:
        numpy array with one score per observation: the interval width, plus a penalty of
        (2/alpha) times the distance by which the true value falls outside the interval,
        where alpha = 1 - interval_range.
        Formula: https://epiforecasts.io/scoringutils/reference/interval_score.html
    """

    # Compute alpha from the interval range
    alpha = 1-interval_range

    # Save the upper, lower, and true_values as numpy arrays for computation purposes
    upper = np.array(upper)
    lower = np.array(lower)
    true_values = np.array(true_values)

    # Width of each interval
    width = upper - lower

    # Penalties for observations outside the interval; the boolean factors zero them out
    # for observations that are not below / not above the interval
    below_penalty = (2/alpha)*(lower-true_values)*(lower > true_values)
    above_penalty = (2/alpha)*(true_values-upper)*(true_values > upper)

    # Return the scores array
    return width + below_penalty + above_penalty

In [50]:
# Compute the 80% and 95% PI scores for each prediction
# Each call pairs a set of bounds with its PI level expressed as a fraction
lgbm_full_model_bayes_test_pred_int['int_95_score'] = interval_score(lgbm_full_model_bayes_test_pred_int.actual, 
                                                                     lgbm_full_model_bayes_test_pred_int.lo_95,
                                                                     lgbm_full_model_bayes_test_pred_int.hi_95,
                                                                     0.95)
                                                    
lgbm_full_model_bayes_test_pred_int['int_80_score'] = interval_score(lgbm_full_model_bayes_test_pred_int.actual, 
                                                                     lgbm_full_model_bayes_test_pred_int.lo_80,
                                                                     lgbm_full_model_bayes_test_pred_int.hi_80,
                                                                     0.80)

In [51]:
# Display the mean of every column; the int_95_score / int_80_score entries are the mean PI scores
lgbm_full_model_bayes_test_pred_int.mean()

ts_index         38.500000
test_preds      265.750378
lo_95           205.259919
hi_95           328.566850
lo_80           237.130444
hi_80           295.344232
actual          265.435072
int_95_score    225.436875
int_80_score    121.707257
dtype: float64

In [52]:
# Average the PI scores and the actual values within each time series
lgbm_full_model_bayes_test_pred_int_grouped = lgbm_full_model_bayes_test_pred_int.groupby("ts_index")\
.agg({"int_95_score":"mean", "int_80_score":"mean", "actual":"mean"}).reset_index()

# Scale each series' mean PI score by the mean of its actual values
lgbm_full_model_bayes_test_pred_int_grouped['int_95_score_scaled'] = lgbm_full_model_bayes_test_pred_int_grouped['int_95_score']/lgbm_full_model_bayes_test_pred_int_grouped['actual']
lgbm_full_model_bayes_test_pred_int_grouped['int_80_score_scaled'] = lgbm_full_model_bayes_test_pred_int_grouped['int_80_score']/lgbm_full_model_bayes_test_pred_int_grouped['actual']

In [53]:
# Display the scaled PI scores averaged across time series
lgbm_full_model_bayes_test_pred_int_grouped[['int_80_score_scaled', 'int_95_score_scaled']].mean()

int_80_score_scaled    0.604982
int_95_score_scaled    1.122800
dtype: float64

In [54]:
# Save the PI data frame (predictions, bounds, actuals, and per-row scores) to a csv file
lgbm_full_model_bayes_test_pred_int.to_csv("Results/Global/LightGBM Bayes/Full/test_pred_intervals.csv", index=False)

# Train and Test - Random Clusters

In [55]:
# Drop the references to the full-data-set objects, which are not used past this point
del (
    lgbm_full_model_bayes_test_pred_int,
    lgbm_full_model_bayes_test_perf_df,
    lgbm_full_model_bayes_test_preds,
    lgbm_full_model_bayes_residuals,
    lgbm_full_model_bayes,
    X_train_val_full,
    y_train_val_full,
    X_val_full,
    y_val_full,
    X_train_full,
    y_train_full,
)

In [56]:
# Garbage collect - run a full collection now that the full-model objects have been deleted
gc.collect()

143

In [57]:
# Read in cluster data for random clusters, and copy the assignments into a column named 'cluster'
# (the original 'random_clust_assign' column is kept alongside it)
rand_clust = pd.read_csv("Results/Clustering/Random/random_clustering_assign.csv")
rand_clust['cluster'] = rand_clust['random_clust_assign']

In [58]:
# Merge train and val data frames with cluster assignments
# The join key is ts_index; the cluster columns are added after the existing columns
train_df_rand_clust = train_df_full.merge(rand_clust, on="ts_index")
val_df_rand_clust = val_df_full.merge(rand_clust, on="ts_index")

In [59]:
# Create a list of data frames which only contain data for each cluster. Do this for both
# training and validation data
# groupby iterates the clusters in sorted order, so position i in both lists is the same cluster
# (assuming every cluster appears in both the training and the validation data)
train_df_rand_clust_ls = [df.reset_index(drop=True) for _,df in train_df_rand_clust.groupby("cluster")]
val_df_rand_clust_ls = [df.reset_index(drop=True) for _,df in val_df_rand_clust.groupby("cluster")]

In [60]:
# Loop through the list of training and validation data frames in a parallel fashion and run the Bayesian
# optimization function for each cluster's data in parallel
# Save the best params for each cluster to a list
# Note that in the function call, we are subsetting the data frames to X and y data frames instead of doing
# this beforehand like was done with the full model above
# The arguments are, in order: X train, y train, X val, y val for cluster i
with tqdm_joblib(tqdm(desc="Random Cluster LGBM Models Bayes", 
                      total=len(train_df_rand_clust_ls))) as progress_bar:
    rand_clust_mods_bayes = Parallel(n_jobs=4)(delayed(optimize_lgbm_w_bayes)(train_df_rand_clust_ls[i].iloc[:,0:(lag_n+1)].dropna().iloc[:,1:], 
                                                                              train_df_rand_clust_ls[i].iloc[:,0:(lag_n+1)].dropna().iloc[:,0],  
                                                                              val_df_rand_clust_ls[i].iloc[:,1:(lag_n+1)],
                                                                              val_df_rand_clust_ls[i].iloc[:,0]) for i in range(len(train_df_rand_clust_ls)))

Random Cluster LGBM Models Bayes:  50%|█████     | 2/4 [19:55<17:41, 530.82s/it]

|   iter    |  target   | colsam... | lambda_l1 | lambda_l2 | learni... | max_depth | n_esti... | num_le... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.1505  [0m | [0m 0.9205  [0m | [0m 0.6238  [0m | [0m 0.7918  [0m | [0m 0.2149  [0m | [0m 14.49   [0m | [0m 472.1   [0m | [0m 21.99   [0m |
| [0m 2       [0m | [0m-0.1518  [0m | [0m 0.7999  [0m | [0m 0.4889  [0m | [0m 0.05053 [0m | [0m 0.2692  [0m | [0m 2.954   [0m | [0m 827.3   [0m | [0m 149.3   [0m |
| [0m 3       [0m | [0m-0.1711  [0m | [0m 0.6748  [0m | [0m 0.9459  [0m | [0m 0.7367  [0m | [0m 0.411   [0m | [0m 4.633   [0m | [0m 761.5   [0m | [0m 125.0   [0m |
| [0m 4       [0m | [0m-0.1966  [0m | [0m 0.1665  [0m | [0m 0.2079  [0m | [0m 0.8057  [0m | [0m 0.4572  [0m | [0m 5.424   [0m | [0m 962.3   [0m | [0m 62.04   [0m |
| [0m 5       [0m | [0m-0.1662  [0m | [0m 0.746

Random Cluster LGBM Models Bayes: 100%|██████████| 4/4 [22:35<00:00, 338.80s/it]


In [61]:
# LGBM models require these params to be integers, so for every cluster's param set
# round each of them to the nearest int in place
for cluster_params in rand_clust_mods_bayes:
    for int_param in ("max_depth", "n_estimators", "num_leaves"):
        cluster_params[int_param] = int(round(cluster_params[int_param]))

In [62]:
# Merge the train_val data frame with cluster assignments (joined on ts_index)
train_val_df_rand = train_val_df_full.merge(rand_clust, on="ts_index")
# Create a list of smaller data frames which contain data each from one cluster
train_val_df_rand_ls = [df.reset_index(drop=True) for _,df in train_val_df_rand.groupby("cluster")]

In [63]:
# Function to train a light gbm model
def train_lgbm(params, X, y):
    """Fit an LGBMRegressor on X and y and return the fitted model.

    The regressor is always created with the 'goss' boosting type and a fixed random
    state of 54321; every entry of `params` is forwarded to it as a keyword argument.
    """

    # Build the regressor from the fixed settings plus the passed params
    lgbm_model = LGBMRegressor(boosting_type='goss', random_state=54321, **params)
    # Fit the model to the provided data
    lgbm_model.fit(X, y)

    # Hand back the fitted model
    return lgbm_model

In [64]:
# For each set of model params found above, train a final model on that cluster's full train_val data.
# Again, this is done in parallel with the models saved to a list. The trailing single-item `for` binds
# each cluster's lagged, NaN-free frame once, so X (columns 1 onward) and y (column 0) are both sliced
# from it instead of repeating the slice/dropna for each argument
with tqdm_joblib(tqdm(desc="Random Cluster LGBM Models Bayes Final", 
                      total=len(train_val_df_rand_ls))) as progress_bar:
    rand_clust_mods_bayes_final = Parallel(n_jobs=4)(
        delayed(train_lgbm)(rand_clust_mods_bayes[i],
                            lagged_df.iloc[:, 1:],
                            lagged_df.iloc[:, 0])
        for i in range(len(train_val_df_rand_ls))
        for lagged_df in [train_val_df_rand_ls[i].iloc[:, 0:(lag_n+1)].dropna()]
    )

Random Cluster LGBM Models Bayes Final:  25%|█▎   | 1/4 [00:50<02:31, 50.66s/it]

|   iter    |  target   | colsam... | lambda_l1 | lambda_l2 | learni... | max_depth | n_esti... | num_le... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.1224  [0m | [0m 0.9205  [0m | [0m 0.6238  [0m | [0m 0.7918  [0m | [0m 0.2149  [0m | [0m 14.49   [0m | [0m 472.1   [0m | [0m 21.99   [0m |
| [0m 2       [0m | [0m-0.1233  [0m | [0m 0.7999  [0m | [0m 0.4889  [0m | [0m 0.05053 [0m | [0m 0.2692  [0m | [0m 2.954   [0m | [0m 827.3   [0m | [0m 149.3   [0m |
| [0m 3       [0m | [0m-0.1396  [0m | [0m 0.6748  [0m | [0m 0.9459  [0m | [0m 0.7367  [0m | [0m 0.411   [0m | [0m 4.633   [0m | [0m 761.5   [0m | [0m 125.0   [0m |
| [0m 4       [0m | [0m-0.1518  [0m | [0m 0.1665  [0m | [0m 0.2079  [0m | [0m 0.8057  [0m | [0m 0.4572  [0m | [0m 5.424   [0m | [0m 962.3   [0m | [0m 62.04   [0m |
| [0m 5       [0m | [0m-0.1324  [0m | [0m 0.746

Random Cluster LGBM Models Bayes Final: 100%|█████| 4/4 [01:24<00:00, 21.00s/it]


In [65]:
# Write each of the final models trained above to its own file; the file name ends with the
# model's zero-based position in the list
for model_no in range(len(rand_clust_mods_bayes_final)):
    fname = f"Results/Global/LightGBM Bayes/Random Cluster/model_{model_no}"
    joblib.dump(rand_clust_mods_bayes_final[model_no], fname)

In [66]:
# Reset the list of final models so it can be rebuilt from the files on disk
rand_clust_mods_bayes_final = list()

# Load one saved model per cluster data frame from its file and append it to the list
for model_no in range(len(train_val_df_rand_ls)):
    fname = f"Results/Global/LightGBM Bayes/Random Cluster/model_{model_no}"
    rand_clust_mods_bayes_final.append(joblib.load(fname))

In [67]:
# For each of the above models, compute the residuals. Loop, in parallel, through the list of models
# and collect what compute_lgbm_residuals returns for each one in a list. The trailing single-item
# `for` binds each cluster's lagged, NaN-free train_val frame once, so X (columns 1 onward) and
# y (column 0) are both sliced from it instead of repeating the slice/dropna for each argument
with tqdm_joblib(tqdm(desc="Random Cluster LGBM Models Bayes Residuals", 
                      total=len(rand_clust_mods_bayes_final))) as progress_bar:
    rand_clust_mods_bayes_resid = Parallel(n_jobs=4)(
        delayed(compute_lgbm_residuals)(rand_clust_mods_bayes_final[i],
                                        lagged_df.iloc[:, 1:],
                                        lagged_df.iloc[:, 0])
        for i in range(len(rand_clust_mods_bayes_final))
        for lagged_df in [train_val_df_rand_ls[i].iloc[:, 0:(lag_n+1)].dropna()]
    )

Random Cluster LGBM Models Bayes Residuals: 100%|█| 4/4 [00:11<00:00,  2.96s/it]


In [68]:
# Check how many residuals were returned for the fourth model (list index 3)
len(rand_clust_mods_bayes_resid[3])

111720

In [69]:
# Build a data frame with one row per cluster: a 1-based cluster number and that cluster's
# residuals stored as a single cell. The cluster numbers and residuals are taken directly from
# the list instead of building the same throwaway dict twice to read its keys and values
rand_clust_res_df = pd.DataFrame({
    'cluster': list(range(1, len(rand_clust_mods_bayes_resid) + 1)),
    'residual': list(rand_clust_mods_bayes_resid),
})

rand_clust_res_df.head()

Unnamed: 0,cluster,residual
0,1,"[4.236943301123034, 9.797984935874439, 3.33155..."
1,2,"[-19.96974756315297, 2.609522412481862, 8.9609..."
2,3,"[-12.854841077574974, -22.138411981257377, -36..."
3,4,"[-6.423664164266484, 3.396377552591389, -51.33..."


In [70]:
# Save the per-cluster residuals data frame to a csv file, leaving out the index column
rand_clust_res_df.to_csv("Results/Global/LightGBM Bayes/Random Cluster/residual.csv", index=False)

In [71]:
# Join the cluster assignments onto the test data using the ts_index column
test_df_full_rand = pd.merge(test_df_full, rand_clust, on="ts_index")
# Break the merged test data into a list of re-indexed data frames, one per cluster
test_df_full_rand_ls = [
    clust_df.reset_index(drop=True)
    for _clust_id, clust_df in test_df_full_rand.groupby("cluster")
]

In [72]:
# In parallel, pair every model with its cluster's test data frame, compute the test preds as a
# data frame, and gather those data frames in a list
with tqdm_joblib(tqdm(desc="Random Cluster LGBM Models Bayes Test Preds", 
                      total=len(rand_clust_mods_bayes_final))) as progress_bar:
    rand_clust_mods_bayes_test_preds = Parallel(n_jobs=4)(
        delayed(compute_lgbm_test_preds)(rand_clust_mods_bayes_final[clust_idx],
                                         test_df_full_rand_ls[clust_idx],
                                         lag_n)
        for clust_idx in range(len(rand_clust_mods_bayes_final))
    )

Random Cluster LGBM Models Bayes Test Preds: 100%|█| 4/4 [00:02<00:00,  1.47it/s


In [73]:
# Save all the above created data frames of test preds into one data frame
rand_clust_bayes_test_preds_df = pd.concat(rand_clust_mods_bayes_test_preds)

In [74]:
# Using this one data frame of test preds, compute prediction performance
rand_clust_bayes_test_perf = compute_lgbm_test_perf(rand_clust_bayes_test_preds_df,
                                                    test_df_full_rand)

In [75]:
# Add scaled performance metrics to the data frame: each error metric divided by the 'mean' column
for scaled_col, raw_col in (('nrmse', 'rmse'), ('smae', 'mae')):
    rand_clust_bayes_test_perf[scaled_col] = rand_clust_bayes_test_perf[raw_col]/rand_clust_bayes_test_perf['mean']

In [76]:
# Print the means of prediction performance metrics
rand_clust_bayes_test_perf.mean()

rmse      31.254831
mae       20.767924
mean     265.435072
nrmse      0.142022
smae       0.096263
dtype: float64

In [77]:
# Create an empty list to save PI data frames
rand_clust_test_pred_int = list()
# Loop through the list of prediction data frames; position i in the preds list and in the
# residuals list both come from the model at position i in the list of final models
for i in range(len(rand_clust_mods_bayes_test_preds)):
    # For each one, compute bootstrap PIs from that model's preds and residuals using n_boot,
    # and save the returned data frame to the above list
    rand_clust_test_pred_int.append(compute_lgbm_boostrap_int(rand_clust_mods_bayes_test_preds[i], 
                                                              rand_clust_mods_bayes_resid[i], 
                                                              n_boot))

Pandas Apply:   0%|          | 0/25536 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/25536 [00:00<?, ?it/s]

|   iter    |  target   | colsam... | lambda_l1 | lambda_l2 | learni... | max_depth | n_esti... | num_le... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.1444  [0m | [0m 0.9205  [0m | [0m 0.6238  [0m | [0m 0.7918  [0m | [0m 0.2149  [0m | [0m 14.49   [0m | [0m 472.1   [0m | [0m 21.99   [0m |
| [0m 2       [0m | [0m-0.1475  [0m | [0m 0.7999  [0m | [0m 0.4889  [0m | [0m 0.05053 [0m | [0m 0.2692  [0m | [0m 2.954   [0m | [0m 827.3   [0m | [0m 149.3   [0m |
| [0m 3       [0m | [0m-0.1705  [0m | [0m 0.6748  [0m | [0m 0.9459  [0m | [0m 0.7367  [0m | [0m 0.411   [0m | [0m 4.633   [0m | [0m 761.5   [0m | [0m 125.0   [0m |
| [0m 4       [0m | [0m-0.1831  [0m | [0m 0.1665  [0m | [0m 0.2079  [0m | [0m 0.8057  [0m | [0m 0.4572  [0m | [0m 5.424   [0m | [0m 962.3   [0m | [0m 62.04   [0m |
| [0m 5       [0m | [0m-0.1584  [0m | [0m 0.746



Pandas Apply:   0%|          | 0/25536 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/25536 [00:00<?, ?it/s]

In [78]:
# For each cluster (n is the 1-based position of the cluster in the lists)
for n in range(1, len(rand_clust_test_pred_int)+1):
    # Get the true values for the target (first column) from the same per-cluster test data frame
    # that the preds were computed from. Taking it by list position instead of re-filtering the
    # merged frame with cluster==n keeps this correct even if the cluster labels are not exactly 1..k
    y_actual_sub = test_df_full_rand_ls[n-1].iloc[:,0].to_list()
    # Add those true values as a column to that cluster's PI data frame
    rand_clust_test_pred_int[n-1]['actual'] = y_actual_sub

In [79]:
# Concatenate the prediction interval data frames from each cluster into one data frame
rand_clust_test_pred_int_df = pd.concat(rand_clust_test_pred_int)

In [80]:
# For that one data frame, add a column of PI scores for each prediction: first for the 95%
# interval, then for the 80% interval
for score_col, lo_col, hi_col, pi_level in (('int_95_score', 'lo_95', 'hi_95', 0.95),
                                            ('int_80_score', 'lo_80', 'hi_80', 0.80)):
    rand_clust_test_pred_int_df[score_col] = interval_score(rand_clust_test_pred_int_df['actual'],
                                                            rand_clust_test_pred_int_df[lo_col],
                                                            rand_clust_test_pred_int_df[hi_col],
                                                            pi_level)

In [81]:
# Print the means of those PI scores
rand_clust_test_pred_int_df.mean()

ts_index         38.500000
test_preds      265.709929
lo_95           212.058140
hi_95           321.068173
lo_80           238.401754
hi_80           293.847885
actual          265.435072
int_95_score    233.998928
int_80_score    123.902825
dtype: float64

In [82]:
len(rand_clust_test_pred_int_df.ts_index.unique())

76

In [83]:
# Average the PI scores and the actual values within each ts_index
rand_clust_test_pred_int_df_grouped = (
    rand_clust_test_pred_int_df
    .groupby("ts_index")[['int_95_score', 'int_80_score', 'actual']]
    .mean()
    .reset_index()
)

# Scale each mean PI score by the mean actual value of the same ts_index
for scaled_col, score_col in (('int_95_score_scaled', 'int_95_score'),
                              ('int_80_score_scaled', 'int_80_score')):
    rand_clust_test_pred_int_df_grouped[scaled_col] = rand_clust_test_pred_int_df_grouped[score_col]/rand_clust_test_pred_int_df_grouped['actual']

In [84]:
rand_clust_test_pred_int_df_grouped[['int_80_score_scaled', 'int_95_score_scaled']].mean()

int_80_score_scaled    0.588199
int_95_score_scaled    1.052794
dtype: float64

In [85]:
# Save the PI data frame to a csv file
rand_clust_test_pred_int_df.to_csv("Results/Global/LightGBM Bayes/Random Cluster/test_pred_intervals.csv", 
                                   index=False)

# Train and Test - Highway System

In [86]:
# Delete the random cluster variables which are no longer used
del (
    train_df_rand_clust_ls,
    train_df_rand_clust,
    val_df_rand_clust_ls,
    val_df_rand_clust,
    rand_clust_mods_bayes,
    rand_clust,
    train_val_df_rand,
    train_val_df_rand_ls,
    rand_clust_mods_bayes_final,
    rand_clust_mods_bayes_resid,
    test_df_full_rand,
    test_df_full_rand_ls,
    rand_clust_mods_bayes_test_preds,
    rand_clust_bayes_test_preds_df,
    rand_clust_bayes_test_perf,
    rand_clust_test_pred_int,
    rand_clust_test_pred_int_df,
)

In [87]:
# Garbage collect
gc.collect()

250

In [88]:
# Create cluster assignments for the highway systems based on the no of sensors for each system:
# ts_index runs from 1 through 76, with the first 38 values assigned to cluster 1, the next 19 to
# cluster 2, and the final 19 to cluster 3
highway_clust = pd.DataFrame({"ts_index": np.arange(1, 77),
                                    "cluster": [1]*38 + [2]*19 + [3]*19}
                            )

In [89]:
# Join the highway cluster assignments onto the training, validation, train_val, and test data
# using the ts_index column
train_df_full_highway = pd.merge(train_df_full, highway_clust, on="ts_index")
val_df_full_highway = pd.merge(val_df_full, highway_clust, on="ts_index")
train_val_df_full_highway = pd.merge(train_val_df_full, highway_clust, on="ts_index")
test_df_full_highway = pd.merge(test_df_full, highway_clust, on="ts_index")

In [90]:
# Break the training and validation data frames into lists of re-indexed data frames, each
# holding the data for 1 cluster
train_df_highway_clust_ls = [
    clust_df.reset_index(drop=True)
    for _clust_id, clust_df in train_df_full_highway.groupby("cluster")
]
val_df_highway_clust_ls = [
    clust_df.reset_index(drop=True)
    for _clust_id, clust_df in val_df_full_highway.groupby("cluster")
]

In [91]:
# In parallel, loop through the lists of training and validation data, subset them into X and y, and run
# the Bayesian optimizer. Save what the optimizer returns for each cluster to a list. The trailing
# single-item `for` binds each cluster's lagged, NaN-free training frame once, so the training X
# (columns 1 onward) and y (column 0) are both sliced from it instead of repeating the slice/dropna for
# each argument. The validation frames are sliced as before, without dropping any rows
with tqdm_joblib(tqdm(desc="Highway System LGBM Models Bayes", 
                      total=len(train_df_highway_clust_ls))) as progress_bar:
    highway_clust_mods_bayes = Parallel(n_jobs=3)(
        delayed(optimize_lgbm_w_bayes)(train_lagged_df.iloc[:, 1:],
                                       train_lagged_df.iloc[:, 0],
                                       val_df_highway_clust_ls[i].iloc[:, 1:(lag_n+1)],
                                       val_df_highway_clust_ls[i].iloc[:, 0])
        for i in range(len(train_df_highway_clust_ls))
        for train_lagged_df in [train_df_highway_clust_ls[i].iloc[:, 0:(lag_n+1)].dropna()]
    )

Highway System LGBM Models Bayes:  67%|██████▋   | 2/3 [18:55<08:36, 516.75s/it]

|   iter    |  target   | colsam... | lambda_l1 | lambda_l2 | learni... | max_depth | n_esti... | num_le... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.1667  [0m | [0m 0.9205  [0m | [0m 0.6238  [0m | [0m 0.7918  [0m | [0m 0.2149  [0m | [0m 14.49   [0m | [0m 472.1   [0m | [0m 21.99   [0m |
| [0m 2       [0m | [0m-0.1687  [0m | [0m 0.7999  [0m | [0m 0.4889  [0m | [0m 0.05053 [0m | [0m 0.2692  [0m | [0m 2.954   [0m | [0m 827.3   [0m | [0m 149.3   [0m |
| [0m 3       [0m | [0m-0.1949  [0m | [0m 0.6748  [0m | [0m 0.9459  [0m | [0m 0.7367  [0m | [0m 0.411   [0m | [0m 4.633   [0m | [0m 761.5   [0m | [0m 125.0   [0m |
| [0m 4       [0m | [0m-0.2138  [0m | [0m 0.1665  [0m | [0m 0.2079  [0m | [0m 0.8057  [0m | [0m 0.4572  [0m | [0m 5.424   [0m | [0m 962.3   [0m | [0m 62.04   [0m |
| [0m 5       [0m | [0m-0.186   [0m | [0m 0.746

Highway System LGBM Models Bayes: 100%|██████████| 3/3 [20:03<00:00, 401.25s/it]


In [92]:
# Walk through every set of params in the list and, for the LGBM model params which must be
# integers, replace the value with its rounded value cast to int
for n in range(len(highway_clust_mods_bayes)):
    for int_param in ("max_depth", "n_estimators", "num_leaves"):
        highway_clust_mods_bayes[n][int_param] = int(round(highway_clust_mods_bayes[n][int_param]))

In [93]:
# Break the train_val data into a list of re-indexed data frames, one per cluster
train_val_df_highway_clust_ls = [
    clust_df.reset_index(drop=True)
    for _clust_id, clust_df in train_val_df_full_highway.groupby("cluster")
]

In [94]:
# Using the above list and the list of best model params, loop in parallel across the clusters and create a
# model for each one. Save those models to a list. The trailing single-item `for` binds each cluster's
# lagged, NaN-free train_val frame once, so X (columns 1 onward) and y (column 0) are both sliced from it
# instead of repeating the slice/dropna for each argument
with tqdm_joblib(tqdm(desc="Highway System LGBM Models Bayes Final", 
                      total=len(highway_clust_mods_bayes))) as progress_bar:
    highway_clust_mods_bayes_final = Parallel(n_jobs=3)(
        delayed(train_lgbm)(highway_clust_mods_bayes[i],
                            lagged_df.iloc[:, 1:],
                            lagged_df.iloc[:, 0])
        for i in range(len(highway_clust_mods_bayes))
        for lagged_df in [train_val_df_highway_clust_ls[i].iloc[:, 0:(lag_n+1)].dropna()]
    )

Highway System LGBM Models Bayes Final: 100%|█████| 3/3 [01:14<00:00, 24.97s/it]


In [95]:
# Write each of those models to its own file; the file name ends with the model's zero-based
# position in the list
for model_no in range(len(highway_clust_mods_bayes_final)):
    fname = f"Results/Global/LightGBM Bayes/Highway System/model_{model_no}"
    joblib.dump(highway_clust_mods_bayes_final[model_no], fname)

In [96]:
# Reset the list of final models so it can be rebuilt from the files on disk
highway_clust_mods_bayes_final = list()

# Load one saved model per cluster data frame from its file and append it to the list
for model_no in range(len(train_val_df_highway_clust_ls)):
    fname = f"Results/Global/LightGBM Bayes/Highway System/model_{model_no}"
    highway_clust_mods_bayes_final.append(joblib.load(fname))

In [97]:
# For each of the newly created models, in parallel, loop through the models and the train_val data and
# compute the model residuals. Save the residuals from each model to a list. The trailing single-item
# `for` binds each cluster's lagged, NaN-free frame once, so X (columns 1 onward) and y (column 0) are
# both sliced from it instead of repeating the slice/dropna for each argument
with tqdm_joblib(tqdm(desc="Highway LGBM Models Bayes Residuals", 
                      total=len(highway_clust_mods_bayes_final))) as progress_bar:
    highway_clust_mods_bayes_resid = Parallel(n_jobs=3)(
        delayed(compute_lgbm_residuals)(highway_clust_mods_bayes_final[i],
                                        lagged_df.iloc[:, 1:],
                                        lagged_df.iloc[:, 0])
        for i in range(len(highway_clust_mods_bayes_final))
        for lagged_df in [train_val_df_highway_clust_ls[i].iloc[:, 0:(lag_n+1)].dropna()]
    )

Highway LGBM Models Bayes Residuals: 100%|████████| 3/3 [00:09<00:00,  3.25s/it]


In [98]:
# Build a data frame with one row per cluster: a 1-based cluster number and that cluster's
# residuals stored as a single cell. The cluster numbers and residuals are taken directly from
# the list instead of building the same throwaway dict twice to read its keys and values
highway_res_df = pd.DataFrame({
    'cluster': list(range(1, len(highway_clust_mods_bayes_resid) + 1)),
    'residual': list(highway_clust_mods_bayes_resid),
})

highway_res_df.head()

Unnamed: 0,cluster,residual
0,1,"[-10.47253259660829, 0.40237954593237646, -48...."
1,2,"[18.899379177321606, -8.598041296536792, 19.29..."
2,3,"[-27.959469759999138, 2.9877836473597768, 11.8..."


In [99]:
highway_res_df.to_csv("Results/Global/LightGBM Bayes/Highway System/residual.csv", index=False)

In [100]:
# Break the test data into a list of re-indexed data frames where each entry is the test data
# frame for one cluster
test_df_full_highway_clust_ls = [
    clust_df.reset_index(drop=True)
    for _clust_id, clust_df in test_df_full_highway.groupby("cluster")
]

In [101]:
# Loop, in parallel, through the models and the list of test data frames, create test predictions, and
# save those to a list of data frames. n_jobs is 3 to match the other highway system cells, which run
# one job per highway cluster
with tqdm_joblib(tqdm(desc="Highway LGBM Models Bayes Test Preds", 
                      total=len(highway_clust_mods_bayes_final))) as progress_bar:
    highway_clust_mods_bayes_test_preds = Parallel(n_jobs=3)(
        delayed(compute_lgbm_test_preds)(highway_clust_mods_bayes_final[i],
                                         test_df_full_highway_clust_ls[i],
                                         lag_n)
        for i in range(len(highway_clust_mods_bayes_final))
    )

Highway LGBM Models Bayes Test Preds:   0%|               | 0/3 [00:00<?, ?it/s]

|   iter    |  target   | colsam... | lambda_l1 | lambda_l2 | learni... | max_depth | n_esti... | num_le... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.12    [0m | [0m 0.9205  [0m | [0m 0.6238  [0m | [0m 0.7918  [0m | [0m 0.2149  [0m | [0m 14.49   [0m | [0m 472.1   [0m | [0m 21.99   [0m |
| [95m 2       [0m | [95m-0.1194  [0m | [95m 0.7999  [0m | [95m 0.4889  [0m | [95m 0.05053 [0m | [95m 0.2692  [0m | [95m 2.954   [0m | [95m 827.3   [0m | [95m 149.3   [0m |
| [0m 3       [0m | [0m-0.1317  [0m | [0m 0.6748  [0m | [0m 0.9459  [0m | [0m 0.7367  [0m | [0m 0.411   [0m | [0m 4.633   [0m | [0m 761.5   [0m | [0m 125.0   [0m |
| [0m 4       [0m | [0m-0.1451  [0m | [0m 0.1665  [0m | [0m 0.2079  [0m | [0m 0.8057  [0m | [0m 0.4572  [0m | [0m 5.424   [0m | [0m 962.3   [0m | [0m 62.04   [0m |
| [0m 5       [0m | [0m-0.128   [0m | 

Highway LGBM Models Bayes Test Preds: 100%|███████| 3/3 [00:05<00:00,  1.75s/it]


In [102]:
# Concatenate the above list of test pred data frames into one data frame
highway_clust_bayes_test_preds_df = pd.concat(highway_clust_mods_bayes_test_preds)

In [103]:
# Compute performance metrics on the full data frame of test predictions
highway_clust_bayes_test_perf = compute_lgbm_test_perf(highway_clust_bayes_test_preds_df,
                                                       test_df_full_highway)

In [104]:
# Compute normalized/scaled perf metrics: each error metric divided by the 'mean' column
for scaled_col, raw_col in (('nrmse', 'rmse'), ('smae', 'mae')):
    highway_clust_bayes_test_perf[scaled_col] = highway_clust_bayes_test_perf[raw_col]/highway_clust_bayes_test_perf['mean']

In [105]:
# Print means of performance metrics
highway_clust_bayes_test_perf.mean()

rmse      31.115161
mae       20.665621
mean     265.435072
nrmse      0.141108
smae       0.095555
dtype: float64

In [106]:
# Loop through each set of preds and compute the bootstrap PIs for those preds/cluster. Position i in
# the preds list and in the residuals list both come from the model at position i in the list of
# final models; the returned PI data frames are collected in a list
highway_clust_test_pred_int = list()
for i in range(len(highway_clust_mods_bayes_test_preds)):
    highway_clust_test_pred_int.append(compute_lgbm_boostrap_int(highway_clust_mods_bayes_test_preds[i], 
                                                                 highway_clust_mods_bayes_resid[i], 
                                                                 n_boot))

Pandas Apply:   0%|          | 0/51072 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/25536 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/25536 [00:00<?, ?it/s]

In [107]:
# For each cluster (n is the 1-based position of the cluster in the lists), add the true values to the
# data frame of preds. The true values (first column) are taken from the same per-cluster test data
# frame that the preds were computed from; using the list position instead of re-filtering the merged
# frame with cluster==n keeps this correct even if the cluster labels are not exactly 1..k
for n in range(1, len(highway_clust_test_pred_int)+1):
    y_actual_sub = test_df_full_highway_clust_ls[n-1].iloc[:,0].to_list()
    highway_clust_test_pred_int[n-1]['actual'] = y_actual_sub

In [108]:
# Concatenate the PI data frames from each cluster into one data frame
highway_clust_test_pred_int_df = pd.concat(highway_clust_test_pred_int)

In [109]:
# For every prediction in the PI data frame, add a column holding its PI score: first for the
# 95% interval, then for the 80% interval
for score_col, lo_col, hi_col, pi_level in (('int_95_score', 'lo_95', 'hi_95', 0.95),
                                            ('int_80_score', 'lo_80', 'hi_80', 0.80)):
    highway_clust_test_pred_int_df[score_col] = interval_score(highway_clust_test_pred_int_df['actual'],
                                                               highway_clust_test_pred_int_df[lo_col],
                                                               highway_clust_test_pred_int_df[hi_col],
                                                               pi_level)
                                                                

In [110]:
# Print the means of the PI scores
highway_clust_test_pred_int_df.mean()

ts_index         38.500000
test_preds      265.684829
lo_95           205.315901
hi_95           328.288360
lo_80           236.420271
hi_80           296.055512
actual          265.435072
int_95_score    221.862267
int_80_score    122.267072
dtype: float64

In [111]:
# Average the PI scores and the actual values within each ts_index
highway_clust_test_pred_int_df_grouped = (
    highway_clust_test_pred_int_df
    .groupby("ts_index")[['int_95_score', 'int_80_score', 'actual']]
    .mean()
    .reset_index()
)

# Scale each mean PI score by the mean actual value of the same ts_index
for scaled_col, score_col in (('int_95_score_scaled', 'int_95_score'),
                              ('int_80_score_scaled', 'int_80_score')):
    highway_clust_test_pred_int_df_grouped[scaled_col] = highway_clust_test_pred_int_df_grouped[score_col]/highway_clust_test_pred_int_df_grouped['actual']

In [112]:
highway_clust_test_pred_int_df_grouped[['int_80_score_scaled', 'int_95_score_scaled']].mean()

int_80_score_scaled    0.589390
int_95_score_scaled    1.058783
dtype: float64

In [113]:
# Save the PI data frame to a csv file
highway_clust_test_pred_int_df.to_csv("Results/Global/LightGBM Bayes/Highway System/test_pred_intervals.csv",
                                      index=False)

# Test and Train - Catch22 KMeans

In [114]:
# Delete the highway system variables which are no longer used
del (
    highway_clust_test_pred_int_df,
    highway_clust_test_pred_int,
    y_actual_sub,
    highway_clust_bayes_test_perf,
    highway_clust_bayes_test_preds_df,
    highway_clust_mods_bayes_test_preds,
    test_df_full_highway_clust_ls,
    test_df_full_highway,
    highway_clust_mods_bayes_resid,
    highway_clust_mods_bayes_final,
    train_val_df_highway_clust_ls,
    train_val_df_full_highway,
    highway_clust_mods_bayes,
    train_df_highway_clust_ls,
    val_df_highway_clust_ls,
    train_df_full_highway,
    val_df_full_highway,
    highway_clust,
)

In [115]:
# Garbage collect
gc.collect()

188

In [116]:
# Read in the cluster assignments from the Catch22-based KMeans clusters
catch22_clust = pd.read_csv("Results/Clustering/KMeans/kmeans_catch22_clustering_assign.csv")
# Copy the assignments into a 'cluster' column, which is the column the data is grouped by below
catch22_clust['cluster'] = catch22_clust['kmeans_catch22_clust_assign']

In [117]:
# Join the Catch22 cluster assignments onto the training, validation, train_val, and test data
# using the ts_index column
train_df_full_catch22 = pd.merge(train_df_full, catch22_clust, on="ts_index")
val_df_full_catch22 = pd.merge(val_df_full, catch22_clust, on="ts_index")
train_val_df_full_catch22 = pd.merge(train_val_df_full, catch22_clust, on="ts_index")
test_df_full_catch22 = pd.merge(test_df_full, catch22_clust, on="ts_index")

In [118]:
# Break the training and validation data into lists of re-indexed data frames which contain
# data for only one cluster each
train_df_catch22_clust_ls = [
    clust_df.reset_index(drop=True)
    for _clust_id, clust_df in train_df_full_catch22.groupby("cluster")
]
val_df_catch22_clust_ls = [
    clust_df.reset_index(drop=True)
    for _clust_id, clust_df in val_df_full_catch22.groupby("cluster")
]

In [119]:
# In parallel, loop through the clusters and run the optimizer for a model for each cluster. Save what
# the optimizer returns for each cluster to a list. The trailing single-item `for` binds each cluster's
# lagged, NaN-free training frame once, so the training X (columns 1 onward) and y (column 0) are both
# sliced from it instead of repeating the slice/dropna for each argument. The validation frames are
# sliced as before, without dropping any rows
with tqdm_joblib(tqdm(desc="Catch22 LGBM Models Bayes", 
                      total=len(train_df_catch22_clust_ls))) as progress_bar:
    catch22_clust_mods_bayes = Parallel(n_jobs=3)(
        delayed(optimize_lgbm_w_bayes)(train_lagged_df.iloc[:, 1:],
                                       train_lagged_df.iloc[:, 0],
                                       val_df_catch22_clust_ls[i].iloc[:, 1:(lag_n+1)],
                                       val_df_catch22_clust_ls[i].iloc[:, 0])
        for i in range(len(train_df_catch22_clust_ls))
        for train_lagged_df in [train_df_catch22_clust_ls[i].iloc[:, 0:(lag_n+1)].dropna()]
    )

Catch22 LGBM Models Bayes:  40%|██████▊          | 2/5 [05:18<06:59, 139.84s/it]

|   iter    |  target   | colsam... | lambda_l1 | lambda_l2 | learni... | max_depth | n_esti... | num_le... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.1674  [0m | [0m 0.9205  [0m | [0m 0.6238  [0m | [0m 0.7918  [0m | [0m 0.2149  [0m | [0m 14.49   [0m | [0m 472.1   [0m | [0m 21.99   [0m |
| [0m 2       [0m | [0m-0.1725  [0m | [0m 0.7999  [0m | [0m 0.4889  [0m | [0m 0.05053 [0m | [0m 0.2692  [0m | [0m 2.954   [0m | [0m 827.3   [0m | [0m 149.3   [0m |
| [0m 3       [0m | [0m-0.2048  [0m | [0m 0.6748  [0m | [0m 0.9459  [0m | [0m 0.7367  [0m | [0m 0.411   [0m | [0m 4.633   [0m | [0m 761.5   [0m | [0m 125.0   [0m |
| [0m 4       [0m | [0m-0.2554  [0m | [0m 0.1665  [0m | [0m 0.2079  [0m | [0m 0.8057  [0m | [0m 0.4572  [0m | [0m 5.424   [0m | [0m 962.3   [0m | [0m 62.04   [0m |
| [0m 5       [0m | [0m-0.1814  [0m | [0m 0.746

Catch22 LGBM Models Bayes:  60%|██████████▏      | 3/5 [16:37<12:51, 385.87s/it]

| [0m 12      [0m | [0m-0.1757  [0m | [0m 0.1508  [0m | [0m 0.1469  [0m | [0m 0.486   [0m | [0m 0.3686  [0m | [0m 14.24   [0m | [0m 471.9   [0m | [0m 26.13   [0m |
| [95m 13      [0m | [95m-0.1364  [0m | [95m 0.9414  [0m | [95m 0.8376  [0m | [95m 0.004513[0m | [95m 0.07853 [0m | [95m 5.527   [0m | [95m 832.1   [0m | [95m 149.5   [0m |
| [95m 14      [0m | [95m-0.1362  [0m | [95m 0.2685  [0m | [95m 0.5623  [0m | [95m 0.2073  [0m | [95m 0.1042  [0m | [95m 2.002   [0m | [95m 833.9   [0m | [95m 141.2   [0m |
| [0m 15      [0m | [0m-0.1793  [0m | [0m 0.114   [0m | [0m 0.6266  [0m | [0m 0.4696  [0m | [0m 0.3756  [0m | [0m 5.003   [0m | [0m 836.3   [0m | [0m 140.7   [0m |
| [0m 16      [0m | [0m-0.9224  [0m | [0m 1.0     [0m | [0m 0.7244  [0m | [0m 1.0     [0m | [0m 1e-05   [0m | [0m 16.56   [0m | [0m 470.8   [0m | [0m 20.49   [0m |
| [0m 17      [0m | [0m-0.1669  [0m | [0m 0.8119  [0m | [0m 0.396

Catch22 LGBM Models Bayes: 100%|█████████████████| 5/5 [24:14<00:00, 291.00s/it]

| [0m 12      [0m | [0m-0.1078  [0m | [0m 0.9342  [0m | [0m 0.859   [0m | [0m 0.258   [0m | [0m 0.1979  [0m | [0m 23.89   [0m | [0m 473.3   [0m | [0m 20.39   [0m |
| [0m 13      [0m | [0m-0.1103  [0m | [0m 0.957   [0m | [0m 0.5047  [0m | [0m 0.3459  [0m | [0m 0.2706  [0m | [0m 8.573   [0m | [0m 757.1   [0m | [0m 17.0    [0m |
| [0m 14      [0m | [0m-0.1191  [0m | [0m 0.2656  [0m | [0m 0.04668 [0m | [0m 0.03001 [0m | [0m 0.1727  [0m | [0m 14.97   [0m | [0m 861.6   [0m | [0m 139.5   [0m |
| [0m 15      [0m | [0m-0.1143  [0m | [0m 0.8199  [0m | [0m 0.4933  [0m | [0m 0.7328  [0m | [0m 0.2643  [0m | [0m 14.34   [0m | [0m 398.9   [0m | [0m 92.87   [0m |
| [0m 16      [0m | [0m-0.1183  [0m | [0m 0.4781  [0m | [0m 0.1134  [0m | [0m 0.3489  [0m | [0m 0.2055  [0m | [0m 10.65   [0m | [0m 821.2   [0m | [0m 137.4   [0m |
| [0m 17      [0m | [0m-0.12    [0m | [0m 0.8119  [0m | [0m 0.3965  [0m | [0m 0.7




In [120]:
# For each model's set of params, replace the params that need to be integers with their
# rounded value cast to int
for n in range(len(catch22_clust_mods_bayes)):
    for int_param in ("max_depth", "n_estimators", "num_leaves"):
        catch22_clust_mods_bayes[n][int_param] = int(round(catch22_clust_mods_bayes[n][int_param]))

In [121]:
# Break the train_val data into a list of re-indexed data frames as well, one per cluster
train_val_df_catch22_clust_ls = [
    clust_df.reset_index(drop=True)
    for _clust_id, clust_df in train_val_df_full_catch22.groupby("cluster")
]

In [122]:
# Using the train_val data, compute a final model for each cluster.
# Each task receives that cluster's tuned params plus the first lag_n+1 columns of its train_val
# frame with incomplete rows dropped: columns 1.. go in as the second argument and column 0 as the
# third (presumably features and target - confirm against train_lgbm).
# The one-element inner `for` binds the cleaned frame once per cluster, so the slice + dropna is
# no longer recomputed separately for each of the two arguments.
with tqdm_joblib(tqdm(desc="Catch22 LGBM Models Bayes Final", 
                      total=len(catch22_clust_mods_bayes))) as progress_bar:
    catch22_clust_mods_bayes_final = Parallel(n_jobs=3)(
        delayed(train_lgbm)(params, lagged.iloc[:, 1:], lagged.iloc[:, 0])
        for i, params in enumerate(catch22_clust_mods_bayes)
        for lagged in [train_val_df_catch22_clust_ls[i].iloc[:, 0:(lag_n + 1)].dropna()]
    )

Catch22 LGBM Models Bayes Final:  80%|█████████▌  | 4/5 [00:42<00:10, 10.06s/it]



Catch22 LGBM Models Bayes Final: 100%|████████████| 5/5 [00:49<00:00,  9.87s/it]


In [123]:
# Write each final model to its own file, named after the model's position in the list
for model_no in range(len(catch22_clust_mods_bayes_final)):
    joblib.dump(catch22_clust_mods_bayes_final[model_no],
                f"Results/Global/LightGBM Bayes/Catch22 KMeans/model_{model_no}")

In [124]:
# Reload the final models from disk, one per train_val cluster frame, replacing the in-memory list.
# joblib.load unpickles its input, so only point it at files written by this notebook.
catch22_clust_mods_bayes_final = [
    joblib.load(f"Results/Global/LightGBM Bayes/Catch22 KMeans/model_{model_no}")
    for model_no in range(len(train_val_df_catch22_clust_ls))
]

In [125]:
# For each model, compute the model's residuals and save to a list.
# The residuals are computed on the same data the final model was fit on: the first lag_n+1 columns
# of the cluster's train_val frame with incomplete rows dropped (columns 1.. as the second argument,
# column 0 as the third).
# The one-element inner `for` binds the cleaned frame once per cluster, so the slice + dropna is
# no longer recomputed separately for each of the two arguments.
with tqdm_joblib(tqdm(desc="Catch22 LGBM Models Bayes Residuals", 
                      total=len(catch22_clust_mods_bayes_final))) as progress_bar:
    catch22_clust_mods_bayes_resid = Parallel(n_jobs=3)(
        delayed(compute_lgbm_residuals)(model, lagged.iloc[:, 1:], lagged.iloc[:, 0])
        for i, model in enumerate(catch22_clust_mods_bayes_final)
        for lagged in [train_val_df_catch22_clust_ls[i].iloc[:, 0:(lag_n + 1)].dropna()]
    )

Catch22 LGBM Models Bayes Residuals: 100%|████████| 5/5 [00:09<00:00,  1.83s/it]


In [126]:
# Tabulate the residuals: one row per cluster (numbered from 1, in list order),
# with that cluster's whole residual sequence stored in the 'residual' cell
catch22_res_df = pd.DataFrame({
    'cluster': list(range(1, len(catch22_clust_mods_bayes_resid) + 1)),
    'residual': list(catch22_clust_mods_bayes_resid),
})

catch22_res_df.head()

Unnamed: 0,cluster,residual
0,1,"[3.1674887922998707, -3.919883131163189, -8.96..."
1,2,"[1.3179691192486302, -23.649895182220007, -13...."
2,3,"[0.12009538735095271, -1.0302712135609917, -0...."
3,4,"[8.409688232619374, 26.361903022045055, -33.31..."
4,5,"[13.271590985891493, 15.492001013611684, 9.594..."


In [127]:
# Save the per-cluster residuals to csv (row index omitted).
# NOTE(review): each 'residual' cell holds a whole sequence, which ends up in the file as text -
# confirm that whatever reads this file parses it back into numbers
catch22_res_df.to_csv("Results/Global/LightGBM Bayes/Catch22 KMeans/residual.csv", index=False)

In [128]:
# Partition the test data by cluster: one data frame per cluster,
# in sorted cluster order, each with its row index restarted at 0
test_df_full_catch22_clust_ls = [
    cluster_df.reset_index(drop=True)
    for _, cluster_df in test_df_full_catch22.groupby("cluster")
]

In [129]:
# For each cluster, run that cluster's final model over the cluster's test data frame
# (lag_n is forwarded to compute_lgbm_test_preds); results come back as a list in cluster order
with tqdm_joblib(tqdm(desc="Catch22 LGBM Models Bayes Test Preds", 
                      total=len(catch22_clust_mods_bayes_final))) as progress_bar:
    catch22_clust_mods_bayes_test_preds = Parallel(n_jobs=4)(
        delayed(compute_lgbm_test_preds)(model, test_df_full_catch22_clust_ls[i], lag_n)
        for i, model in enumerate(catch22_clust_mods_bayes_final)
    )

Catch22 LGBM Models Bayes Test Preds:   0%|               | 0/5 [00:00<?, ?it/s]

|   iter    |  target   | colsam... | lambda_l1 | lambda_l2 | learni... | max_depth | n_esti... | num_le... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.1453  [0m | [0m 0.9205  [0m | [0m 0.6238  [0m | [0m 0.7918  [0m | [0m 0.2149  [0m | [0m 14.49   [0m | [0m 472.1   [0m | [0m 21.99   [0m |
| [0m 2       [0m | [0m-0.1481  [0m | [0m 0.7999  [0m | [0m 0.4889  [0m | [0m 0.05053 [0m | [0m 0.2692  [0m | [0m 2.954   [0m | [0m 827.3   [0m | [0m 149.3   [0m |
| [0m 3       [0m | [0m-0.1646  [0m | [0m 0.6748  [0m | [0m 0.9459  [0m | [0m 0.7367  [0m | [0m 0.411   [0m | [0m 4.633   [0m | [0m 761.5   [0m | [0m 125.0   [0m |
| [0m 4       [0m | [0m-0.1794  [0m | [0m 0.1665  [0m | [0m 0.2079  [0m | [0m 0.8057  [0m | [0m 0.4572  [0m | [0m 5.424   [0m | [0m 962.3   [0m | [0m 62.04   [0m |
| [0m 5       [0m | [0m-0.1578  [0m | [0m 0.746

Catch22 LGBM Models Bayes Test Preds: 100%|███████| 5/5 [00:05<00:00,  1.05s/it]


In [130]:
# Stack all data frames from the above list into one data frame of test predictions.
# A single pd.concat replaces the earlier DataFrame.append loop; row labels are kept exactly as they
# come from each cluster's frame (ignore_index is not set).
catch22_clust_bayes_test_preds_df = pd.concat(catch22_clust_mods_bayes_test_preds)

In [131]:
# Score the stacked test predictions against the full test data.
# The cells below read 'rmse', 'mae' and 'mean' columns from the result.
catch22_clust_bayes_test_perf = compute_lgbm_test_perf(
    catch22_clust_bayes_test_preds_df,
    test_df_full_catch22,
)

In [132]:
# Add normalized performance metrics to the performance data frame:
# each error metric divided by the 'mean' column
for scaled_col, error_col in (('nrmse', 'rmse'), ('smae', 'mae')):
    catch22_clust_bayes_test_perf[scaled_col] = (
        catch22_clust_bayes_test_perf[error_col] / catch22_clust_bayes_test_perf['mean']
    )

In [133]:
# Show the column means of the performance data frame (raw and normalized metrics alike)
catch22_clust_bayes_test_perf.mean()

rmse      31.191387
mae       20.734501
mean     265.435072
nrmse      0.140589
smae       0.095014
dtype: float64

In [134]:
# Pair each cluster's test predictions with the residuals of the same cluster's model and build a
# df of bootstrap PIs for each prediction (n_boot is forwarded to compute_lgbm_boostrap_int).
# The resulting data frames are collected in a list, in cluster order
catch22_clust_test_pred_int = [
    compute_lgbm_boostrap_int(clust_preds, catch22_clust_mods_bayes_resid[i], n_boot)
    for i, clust_preds in enumerate(catch22_clust_mods_bayes_test_preds)
]

Pandas Apply:   0%|          | 0/49728 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/4032 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/5376 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/26880 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/16128 [00:00<?, ?it/s]

In [135]:
# For each cluster, add the true values for y to the data frame in a new column called actual.
# The actuals (column 0) are read from the same per-cluster test frame that was passed to
# compute_lgbm_test_preds, instead of re-filtering the full test data with cluster == 1..K.
# That keeps predictions and actuals paired by list position however the clusters are labelled
# (0-based, non-contiguous, ...); for labels 1..K the selected rows are the same as before.
for i in range(len(catch22_clust_test_pred_int)):
    y_actual_sub = test_df_full_catch22_clust_ls[i].iloc[:, 0].to_list()
    catch22_clust_test_pred_int[i]['actual'] = y_actual_sub

In [136]:
# Stack all PI data frames into one data frame.
# A single pd.concat replaces the earlier DataFrame.append loop; row labels are kept exactly as they
# come from each cluster's frame (ignore_index is not set).
catch22_clust_test_pred_int_df = pd.concat(catch22_clust_test_pred_int)

In [137]:
# On that one data frame, compute the 95% and 80% PI scores for each observation.
# Each score is computed from the actual value, the matching lower/upper bound columns and the
# interval's nominal level, and is stored in its own column (95% first, then 80%)
for score_col, lo_col, hi_col, level in (
    ('int_95_score', 'lo_95', 'hi_95', 0.95),
    ('int_80_score', 'lo_80', 'hi_80', 0.80),
):
    catch22_clust_test_pred_int_df[score_col] = interval_score(
        catch22_clust_test_pred_int_df['actual'],
        catch22_clust_test_pred_int_df[lo_col],
        catch22_clust_test_pred_int_df[hi_col],
        level,
    )

In [138]:
# Show the column means of the combined PI data frame, which include the mean 95% and 80% PI scores
catch22_clust_test_pred_int_df.mean()

ts_index         38.500000
test_preds      265.756515
lo_95           208.958208
hi_95           324.605175
lo_80           236.785350
hi_80           295.399973
actual          265.435072
int_95_score    216.345320
int_80_score    120.236059
dtype: float64

In [139]:
# Average the interval scores and the actual values within each ts_index group, then
# scale each group's mean score by the group's mean actual value
catch22_clust_test_pred_int_df_grouped = (
    catch22_clust_test_pred_int_df
    .groupby("ts_index")
    .agg({"int_95_score": "mean", "int_80_score": "mean", "actual": "mean"})
    .reset_index()
)

for scaled_col, score_col in (('int_95_score_scaled', 'int_95_score'),
                              ('int_80_score_scaled', 'int_80_score')):
    catch22_clust_test_pred_int_df_grouped[scaled_col] = (
        catch22_clust_test_pred_int_df_grouped[score_col] / catch22_clust_test_pred_int_df_grouped['actual']
    )

In [140]:
# Show the scaled 80% and 95% interval scores averaged over the ts_index groups
catch22_clust_test_pred_int_df_grouped[['int_80_score_scaled', 'int_95_score_scaled']].mean()

int_80_score_scaled    0.545119
int_95_score_scaled    0.943197
dtype: float64

In [141]:
# Write the combined PI data frame (predictions, bounds, actuals and scores) to csv without the row index
catch22_clust_test_pred_int_df.to_csv(
    "Results/Global/LightGBM Bayes/Catch22 KMeans/test_pred_intervals.csv",
    index=False,
)

# Test and Train - TSFeat KMeans

In [142]:
# Drop every Catch22-specific name that is no longer needed so its memory can be reclaimed.
# Names are deleted left to right; a missing name raises NameError at that point.
del (
    catch22_clust_test_pred_int_df,
    catch22_clust_test_pred_int,
    y_actual_sub,
    catch22_clust_bayes_test_perf,
    catch22_clust_bayes_test_preds_df,
    catch22_clust_mods_bayes_test_preds,
    test_df_full_catch22_clust_ls,
    test_df_full_catch22,
    catch22_clust_mods_bayes_resid,
    catch22_clust_mods_bayes_final,
    train_val_df_catch22_clust_ls,
    train_val_df_full_catch22,
    catch22_clust_mods_bayes,
    train_df_catch22_clust_ls,
    val_df_catch22_clust_ls,
    train_df_full_catch22,
    val_df_full_catch22,
    catch22_clust,
)

In [143]:
# Run the garbage collector so the objects released above are freed right away;
# the displayed value is the number of unreachable objects it found
gc.collect()

286

In [144]:
# Read in the cluster assignments for the KMeans clustering based on the tsfeat feature set, and
# expose the assignment under the generic column name 'cluster' that the cells below group by
tsfeat_clust = (
    pd.read_csv("Results/Clustering/KMeans/kmeans_tsfeat_clustering_assign.csv")
    .assign(cluster=lambda assign_df: assign_df['kmeans_tsfeat_clust_assign'])
)

In [145]:
# Attach the cluster assignments to the train, val, train_val, and test data frames by ts_index.
# how="inner" is the merge default, spelled out here: rows whose ts_index has no cluster
# assignment are dropped
train_df_full_tsfeat = train_df_full.merge(tsfeat_clust, how="inner", on="ts_index")
val_df_full_tsfeat = val_df_full.merge(tsfeat_clust, how="inner", on="ts_index")
train_val_df_full_tsfeat = train_val_df_full.merge(tsfeat_clust, how="inner", on="ts_index")
test_df_full_tsfeat = test_df_full.merge(tsfeat_clust, how="inner", on="ts_index")

In [146]:
# Build the per-cluster training and validation lists: one data frame per cluster,
# in sorted cluster order, each with its row index restarted at 0
train_df_tsfeat_clust_ls = [
    cluster_df.reset_index(drop=True)
    for _, cluster_df in train_df_full_tsfeat.groupby("cluster")
]
val_df_tsfeat_clust_ls = [
    cluster_df.reset_index(drop=True)
    for _, cluster_df in val_df_full_tsfeat.groupby("cluster")
]

In [147]:
# Run the Bayesian optimizer, in parallel, for each cluster.
# Training inputs are the first lag_n+1 columns of the cluster's training frame with incomplete rows
# dropped (columns 1.. first, column 0 second); validation inputs are columns 1..lag_n and column 0
# of the cluster's validation frame, passed without any row filtering.
# The one-element inner `for` binds the cleaned training frame once per cluster, so the
# slice + dropna is no longer recomputed separately for each of the two training arguments.
with tqdm_joblib(tqdm(desc="TSFeat LGBM Models Bayes", 
                      total=len(train_df_tsfeat_clust_ls))) as progress_bar:
    tsfeat_clust_mods_bayes = Parallel(n_jobs=2)(
        delayed(optimize_lgbm_w_bayes)(train_lagged.iloc[:, 1:],
                                       train_lagged.iloc[:, 0],
                                       val_df_tsfeat_clust_ls[i].iloc[:, 1:(lag_n + 1)],
                                       val_df_tsfeat_clust_ls[i].iloc[:, 0])
        for i in range(len(train_df_tsfeat_clust_ls))
        for train_lagged in [train_df_tsfeat_clust_ls[i].iloc[:, 0:(lag_n + 1)].dropna()]
    )

TSFeat LGBM Models Bayes:  50%|█████████         | 1/2 [12:08<12:08, 728.13s/it]

|   iter    |  target   | colsam... | lambda_l1 | lambda_l2 | learni... | max_depth | n_esti... | num_le... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.2024  [0m | [0m 0.9205  [0m | [0m 0.6238  [0m | [0m 0.7918  [0m | [0m 0.2149  [0m | [0m 14.49   [0m | [0m 472.1   [0m | [0m 21.99   [0m |
| [0m 2       [0m | [0m-0.2063  [0m | [0m 0.7999  [0m | [0m 0.4889  [0m | [0m 0.05053 [0m | [0m 0.2692  [0m | [0m 2.954   [0m | [0m 827.3   [0m | [0m 149.3   [0m |
| [0m 3       [0m | [0m-0.229   [0m | [0m 0.6748  [0m | [0m 0.9459  [0m | [0m 0.7367  [0m | [0m 0.411   [0m | [0m 4.633   [0m | [0m 761.5   [0m | [0m 125.0   [0m |
| [0m 4       [0m | [0m-0.2466  [0m | [0m 0.1665  [0m | [0m 0.2079  [0m | [0m 0.8057  [0m | [0m 0.4572  [0m | [0m 5.424   [0m | [0m 962.3   [0m | [0m 62.04   [0m |
| [0m 5       [0m | [0m-0.219   [0m | [0m 0.746

TSFeat LGBM Models Bayes: 100%|██████████████████| 2/2 [18:01<00:00, 540.88s/it]


In [148]:
# The optimizer reports every parameter as a float; for each returned set of params, cast the
# parameters that must be integers (rounded to the nearest integer) in place
for n in range(len(tsfeat_clust_mods_bayes)):
    for int_param in ("max_depth", "n_estimators", "num_leaves"):
        tsfeat_clust_mods_bayes[n][int_param] = int(round(tsfeat_clust_mods_bayes[n][int_param]))

In [149]:
# Partition the train_val df by cluster: one df per cluster,
# in sorted cluster order, each with its row index restarted at 0
train_val_df_tsfeat_clust_ls = [
    cluster_df.reset_index(drop=True)
    for _, cluster_df in train_val_df_full_tsfeat.groupby("cluster")
]

In [150]:
# In parallel, fit a model to each train_val df using the params found by the Bayesian optimizer.
# Each task receives that cluster's tuned params plus the first lag_n+1 columns of its train_val
# frame with incomplete rows dropped: columns 1.. go in as the second argument and column 0 as the
# third (presumably features and target - confirm against train_lgbm).
# The one-element inner `for` binds the cleaned frame once per cluster, so the slice + dropna is
# no longer recomputed separately for each of the two arguments.
with tqdm_joblib(tqdm(desc="TSFeat LGBM Models Bayes Final", 
                      total=len(tsfeat_clust_mods_bayes))) as progress_bar:
    tsfeat_clust_mods_bayes_final = Parallel(n_jobs=2)(
        delayed(train_lgbm)(params, lagged.iloc[:, 1:], lagged.iloc[:, 0])
        for i, params in enumerate(tsfeat_clust_mods_bayes)
        for lagged in [train_val_df_tsfeat_clust_ls[i].iloc[:, 0:(lag_n + 1)].dropna()]
    )

TSFeat LGBM Models Bayes Final: 100%|█████████████| 2/2 [00:53<00:00, 26.94s/it]


In [151]:
# Write each final model to its own file, named after the model's position in the list
for model_no in range(len(tsfeat_clust_mods_bayes_final)):
    joblib.dump(tsfeat_clust_mods_bayes_final[model_no],
                f"Results/Global/LightGBM Bayes/TSFeat KMeans/model_{model_no}")

In [152]:
# Reload the final models from disk, one per train_val cluster frame, replacing the in-memory list.
# joblib.load unpickles its input, so only point it at files written by this notebook.
tsfeat_clust_mods_bayes_final = [
    joblib.load(f"Results/Global/LightGBM Bayes/TSFeat KMeans/model_{model_no}")
    for model_no in range(len(train_val_df_tsfeat_clust_ls))
]

In [154]:
# For each model, compute the residuals and save the results into a list.
# The residuals are computed on the same data the final model was fit on: the first lag_n+1 columns
# of the cluster's train_val frame with incomplete rows dropped (columns 1.. as the second argument,
# column 0 as the third).
# The one-element inner `for` binds the cleaned frame once per cluster, so the slice + dropna is
# no longer recomputed separately for each of the two arguments.
with tqdm_joblib(tqdm(desc="TSFeat LGBM Models Bayes Residuals", 
                      total=len(tsfeat_clust_mods_bayes_final))) as progress_bar:
    tsfeat_clust_mods_bayes_resid = Parallel(n_jobs=2)(
        delayed(compute_lgbm_residuals)(model, lagged.iloc[:, 1:], lagged.iloc[:, 0])
        for i, model in enumerate(tsfeat_clust_mods_bayes_final)
        for lagged in [train_val_df_tsfeat_clust_ls[i].iloc[:, 0:(lag_n + 1)].dropna()]
    )

TSFeat LGBM Models Bayes Residuals: 100%|█████████| 2/2 [00:11<00:00,  5.85s/it]


In [155]:
# Tabulate the residuals: one row per cluster (numbered from 1, in list order),
# with that cluster's whole residual sequence stored in the 'residual' cell
tsfeat_clust_res_df = pd.DataFrame({
    'cluster': list(range(1, len(tsfeat_clust_mods_bayes_resid) + 1)),
    'residual': list(tsfeat_clust_mods_bayes_resid),
})

tsfeat_clust_res_df.head()

Unnamed: 0,cluster,residual
0,1,"[-4.4743312200740775, 12.246247882660498, -37...."
1,2,"[36.57535456378076, -10.735407496008037, 15.87..."


In [156]:
# Save the per-cluster residuals to csv (row index omitted).
# NOTE(review): each 'residual' cell holds a whole sequence, which ends up in the file as text -
# confirm that whatever reads this file parses it back into numbers
tsfeat_clust_res_df.to_csv("Results/Global/LightGBM Bayes/TSFeat KMeans/residual.csv", index=False)

In [157]:
# Partition the test df by cluster as well: one df per cluster,
# in sorted cluster order, each with its row index restarted at 0
test_df_full_tsfeat_clust_ls = [
    cluster_df.reset_index(drop=True)
    for _, cluster_df in test_df_full_tsfeat.groupby("cluster")
]

In [158]:
# For each cluster, run that cluster's final model over the cluster's test data frame
# (lag_n is forwarded to compute_lgbm_test_preds); results come back as a list in cluster order
with tqdm_joblib(tqdm(desc="TSFeat LGBM Models Bayes Test Preds", 
                      total=len(tsfeat_clust_mods_bayes_final))) as progress_bar:
    tsfeat_clust_mods_bayes_test_preds = Parallel(n_jobs=2)(
        delayed(compute_lgbm_test_preds)(model, test_df_full_tsfeat_clust_ls[i], lag_n)
        for i, model in enumerate(tsfeat_clust_mods_bayes_final)
    )

TSFeat LGBM Models Bayes Test Preds: 100%|████████| 2/2 [00:03<00:00,  1.91s/it]


In [159]:
# Create a new data frame in which the test preds from every cluster are stacked.
# A single pd.concat replaces the earlier DataFrame.append loop; row labels are kept exactly as they
# come from each cluster's frame (ignore_index is not set).
tsfeat_clust_bayes_test_preds_df = pd.concat(tsfeat_clust_mods_bayes_test_preds)

In [160]:
# Score the stacked test predictions against the full test data.
# The cells below read 'rmse', 'mae' and 'mean' columns from the result.
tsfeat_clust_bayes_test_perf = compute_lgbm_test_perf(
    tsfeat_clust_bayes_test_preds_df,
    test_df_full_tsfeat,
)

In [161]:
# Compute normalized performance metrics: each error metric divided by the 'mean' column
for scaled_col, error_col in (('nrmse', 'rmse'), ('smae', 'mae')):
    tsfeat_clust_bayes_test_perf[scaled_col] = (
        tsfeat_clust_bayes_test_perf[error_col] / tsfeat_clust_bayes_test_perf['mean']
    )

In [162]:
# Show the column means of the performance data frame (raw and normalized metrics alike)
tsfeat_clust_bayes_test_perf.mean()

rmse      30.818267
mae       20.440862
mean     265.435072
nrmse      0.139726
smae       0.094355
dtype: float64

In [163]:
# For each model/cluster, compute the PIs for the test preds via residual bootstrap: the cluster's
# test predictions are paired with the residuals of the same cluster's model (n_boot is forwarded
# to compute_lgbm_boostrap_int). The resulting data frames are collected in a list, in cluster order
tsfeat_clust_test_pred_int = [
    compute_lgbm_boostrap_int(clust_preds, tsfeat_clust_mods_bayes_resid[i], n_boot)
    for i, clust_preds in enumerate(tsfeat_clust_mods_bayes_test_preds)
]

Pandas Apply:   0%|          | 0/69888 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/32256 [00:00<?, ?it/s]

In [164]:
# For each PI data frame, grab the true value of the target for that cluster and add a df column
# for the true data.
# The actuals (column 0) are read from the same per-cluster test frame that was passed to
# compute_lgbm_test_preds, instead of re-filtering the full test data with cluster == 1..K.
# That keeps predictions and actuals paired by list position however the clusters are labelled
# (0-based, non-contiguous, ...); for labels 1..K the selected rows are the same as before.
for i in range(len(tsfeat_clust_test_pred_int)):
    y_actual_sub = test_df_full_tsfeat_clust_ls[i].iloc[:, 0].to_list()
    tsfeat_clust_test_pred_int[i]['actual'] = y_actual_sub

In [165]:
# Stack all PI data frames into one.
# A single pd.concat replaces the earlier DataFrame.append loop; row labels are kept exactly as they
# come from each cluster's frame (ignore_index is not set).
tsfeat_clust_test_pred_int_df = pd.concat(tsfeat_clust_test_pred_int)

In [166]:
# Compute the interval scores for each observation in that one df.
# Each score is computed from the actual value, the matching lower/upper bound columns and the
# interval's nominal level, and is stored in its own column (95% first, then 80%)
for score_col, lo_col, hi_col, level in (
    ('int_95_score', 'lo_95', 'hi_95', 0.95),
    ('int_80_score', 'lo_80', 'hi_80', 0.80),
):
    tsfeat_clust_test_pred_int_df[score_col] = interval_score(
        tsfeat_clust_test_pred_int_df['actual'],
        tsfeat_clust_test_pred_int_df[lo_col],
        tsfeat_clust_test_pred_int_df[hi_col],
        level,
    )

In [167]:
# Show the column means of the combined PI data frame, which include the mean 95% and 80% PI scores
tsfeat_clust_test_pred_int_df.mean()

ts_index         38.500000
test_preds      265.591002
lo_95           208.690311
hi_95           324.623149
lo_80           236.951486
hi_80           295.044808
actual          265.435072
int_95_score    224.184974
int_80_score    121.068634
dtype: float64

In [168]:
# Average the interval scores and the actual values within each ts_index group, then
# scale each group's mean score by the group's mean actual value
tsfeat_clust_test_pred_int_df_grouped = (
    tsfeat_clust_test_pred_int_df
    .groupby("ts_index")
    .agg({"int_95_score": "mean", "int_80_score": "mean", "actual": "mean"})
    .reset_index()
)

for scaled_col, score_col in (('int_95_score_scaled', 'int_95_score'),
                              ('int_80_score_scaled', 'int_80_score')):
    tsfeat_clust_test_pred_int_df_grouped[scaled_col] = (
        tsfeat_clust_test_pred_int_df_grouped[score_col] / tsfeat_clust_test_pred_int_df_grouped['actual']
    )

In [169]:
# Show the scaled 80% and 95% interval scores averaged over the ts_index groups
tsfeat_clust_test_pred_int_df_grouped[['int_80_score_scaled', 'int_95_score_scaled']].mean()

int_80_score_scaled    0.566261
int_95_score_scaled    1.003587
dtype: float64

In [170]:
# Write the combined PI data frame (predictions, bounds, actuals and scores) to a csv file without the row index
tsfeat_clust_test_pred_int_df.to_csv(
    "Results/Global/LightGBM Bayes/TSFeat KMeans/test_pred_intervals.csv",
    index=False,
)

# Train and Test - DTW Clusters

In [171]:
# Drop every TSFeat-specific name that will no longer be used so its memory can be reclaimed.
# Names are deleted left to right; a missing name raises NameError at that point.
del (
    tsfeat_clust_test_pred_int_df,
    tsfeat_clust_test_pred_int,
    y_actual_sub,
    tsfeat_clust_bayes_test_perf,
    tsfeat_clust_bayes_test_preds_df,
    tsfeat_clust_mods_bayes_test_preds,
    test_df_full_tsfeat_clust_ls,
    test_df_full_tsfeat,
    tsfeat_clust_mods_bayes_resid,
    tsfeat_clust_mods_bayes_final,
    train_val_df_tsfeat_clust_ls,
    train_val_df_full_tsfeat,
    tsfeat_clust_mods_bayes,
    train_df_tsfeat_clust_ls,
    val_df_tsfeat_clust_ls,
    train_df_full_tsfeat,
    val_df_full_tsfeat,
    tsfeat_clust,
)

In [172]:
# Run the garbage collector so the objects released above are freed right away;
# the displayed value is the number of unreachable objects it found
gc.collect()

121

In [173]:
# Read in the cluster assignments for the DTW based clusters, and expose the assignment
# under the generic column name 'cluster' that the cells below group by
dtw_clust = (
    pd.read_csv("Results/Clustering/DTW/dtw_clustering_assign.csv")
    .assign(cluster=lambda assign_df: assign_df['dtw_clust_assign'])
)

In [174]:
# Attach the cluster assignments to the train, val, train_val, and test data by ts_index.
# how="inner" is the merge default, spelled out here: rows whose ts_index has no cluster
# assignment are dropped
train_df_full_dtw = train_df_full.merge(dtw_clust, how="inner", on="ts_index")
val_df_full_dtw = val_df_full.merge(dtw_clust, how="inner", on="ts_index")
train_val_df_full_dtw = train_val_df_full.merge(dtw_clust, how="inner", on="ts_index")
test_df_full_dtw = test_df_full.merge(dtw_clust, how="inner", on="ts_index")

In [175]:
# Partition each of the above data frames by cluster: one df per cluster,
# in sorted cluster order, each with its row index restarted at 0
train_df_dtw_clust_ls = [
    cluster_df.reset_index(drop=True)
    for _, cluster_df in train_df_full_dtw.groupby("cluster")
]
val_df_dtw_clust_ls = [
    cluster_df.reset_index(drop=True)
    for _, cluster_df in val_df_full_dtw.groupby("cluster")
]
train_val_df_dtw_clust_ls = [
    cluster_df.reset_index(drop=True)
    for _, cluster_df in train_val_df_full_dtw.groupby("cluster")
]
test_df_full_dtw_clust_ls = [
    cluster_df.reset_index(drop=True)
    for _, cluster_df in test_df_full_dtw.groupby("cluster")
]

In [176]:
# Loop through the clusters and run the optimizer for each cluster. Return a list of best model params.
# Training inputs are the first lag_n+1 columns of the cluster's training frame with incomplete rows
# dropped (columns 1.. first, column 0 second); validation inputs are columns 1..lag_n and column 0
# of the cluster's validation frame, passed without any row filtering.
# The one-element inner `for` binds the cleaned training frame once per cluster, so the
# slice + dropna is no longer recomputed separately for each of the two training arguments.
with tqdm_joblib(tqdm(desc="DTW LGBM Models Bayes", 
                      total=len(train_df_dtw_clust_ls))) as progress_bar:
    dtw_clust_mods_bayes = Parallel(n_jobs=2)(
        delayed(optimize_lgbm_w_bayes)(train_lagged.iloc[:, 1:],
                                       train_lagged.iloc[:, 0],
                                       val_df_dtw_clust_ls[i].iloc[:, 1:(lag_n + 1)],
                                       val_df_dtw_clust_ls[i].iloc[:, 0])
        for i in range(len(train_df_dtw_clust_ls))
        for train_lagged in [train_df_dtw_clust_ls[i].iloc[:, 0:(lag_n + 1)].dropna()]
    )

DTW LGBM Models Bayes:  50%|██████████▌          | 1/2 [12:15<12:15, 735.72s/it]

|   iter    |  target   | colsam... | lambda_l1 | lambda_l2 | learni... | max_depth | n_esti... | num_le... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.1062  [0m | [0m 0.9205  [0m | [0m 0.6238  [0m | [0m 0.7918  [0m | [0m 0.2149  [0m | [0m 14.49   [0m | [0m 472.1   [0m | [0m 21.99   [0m |
| [95m 2       [0m | [95m-0.1053  [0m | [95m 0.7999  [0m | [95m 0.4889  [0m | [95m 0.05053 [0m | [95m 0.2692  [0m | [95m 2.954   [0m | [95m 827.3   [0m | [95m 149.3   [0m |
| [0m 3       [0m | [0m-0.1186  [0m | [0m 0.6748  [0m | [0m 0.9459  [0m | [0m 0.7367  [0m | [0m 0.411   [0m | [0m 4.633   [0m | [0m 761.5   [0m | [0m 125.0   [0m |
| [0m 4       [0m | [0m-0.1347  [0m | [0m 0.1665  [0m | [0m 0.2079  [0m | [0m 0.8057  [0m | [0m 0.4572  [0m | [0m 5.424   [0m | [0m 962.3   [0m | [0m 62.04   [0m |
| [0m 5       [0m | [0m-0.1158  [0m | 

DTW LGBM Models Bayes: 100%|█████████████████████| 2/2 [20:22<00:00, 611.16s/it]


In [177]:
# The optimizer reports every parameter as a float; for each set of model params, cast the
# parameters that must be integers (rounded to the nearest integer) in place
for n in range(len(dtw_clust_mods_bayes)):
    for int_param in ("max_depth", "n_estimators", "num_leaves"):
        dtw_clust_mods_bayes[n][int_param] = int(round(dtw_clust_mods_bayes[n][int_param]))

In [178]:
# Loop through the clusters, and using the params found by the optimizer, train a final model for each cluster.
# Save to a list of models.
# Each task receives that cluster's tuned params plus the first lag_n+1 columns of its train_val
# frame with incomplete rows dropped: columns 1.. go in as the second argument and column 0 as the
# third (presumably features and target - confirm against train_lgbm).
# The one-element inner `for` binds the cleaned frame once per cluster, so the slice + dropna is
# no longer recomputed separately for each of the two arguments.
with tqdm_joblib(tqdm(desc="DTW LGBM Models Bayes Final", 
                      total=len(dtw_clust_mods_bayes))) as progress_bar:
    dtw_clust_mods_bayes_final = Parallel(n_jobs=2)(
        delayed(train_lgbm)(params, lagged.iloc[:, 1:], lagged.iloc[:, 0])
        for i, params in enumerate(dtw_clust_mods_bayes)
        for lagged in [train_val_df_dtw_clust_ls[i].iloc[:, 0:(lag_n + 1)].dropna()]
    )

DTW LGBM Models Bayes Final:  50%|████████        | 1/2 [00:37<00:37, 37.49s/it]

|   iter    |  target   | colsam... | lambda_l1 | lambda_l2 | learni... | max_depth | n_esti... | num_le... |
-------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-0.1471  [0m | [0m 0.9205  [0m | [0m 0.6238  [0m | [0m 0.7918  [0m | [0m 0.2149  [0m | [0m 14.49   [0m | [0m 472.1   [0m | [0m 21.99   [0m |
| [95m 2       [0m | [95m-0.146   [0m | [95m 0.7999  [0m | [95m 0.4889  [0m | [95m 0.05053 [0m | [95m 0.2692  [0m | [95m 2.954   [0m | [95m 827.3   [0m | [95m 149.3   [0m |
| [0m 3       [0m | [0m-0.163   [0m | [0m 0.6748  [0m | [0m 0.9459  [0m | [0m 0.7367  [0m | [0m 0.411   [0m | [0m 4.633   [0m | [0m 761.5   [0m | [0m 125.0   [0m |
| [0m 4       [0m | [0m-0.1788  [0m | [0m 0.1665  [0m | [0m 0.2079  [0m | [0m 0.8057  [0m | [0m 0.4572  [0m | [0m 5.424   [0m | [0m 962.3   [0m | [0m 62.04   [0m |
| [0m 5       [0m | [0m-0.1595  [0m | 

DTW LGBM Models Bayes Final: 100%|████████████████| 2/2 [01:20<00:00, 40.46s/it]


In [179]:
# Write each final model to its own file, named after the model's position in the list
for model_no in range(len(dtw_clust_mods_bayes_final)):
    joblib.dump(dtw_clust_mods_bayes_final[model_no],
                f"Results/Global/LightGBM Bayes/DTW/model_{model_no}")

In [180]:
# Reload the final models from disk, one per train_val cluster frame, replacing the in-memory list.
# joblib.load unpickles its input, so only point it at files written by this notebook.
dtw_clust_mods_bayes_final = [
    joblib.load(f"Results/Global/LightGBM Bayes/DTW/model_{model_no}")
    for model_no in range(len(train_val_df_dtw_clust_ls))
]

In [181]:
# For each model and train_val data used to train the model, compute the residuals. Save the residual list for
# each model as an entry in a list.
# The inputs are the first lag_n+1 columns of the cluster's train_val frame with incomplete rows
# dropped (columns 1.. as the second argument, column 0 as the third).
# The one-element inner `for` binds the cleaned frame once per cluster, so the slice + dropna is
# no longer recomputed separately for each of the two arguments.
with tqdm_joblib(tqdm(desc="DTW LGBM Models Bayes Residuals", 
                      total=len(dtw_clust_mods_bayes_final))) as progress_bar:
    dtw_clust_mods_bayes_resid = Parallel(n_jobs=3)(
        delayed(compute_lgbm_residuals)(model, lagged.iloc[:, 1:], lagged.iloc[:, 0])
        for i, model in enumerate(dtw_clust_mods_bayes_final)
        for lagged in [train_val_df_dtw_clust_ls[i].iloc[:, 0:(lag_n + 1)].dropna()]
    )

DTW LGBM Models Bayes Residuals:   0%|                    | 0/2 [00:00<?, ?it/s]



DTW LGBM Models Bayes Residuals: 100%|████████████| 2/2 [00:14<00:00,  7.07s/it]


In [182]:
# Tabulate the residuals: one row per cluster (numbered from 1, in list order),
# with that cluster's whole residual sequence stored in the 'residual' cell
dtw_clust_res_df = pd.DataFrame({
    'cluster': list(range(1, len(dtw_clust_mods_bayes_resid) + 1)),
    'residual': list(dtw_clust_mods_bayes_resid),
})

dtw_clust_res_df.head()

Unnamed: 0,cluster,residual
0,1,"[8.044222078269485, 12.507955502283068, -28.70..."
1,2,"[-2.254609650429302, 27.416773801694262, -23.1..."


In [183]:
# Save the per-cluster residuals to csv (row index omitted).
# NOTE(review): each 'residual' cell holds a whole sequence, which ends up in the file as text -
# confirm that whatever reads this file parses it back into numbers
dtw_clust_res_df.to_csv("Results/Global/LightGBM Bayes/DTW/residual.csv", index=False)

In [184]:
# For each cluster, run that cluster's final model over the cluster's test data frame
# (lag_n is forwarded to compute_lgbm_test_preds); results come back as a list in cluster order
with tqdm_joblib(tqdm(desc="DTW LGBM Models Bayes Test Preds", 
                      total=len(dtw_clust_mods_bayes_final))) as progress_bar:
    dtw_clust_mods_bayes_test_preds = Parallel(n_jobs=2)(
        delayed(compute_lgbm_test_preds)(model, test_df_full_dtw_clust_ls[i], lag_n)
        for i, model in enumerate(dtw_clust_mods_bayes_final)
    )

DTW LGBM Models Bayes Test Preds: 100%|███████████| 2/2 [00:05<00:00,  2.90s/it]


In [185]:
# Create one data frame of test preds from the list created above.
# A single pd.concat replaces the earlier DataFrame.append loop; row labels are kept exactly as they
# come from each cluster's frame (ignore_index is not set).
dtw_clust_bayes_test_preds_df = pd.concat(dtw_clust_mods_bayes_test_preds)

In [186]:
# Score the stacked test predictions against the full test data.
# The cells below read 'rmse', 'mae' and 'mean' columns from the result.
dtw_clust_bayes_test_perf = compute_lgbm_test_perf(
    dtw_clust_bayes_test_preds_df,
    test_df_full_dtw,
)

In [187]:
# Compute normalized performance metrics as well: each error metric divided by the 'mean' column
for scaled_col, error_col in (('nrmse', 'rmse'), ('smae', 'mae')):
    dtw_clust_bayes_test_perf[scaled_col] = (
        dtw_clust_bayes_test_perf[error_col] / dtw_clust_bayes_test_perf['mean']
    )

In [188]:
# Show the column means of the performance data frame (raw and normalized metrics alike)
dtw_clust_bayes_test_perf.mean()

rmse      30.315943
mae       20.017162
mean     265.435072
nrmse      0.136072
smae       0.091482
dtype: float64

In [189]:
# For each model/cluster, compute bootstrap PIs for every test-set prediction, using that
# cluster's residuals, and collect the resulting PI data frames in a list
# (n_boot is presumably the number of bootstrap resamples -- confirm against the helper).
dtw_clust_test_pred_int = [
    compute_lgbm_boostrap_int(clust_preds, dtw_clust_mods_bayes_resid[clust_pos], n_boot)
    for clust_pos, clust_preds in enumerate(dtw_clust_mods_bayes_test_preds)
]

Pandas Apply:   0%|          | 0/33600 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/68544 [00:00<?, ?it/s]

In [190]:
# Attach the observed values to each cluster's PI frame as an 'actual' column.
# The values come from the first column of test_df_full_dtw for the rows whose cluster
# label equals n (labels are taken to be 1..number of PI frames) and are assigned as a
# plain list, i.e. purely by position.
# NOTE(review): this assumes the rows of each cluster in test_df_full_dtw are in the same
# order as that cluster's PI frame and that the first column is the target -- confirm.
for n, clust_pi_df in enumerate(dtw_clust_test_pred_int, start=1):
    clust_pi_df['actual'] = (
        test_df_full_dtw
        .query("cluster==@n")
        .copy()
        .iloc[:, 0]
        .to_list()
    )

In [191]:
# Stack the per-cluster PI frames row-wise into a single data frame
# (pd.concat is used here in place of an earlier DataFrame.append loop).
dtw_clust_test_pred_int_df = pd.concat(objs=dtw_clust_test_pred_int, axis=0)

In [192]:
# Score every observation's 95% and 80% prediction interval with interval_score and
# store the results in 'int_95_score' and 'int_80_score' respectively.
# NOTE(review): the coverage level (0.95 / 0.80) is what gets passed as the last
# argument -- confirm that interval_score expects the coverage rather than
# alpha = 1 - coverage.
for pi_score_col, pi_lower_col, pi_upper_col, pi_level in (
    ('int_95_score', 'lo_95', 'hi_95', 0.95),
    ('int_80_score', 'lo_80', 'hi_80', 0.80),
):
    dtw_clust_test_pred_int_df[pi_score_col] = interval_score(
        dtw_clust_test_pred_int_df['actual'],
        dtw_clust_test_pred_int_df[pi_lower_col],
        dtw_clust_test_pred_int_df[pi_upper_col],
        pi_level,
    )

In [193]:
# Show the column-wise average of the PI frame; the mean interval scores are the
# 'int_95_score' and 'int_80_score' entries of the result
dtw_clust_test_pred_int_df.mean(axis="index")

ts_index         38.500000
test_preds      265.597540
lo_95           215.006362
hi_95           317.885495
lo_80           238.362433
hi_80           293.575718
actual          265.435072
int_95_score    202.360985
int_80_score    114.056671
dtype: float64

In [194]:
# Average the interval scores and the actual values per ts_index, then scale each mean
# score by the mean actual value of the same ts_index
dtw_clust_test_pred_int_df_grouped = (
    dtw_clust_test_pred_int_df
    .groupby("ts_index")
    .agg({"int_95_score":"mean", "int_80_score":"mean", "actual":"mean"})
    .reset_index()
    .assign(
        int_95_score_scaled=lambda grp: grp['int_95_score'] / grp['actual'],
        int_80_score_scaled=lambda grp: grp['int_80_score'] / grp['actual'],
    )
)

In [195]:
# Show the average scaled interval score across all ts_index groups
dtw_clust_test_pred_int_df_grouped.loc[:, ['int_80_score_scaled', 'int_95_score_scaled']].mean(axis="index")

int_80_score_scaled    0.536127
int_95_score_scaled    0.923342
dtype: float64

In [196]:
# Write the combined PI data frame (intervals, actuals and interval scores) to disk,
# leaving the row index out of the file
dtw_clust_test_pred_int_df.to_csv(
    path_or_buf="Results/Global/LightGBM Bayes/DTW/test_pred_intervals.csv",
    index=False,
)