In [1]:
# If the libraries are not yet installed, they can be installed in this notebook using commands similar to the below
# %conda install numpy
# %conda install pandas
# %conda install matplotlib
# %conda install scikit-learn
# %conda install -c conda-forge lightgbm 
# %conda install -c conda-forge swifter
# %conda install -c conda-forge scipy
# %conda install joblib
# %conda install tqdm

# Something like the following may also work if the above does not
# import sys
# !conda install --yes --prefix {sys.prefix} numpy
# !conda install --yes --prefix {sys.prefix} pandas
# !conda install --yes --prefix {sys.prefix} scikit-learn
# !conda install -c conda-forge --yes --prefix {sys.prefix} lightgbm
# !conda install -c conda-forge --yes --prefix {sys.prefix} swifter
# !conda install -c conda-forge --yes --prefix {sys.prefix} scipy 
# !conda install --yes --prefix {sys.prefix} joblib
# !conda install --yes --prefix {sys.prefix} tqdm

# To install a specific version, add the version to the install command
# E.g., %conda install numpy=1.20.3

# If all else fails, use pip or follow additional advice such as found at
# https://jakevdp.github.io/blog/2017/12/05/installing-python-packages-from-jupyter/

# If you plan to use pip (especially if you are not working within a specified conda environment),
# the pip commands might look like:
# pip install numpy
# pip install pandas
# pip install scikit-learn
# pip install lightgbm
# pip install swifter
# pip install scipy
# pip install joblib
# pip install tqdm

# To install a specific version, add the version to the pip install command
# E.g., pip install numpy==1.20.3

In [2]:
import time
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import itertools
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import glob
from lightgbm import LGBMRegressor
import random
from sklearn.model_selection import ParameterSampler
import scipy
import gc
from joblib import Parallel, delayed
import contextlib
import joblib
from tqdm import tqdm
import os

In [3]:
np.random.seed(54321)
random.seed(54321)

In [4]:
# Make sure every results folder exists before anything is written to it
# (exist_ok=True means nothing happens for folders that are already there)
for results_dir in (
    "Results/Global/LightGBM Default/Full",
    "Results/Global/LightGBM Default/Random Cluster",
    "Results/Global/LightGBM Default/Highway System",
    "Results/Global/LightGBM Default/Catch22 KMeans",
    "Results/Global/LightGBM Default/TSFeat KMeans",
    "Results/Global/LightGBM Default/DTW",
):
    os.makedirs(results_dir, exist_ok=True)

# Read in Data and Prepare for Modeling

In [5]:
# Collect one dataframe per Highways England csv file.
# Files are visited in alphabetical order; later cells pair the loaded frames with other
# data by their position in the list.
england_df_list = []

for fname in sorted(glob.glob("Data/Processed/Highways_England/*.csv")):
    print("Reading {}".format(fname))
    # Read the file and order its rows by the timestamp column (used as read, not parsed as dates)
    df = pd.read_csv(fname).sort_values(by="timestamp")
    england_df_list.append(df)

Reading Data/Processed/Highways_England/A11-6310-1_Southbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A11-6312-2_Northbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A14-1107A_Eastbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A14-1144B_Westbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A1M-9842B_Southbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A1M-9847a_Northbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A46-7636-1_Northbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A46-7636-2_Southbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A47-6337-1_Westbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A47-6337-2_Eastbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A5-6847-2_Southbound_2019_Processed.csv
Reading Data/Processed/Highways_England/A5-7572-1-Northbound_2019_Processed.csv
Reading Data/Processed/Highways_Englan

In [6]:
# Collect one dataframe per Portland csv file, visiting the files in alphabetical order
portland_df_list = []

for fname in sorted(glob.glob("Data/Processed/Portland/*.csv")):
    print("Reading {}".format(fname))
    # Read the file and order its rows by the timestamp column (used as read, not parsed as dates)
    df = pd.read_csv(fname).sort_values(by="timestamp")
    portland_df_list.append(df)

Reading Data/Processed/Portland/I205-101068_Northbound_2019_Processed.csv
Reading Data/Processed/Portland/I205-101073_Southbound_2019_Processed.csv
Reading Data/Processed/Portland/I405-100395_Southbound_2019_Processed.csv
Reading Data/Processed/Portland/I405-100527_Northbound_2019_Processed.csv
Reading Data/Processed/Portland/I5-100688_Southbound_2019_Processed.csv
Reading Data/Processed/Portland/I5-100703_Northbound_2019_Processed.csv
Reading Data/Processed/Portland/I84-101108_Eastbound_2019_Processed.csv
Reading Data/Processed/Portland/I84-101161_Westbound_2019_Processed.csv
Reading Data/Processed/Portland/OR217-100300_Southbound_2019_Processed.csv
Reading Data/Processed/Portland/OR217-100314_Northbound_2019_Processed.csv
Reading Data/Processed/Portland/R2 Delta Hwy-101745_Northbound_2019_Processed.csv
Reading Data/Processed/Portland/R2 OR18-102111_Westbound_2019_Processed.csv
Reading Data/Processed/Portland/R2 OR18-102113_Eastbound_2019_Processed.csv
Reading Data/Processed/Portland/

In [7]:
# Collect one dataframe per Utah csv file, visiting the files in alphabetical order
utah_df_list = []

for fname in sorted(glob.glob("Data/Processed/Utah/*.csv")):
    print("Reading {}".format(fname))
    # Read the file and order its rows by the timestamp column (used as read, not parsed as dates)
    df = pd.read_csv(fname).sort_values(by="timestamp")
    utah_df_list.append(df)

Reading Data/Processed/Utah/I15-3103178_Southbound_2019_Processed.csv
Reading Data/Processed/Utah/I15-749_Northbound_2019_Processed.csv
Reading Data/Processed/Utah/I215-134_Counterclockwise_2019_Processed.csv
Reading Data/Processed/Utah/I215-31_Clockwise_2019_Processed.csv
Reading Data/Processed/Utah/I70-3103400_Westbound_2019_Processed.csv
Reading Data/Processed/Utah/I70-3103401_Eastbound_2019_Processed.csv
Reading Data/Processed/Utah/I80-600_Eastbound_2019_Processed.csv
Reading Data/Processed/Utah/I80-667_Westbound_2019_Processed.csv
Reading Data/Processed/Utah/I84-451_Eastbound_2019_Processed.csv
Reading Data/Processed/Utah/I84-482_Westbound_2019_Processed.csv
Reading Data/Processed/Utah/LegacyParkway-810_Northbound_2019_Processed.csv
Reading Data/Processed/Utah/LegacyParkway-890_Southbound_2019_Processed.csv
Reading Data/Processed/Utah/US189-260_Westbound_2019_Processed.csv
Reading Data/Processed/Utah/US189-470_Eastbound_2019_Processed.csv
Reading Data/Processed/Utah/US40-634_Westb

In [8]:
# Append all df lists together into one
total_df_list = england_df_list + portland_df_list + utah_df_list

In [9]:
# Read in the start and end points csv.
# Only "start" is shifted by 1. The values are presumably 1-based row numbers produced in R
# (TODO confirm); python slices are 0-based with an exclusive stop, so "end" can be used as the
# slice stop exactly as stored and is deliberately left untouched.
start_end = pd.read_csv("start_end_points.csv")
start_end["start"] = start_end["start"] - 1

In [10]:
# Create an empty list to hold the subset data frames (those with only 12 weeks of data per highway)
subset_df_list = list()

In [11]:
# (Re)create the output list here so that re-running this cell does not append a second copy
# of every subset to a list left over from an earlier run
subset_df_list = list()

# For each df in our original total df list
for idx, df in enumerate(total_df_list):

    # Look up this series' window by column name (the same way the start/end cell addresses
    # these fields) rather than assuming "start" and "end" are the first two csv columns
    row_start = start_end["start"].iloc[idx]
    row_end = start_end["end"].iloc[idx]

    # Filter the timeframe based on the start_end_points csv file, then number the kept rows
    # 0, 1, 2, ... in a new "rn" column
    subset_df = (
        df.iloc[row_start:row_end]
        .reset_index(drop=True)
        .reset_index(drop=False)
        .rename(columns={"index": "rn"})
    )

    # Create a new field called train_val_test to differentiate each set of data.
    # Rows before 96*7*8 are "train", rows before 96*7*10 are "val", the rest are "test"
    # (i.e. 8 weeks / 2 weeks / remainder if there are 96 rows per day)
    subset_df["train_val_test"] = np.where(
        subset_df["rn"] < (96*7*8),
        "train",
        np.where(subset_df["rn"] < (96*7*10), "val", "test"),
    )

    # Append to list
    subset_df_list.append(subset_df)

In [12]:
# Build a slimmed-down frame per time series that holds only the fields used for modeling
model_df_list = []

for df in subset_df_list:
    # Keep the timestamp, the volume and the train/val/test label ...
    kept_fields = df[['timestamp', 'total_volume', "train_val_test"]]
    # ... and rename the first two to 'start' and 'target'
    model_df = kept_fields.rename(columns={'timestamp':'start', 'total_volume':'target'})

    model_df_list.append(model_df)

# Helper Function

In [13]:
# Code for progress bar:
# https://stackoverflow.com/questions/24983493/tracking-progress-of-joblib-parallel-execution
# This allows us to print a progress bar while running parallel loops using joblib 

@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Context manager to patch joblib to report into tqdm progress bar given as argument.

    While the ``with`` block is active, ``joblib.parallel.BatchCompletionCallBack`` is replaced
    by a subclass that advances ``tqdm_object`` each time it is called. On exit (including when
    the block raises) the original class is put back and the progress bar is closed.

    Yields the same ``tqdm_object`` that was passed in.
    """
    # Subclass of joblib's callback that updates the bar before deferring to joblib's own logic
    class TqdmBatchCompletionCallback(joblib.parallel.BatchCompletionCallBack):
        def __call__(self, *args, **kwargs):
            # Advance the bar by this callback's batch_size, then run the original callback
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    # Keep a handle on the original class so it can be restored afterwards
    old_batch_callback = joblib.parallel.BatchCompletionCallBack
    joblib.parallel.BatchCompletionCallBack = TqdmBatchCompletionCallback
    try:
        yield tqdm_object
    finally:
        # Always undo the patch and close the bar, even if the with-block failed
        joblib.parallel.BatchCompletionCallBack = old_batch_callback
        tqdm_object.close()

## Create Lag Embedded Matrices for each TS

In [14]:
# Lag embed the data frames and save to a list
lag_embed_df_list = list()

# The max lag we will test
max_lag = 960

for df in model_df_list:
    # For each df in our list, build every lagged copy of the target first: column "target-n"
    # holds the target shifted down by n rows (so its first n values are NaN).
    # Creating all the columns in one frame and concatenating once avoids inserting 960 columns
    # one at a time, which fragments the frame and triggers pandas' PerformanceWarning.
    lag_cols = pd.DataFrame(
        {f"target-{n}": df['target'].shift(n) for n in range(1, max_lag + 1)},
        index=df.index,
    )

    # Original columns first, then target-1 ... target-960, in that order.
    # A new frame is produced, so the frames in model_df_list are left unmodified.
    lag_embed_df_list.append(pd.concat([df, lag_cols], axis=1))

  df[name] = df['target'].shift(n)


In [15]:
# Break every lag embedded frame into its train, val, and test rows

# One output list per split
train_df_list = []
val_df_list = []
test_df_list = []

for i in range(len(lag_embed_df_list)):
    # assign() returns a new frame, so the lag embedded frame in the list is not modified.
    # ts_index is i+1 so that it can be joined with the clustering data from R
    df = lag_embed_df_list[i].assign(ts_index=i + 1)

    # Pick out each split's rows using the train_val_test label
    train_df = df[df['train_val_test'] == 'train'].copy()
    val_df = df[df['train_val_test'] == 'val'].copy()
    test_df = df[df['train_val_test'] == 'test'].copy()

    # Store each piece in its list
    train_df_list.append(train_df)
    val_df_list.append(val_df)
    test_df_list.append(test_df)

In [16]:
# Concat all dfs from the lists together to create one full train, val, and test df
train_df_full = pd.concat(train_df_list)
val_df_full = pd.concat(val_df_list)
test_df_full = pd.concat(test_df_list)

In [17]:
# Remove the timestamp ('start') and split-label ('train_val_test') columns from each full frame
train_df_full = train_df_full.drop(columns=['start', 'train_val_test'])
val_df_full = val_df_full.drop(columns=['start', 'train_val_test'])
test_df_full = test_df_full.drop(columns=['start', 'train_val_test'])

In [18]:
# Stack the training and validation data (training rows first) for later use.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# like append, it keeps each frame's existing row index.
train_val_df_full = pd.concat([train_df_full, val_df_full])

In [19]:
# Drop the names of the intermediate lists so their memory can be reclaimed
del (
    train_df_list,
    val_df_list,
    test_df_list,
    lag_embed_df_list,
    model_df_list,
    subset_df_list,
    total_df_list,
    england_df_list,
    portland_df_list,
    utah_df_list,
)

In [20]:
# Force garbage collection to free up memory
gc.collect()

0

In [21]:
train_df_full.shape[0]/76/96

56.0

In [22]:
train_val_df_full.shape[0]/76/96

70.0

In [23]:
val_df_full.shape[0]/76/96

14.0

In [24]:
test_df_full.shape[0]/76/96

14.0

# Use Train-Val Data to Validate the 840 Lag Embedding Choice

In [25]:
# Create a list of all lag embeddings to test - they are generally multiples of 96 (or 96*1.25), the seasonality
lag_embed_list = [1,2,4,24,48,60,96,120,192,240,288,360,384,480,576,600,672,720,768,840,960]

In [26]:
# Define a function to train a LightGBM model given some data and a lag value
def train_lgbm_lag(lag, data):
    """Fit a LightGBM regressor on the first ``lag`` lagged features of ``data``.

    Parameters
    ----------
    lag : int
        Number of lag columns to use as features.
    data : pandas.DataFrame
        Lag embedded training frame whose column 0 is the target and whose columns
        1..lag are the lagged features used here.

    Returns
    -------
    LGBMRegressor
        The fitted model.
    """
    # Keep the target plus the first `lag` feature columns and drop rows with any missing
    # value (the rows whose lags are not all available). This is done once and both X and y
    # are sliced from the same result instead of repeating the slice + dropna for each.
    lagged = data.iloc[:, 0:(lag+1)].dropna()
    y_train = lagged.iloc[:, 0]
    X_train = lagged.iloc[:, 1:]

    # Create the model using boosting type goss and a fixed random state
    lgbm_mod = LGBMRegressor(boosting_type='goss', random_state=54321)
    # Fit the model
    lgbm_mod.fit(X_train, y_train)

    # Return the fitted model
    return lgbm_mod

In [27]:
# Train one model per lag embedding of interest.
# The work is queued as joblib tasks and run by 4 parallel workers; the context manager makes
# the tqdm bar advance as tasks complete
with tqdm_joblib(tqdm(desc="Lag Embed LGBM Models", 
                      total=len(lag_embed_list))) as progress_bar:
    lag_model_jobs = (delayed(train_lgbm_lag)(lag, train_df_full) for lag in lag_embed_list)
    lag_embed_mods = Parallel(n_jobs=4)(lag_model_jobs)

Lag Embed LGBM Models: 100%|████████████████████| 21/21 [02:28<00:00,  7.07s/it]


In [28]:
# Define a function which, given a model, lag embed level, and training and validation data, returns
# average model performance over that data

def train_val_lgbm_perf_lag(model, lag, train_data, val_data):
    """Average per-series training and validation error of a fitted model at one lag embedding.

    Parameters
    ----------
    model : object
        Fitted model exposing ``predict``; it is given the first ``lag`` lag columns.
    lag : int
        The lag embedding the model was trained with.
    train_data, val_data : pandas.DataFrame
        Lag embedded frames whose column 0 is the target, whose columns 1..lag are the lagged
        features, and which contain a ``ts_index`` column identifying each time series.

    Returns
    -------
    dict
        Keys ``train_rmse``, ``train_mae``, ``train_nrmse``, ``train_smae``, ``val_rmse``,
        ``val_mae``, ``val_nrmse``, ``val_smae`` (each the mean over all time series of the
        per-series metric) and ``lag``.
    """
    metric_names = ("rmse", "mae", "nrmse", "smae")

    def _errors(y_true, y_pred):
        """Return (rmse, mae, rmse/mean(y_true), mae/mean(y_true)) for one series."""
        # Take the square root explicitly: the `squared=False` argument of mean_squared_error
        # is deprecated/removed in newer scikit-learn releases, while this form works everywhere
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        mae = mean_absolute_error(y_true, y_pred)
        target_mean = np.mean(y_true)
        return rmse, mae, rmse/target_mean, mae/target_mean

    # One list of per-series values for every reported metric, in the output key order
    collected = {f"{split}_{name}": [] for split in ("train", "val") for name in metric_names}

    # Loop through each time series index present in the training data (rather than a
    # hard-coded 1..76) so the function also works for data sets with a different number of series
    for m in train_data["ts_index"].unique():
        # Filter each frame once per series and reuse the result for both X and y
        train_sub = train_data.query("ts_index==@m")
        val_sub = val_data.query("ts_index==@m")

        # Skip the first `lag` training rows of the series: their lag features are incomplete.
        # NOTE(review): assumes each series' rows are in time order within train_data.
        y_train_sub = train_sub.iloc[lag:, 0]
        X_train_sub = train_sub.iloc[lag:, 1:(lag+1)]

        # All validation rows are used
        y_val_sub = val_sub.iloc[:, 0]
        X_val_sub = val_sub.iloc[:, 1:(lag+1)]

        # Score the model's predictions on both sets and record the per-series metrics
        train_scores = _errors(y_train_sub, model.predict(X_train_sub))
        val_scores = _errors(y_val_sub, model.predict(X_val_sub))
        for name, train_value, val_value in zip(metric_names, train_scores, val_scores):
            collected[f"train_{name}"].append(train_value)
            collected[f"val_{name}"].append(val_value)

    # Average every metric over the time series and record which lag this was
    perf_dict = {key: np.mean(values) for key, values in collected.items()}
    perf_dict["lag"] = lag

    # Return average model performance dictionary
    return perf_dict

In [29]:
# Score each lag embedding's model on the training and validation data, again with 4 parallel
# workers, collecting one performance dictionary per lag
with tqdm_joblib(tqdm(desc="Lag Embed LGBM Perf", 
                      total=len(lag_embed_list))) as progress_bar:
    lag_perf_jobs = (
        delayed(train_val_lgbm_perf_lag)(lag_mod, lag, train_df_full, val_df_full)
        for lag_mod, lag in zip(lag_embed_mods, lag_embed_list)
    )
    lag_embed_perf = Parallel(n_jobs=4)(lag_perf_jobs)

Lag Embed LGBM Perf: 100%|██████████████████████| 21/21 [01:59<00:00,  5.71s/it]


In [30]:
# Create a performance data frame from the list of performance dictionaries created in the last cell
lag_embed_perf_df = pd.DataFrame(lag_embed_perf)

In [31]:
lag_embed_perf_df.head()

Unnamed: 0,train_rmse,train_mae,train_nrmse,train_smae,val_rmse,val_mae,val_nrmse,val_smae,lag
0,39.455627,26.581157,0.183802,0.125454,39.178294,26.560165,0.177646,0.122029,1
1,38.569484,26.173492,0.180307,0.12321,38.557208,26.254801,0.175321,0.120324,2
2,36.629872,24.732698,0.175332,0.119291,36.694092,24.828349,0.170318,0.116341,4
3,34.352365,23.090392,0.16748,0.113849,34.743085,23.320557,0.163101,0.111149,24
4,33.376031,22.418565,0.163209,0.110903,33.844988,22.709957,0.159242,0.108513,48


In [32]:
# Save to a file for some inspection/plotting in R
lag_embed_perf_df.to_csv('Results/Global/LightGBM Default/lag_model_performance.csv', index=False)

# Train and Test Global Model on Full Data Set

In [33]:
# Set our final lag value to be 840
lag_n = 840

In [34]:
# Create a full X and y training set (including validation) using 840 lags.
# The slice + dropna over the full frame is done once and both y and X are taken from that
# single result, instead of repeating the same expensive operation for each of them.
train_val_lagged = train_val_df_full.iloc[:,0:(lag_n+1)].dropna()
y_train = train_val_lagged.iloc[:,0]
X_train = train_val_lagged.iloc[:,1:]

# Remove the temporary name so it does not linger in the kernel namespace
del train_val_lagged

In [35]:
# Create and fit a model to these training data
mod = LGBMRegressor(boosting_type='goss', random_state=54321)  
mod.fit(X_train,y_train)

LGBMRegressor(boosting_type='goss', random_state=54321)

In [36]:
# Save model to file to use later
filename = 'Results/Global/LightGBM Default/Full/model'
joblib.dump(mod, filename)

['Results/Global/LightGBM Default/Full/model']

In [37]:
# Function to compute model residuals to use for bootstrapping PIs
def compute_lgbm_residuals(mod, data, lag_n):
    """Compute a model's residuals (actual minus predicted) on lag embedded data.

    Parameters
    ----------
    mod : object
        Fitted model exposing ``predict``.
    data : pandas.DataFrame
        Lag embedded frame whose column 0 is the target and whose columns 1..lag_n are the
        lagged features.
    lag_n : int
        Number of lag columns to use as features.

    Returns
    -------
    list
        One residual per row that has no missing value in the target or the first
        ``lag_n`` lag columns.
    """
    # Slice and drop incomplete rows once; X and y are then guaranteed to come from the same rows
    lagged = data.iloc[:, 0:(lag_n+1)].dropna()
    X = lagged.iloc[:, 1:]
    y = lagged.iloc[:, 0]

    # Predict the y values for the given X
    pred = mod.predict(X)

    # Compute the residuals as the difference between true and predicted and convert to a list
    resid = (y - pred).to_list()

    # Return the list of residuals
    return resid

In [38]:
# Compute full model residuals using the above function
full_mod_resid = compute_lgbm_residuals(mod, train_val_df_full, lag_n)

In [39]:
# Save the residuals to a df
resid_df = pd.DataFrame({"residual": full_mod_resid})

In [40]:
# Save the residual df to a file
resid_df.to_csv("Results/Global/LightGBM Default/Full/residual.csv", index=False)

In [41]:
# Function to compute test preds
def compute_lgbm_test_preds(mod, data, lag_n):
    """Forecast every time series in a lag embedded test frame.

    Parameters
    ----------
    mod : object
        Fitted model exposing ``predict``.
    data : pandas.DataFrame
        Lag embedded test frame whose columns 1..lag_n are the lagged features and which
        contains a ``ts_index`` column identifying each time series.
    lag_n : int
        Number of lag columns to pass to the model.

    Returns
    -------
    pandas.DataFrame
        Columns ``ts_index`` and ``test_preds``, one row per input row, grouped by time series
        in order of first appearance. Each series' rows keep their own 0..n-1 index. An empty
        DataFrame is returned when ``data`` contains no time series.
    """
    # Collect one small frame per series and concatenate once at the end. This replaces
    # DataFrame.append inside the loop, which was removed in pandas 2.0 and re-copied the
    # accumulated frame on every iteration.
    pred_frames = []

    # Loop through each individual time series index in the data set
    for ts_idx in data.ts_index.unique():
        # Create the X matrix for each one
        X = data.query("ts_index==@ts_idx").iloc[:, 1:(lag_n+1)].copy()

        # Forecast for that X matrix
        preds = mod.predict(X)

        # Save the results to a temp data frame
        pred_frames.append(pd.DataFrame({"ts_index": ts_idx, "test_preds": preds}))

    # pd.concat refuses an empty list, so keep returning an empty frame in that case
    if not pred_frames:
        return pd.DataFrame()

    # Return df of all preds with corresponding ts_index column
    return pd.concat(pred_frames)

In [42]:
# Compute full data test preds using above function
full_mod_test_preds = compute_lgbm_test_preds(mod, test_df_full, lag_n)

In [43]:
# Function to compute performance metrics on test data
def compute_lgbm_test_perf(preds, data):
    """Compute per-series test error metrics.

    Parameters
    ----------
    preds : pandas.DataFrame
        Forecasts with columns ``ts_index`` and ``test_preds``.
    data : pandas.DataFrame
        Test frame whose column 0 is the actual target and which contains ``ts_index``.

    Returns
    -------
    pandas.DataFrame
        One row per time series (in order of first appearance in ``data``) with columns
        ``rmse``, ``mae`` and ``mean`` (the mean of the actual target for that series).
    """
    # Create an empty list to store model performance
    perf_ls = list()

    # For each time series index in our data set
    for ts_idx in data.ts_index.unique():
        # Get the target (actual) for that index
        y_sub = data.query("ts_index==@ts_idx").iloc[:, 0]
        # Extract the corresponding forecasts
        preds_sub = preds.query("ts_index==@ts_idx").test_preds

        # Compute rmse, mae, and the mean of the true target value for those preds.
        # The square root is taken explicitly because the `squared=False` argument of
        # mean_squared_error is deprecated/removed in newer scikit-learn releases.
        rmse_sub = np.sqrt(mean_squared_error(y_sub, preds_sub))
        mae_sub = mean_absolute_error(y_sub, preds_sub)
        mean_sub = np.mean(y_sub)

        # Save those metrics to a dictionary and append it to the list
        perf_ls.append({"rmse": rmse_sub, "mae": mae_sub, "mean": mean_sub})

    # Return a data frame of model performance created from the list of dictionaries
    return pd.DataFrame(perf_ls)

In [44]:
# Per-series test metrics for the full model
full_mod_test_perf = compute_lgbm_test_perf(full_mod_test_preds, test_df_full)

# Add the scaled metrics: each error divided by that series' mean actual value
full_mod_test_perf = full_mod_test_perf.assign(
    nrmse=full_mod_test_perf['rmse']/full_mod_test_perf['mean'],
    smae=full_mod_test_perf['mae']/full_mod_test_perf['mean'],
)

In [45]:
# Print the means of model perf metrics
full_mod_test_perf.mean()

rmse      31.303586
mae       20.764927
mean     265.435072
nrmse      0.142768
smae       0.096702
dtype: float64

In [46]:
# Function to compute pred intervals with bootstrap method
def compute_lgbm_boostrap_int(preds, resid, n_boot):
    """Add bootstrap 95% and 80% prediction intervals to a frame of forecasts.

    For every forecast, ``n_boot`` residuals are drawn with replacement, added to the forecast,
    and the 2.5/97.5 and 10/90 percentiles of the result are taken as interval bounds.

    Parameters
    ----------
    preds : pandas.DataFrame
        Must contain a ``test_preds`` column. It is modified in place: columns ``lo_95``,
        ``hi_95``, ``lo_80`` and ``hi_80`` are added (or overwritten).
    resid : sequence of float
        Model residuals to resample from.
    n_boot : int
        Number of bootstrap draws per forecast.

    Returns
    -------
    pandas.DataFrame
        The same ``preds`` object, with the four interval columns filled in.

    Notes
    -----
    Both ``random`` and ``numpy.random`` are re-seeded with 54321 on every call, so results are
    reproducible but the global random state is reset as a side effect.
    """
    # Set seeds for reproducibility
    random.seed(54321)
    np.random.seed(54321)

    # Convert the residuals to an array once, instead of letting np.random.choice convert the
    # (potentially very long) list again for every single row
    resid_pool = np.asarray(resid)

    # Forecasts are read by column name and the bounds are written by column name, so the
    # function no longer depends on `preds` having exactly two columns in a fixed order
    point_forecasts = preds['test_preds'].to_numpy()

    # Percentiles, in the order of the output columns lo_95, hi_95, lo_80, hi_80
    percentile_levels = [2.5, 97.5, 10, 90]
    bounds = np.empty((len(point_forecasts), len(percentile_levels)))

    # For each forecast (rows are processed in order, one draw of n_boot residuals per row)
    for row, pred_n in enumerate(point_forecasts):
        # Sample with replacement n_boot times from the residuals
        resid_boot = np.random.choice(resid_pool, size=n_boot, replace=True)
        # Percentiles of the residual+forecast vector give this row's interval bounds
        bounds[row, :] = np.percentile(resid_boot + pred_n, percentile_levels)

    # Write all bounds at once rather than cell by cell
    preds['lo_95'] = bounds[:, 0]
    preds['hi_95'] = bounds[:, 1]
    preds['lo_80'] = bounds[:, 2]
    preds['hi_80'] = bounds[:, 3]

    # Return the updated preds data frame
    return preds

In [47]:
# Compute PIs with 1000 bootstrap samples
full_mod_boot_ints = compute_lgbm_boostrap_int(full_mod_test_preds, full_mod_resid, 1000)

In [48]:
# Add the true values into their own df column
full_mod_boot_ints['actual'] = test_df_full.iloc[:,0].to_list()

In [49]:
full_mod_boot_ints.head()

Unnamed: 0,ts_index,test_preds,lo_95,hi_95,lo_80,hi_80,actual
0,1,325.434391,268.889593,387.638256,299.369989,354.752278,320.0
1,1,326.44285,269.158972,388.523561,296.301457,355.085566,339.0
2,1,343.486459,282.590215,411.880372,312.747231,372.239508,349.0
3,1,347.672272,287.013922,412.413097,319.303457,378.704519,343.0
4,1,343.833946,278.444722,404.274194,310.302711,373.20713,343.0


In [50]:
# Create a function to compute the interval score
def interval_score(true_values, lower, upper, interval_range):
    """Compute the interval score of each prediction interval.

    Parameters
    ----------
    true_values, lower, upper : array-like
        The observed values and the lower/upper bounds of their prediction intervals
        (same length).
    interval_range : float
        Nominal coverage of the interval as a fraction, e.g. 0.95 for a 95% interval.
        A value of 1 is not supported because alpha = 1 - interval_range is used as a divisor.

    Returns
    -------
    numpy.ndarray
        One score per observation: the interval width, plus (2/alpha) times the distance by
        which the true value falls below the lower bound or above the upper bound.
        Formula: https://epiforecasts.io/scoringutils/reference/interval_score.html
    """
    # Compute alpha from the interval range
    alpha = 1-interval_range

    # Save the upper, lower, and true_values as numpy arrays for computation purposes
    upper = np.array(upper)
    lower = np.array(lower)
    true_values = np.array(true_values)

    # Width of the interval
    width = upper - lower
    # Penalty when the true value is below the interval (the comparison acts as a 0/1 indicator)
    below_penalty = (2/alpha)*(lower-true_values)*(lower > true_values)
    # Penalty when the true value is above the interval
    above_penalty = (2/alpha)*(true_values-upper)*(true_values > upper)

    # Compute the actual score for each observation
    scores = width + below_penalty + above_penalty

    # Return the scores array
    return scores

In [51]:
# Compute the 95% and 80% PI scores using the above function as new data frame columns
full_mod_boot_ints['int_95_score'] = interval_score(full_mod_boot_ints.actual, 
                                                    full_mod_boot_ints.lo_95,
                                                    full_mod_boot_ints.hi_95,
                                                    0.95)
                                                    
full_mod_boot_ints['int_80_score'] = interval_score(full_mod_boot_ints.actual, 
                                                    full_mod_boot_ints.lo_80,
                                                    full_mod_boot_ints.hi_80,
                                                    0.80)

In [52]:
# Print the means of the interval scores
full_mod_boot_ints.mean()

ts_index         38.500000
test_preds      265.875193
lo_95           201.657780
hi_95           332.379614
lo_80           235.797174
hi_80           296.879261
actual          265.435072
int_95_score    228.236174
int_80_score    124.369523
dtype: float64

In [53]:
# Average the interval scores and the actual values within each time series ...
full_mod_boot_ints_group = (
    full_mod_boot_ints
    .groupby("ts_index")[['int_95_score', 'int_80_score', 'actual']]
    .mean()
    .reset_index()
)

# ... then scale each series' mean interval score by that series' mean actual value
full_mod_boot_ints_group['int_95_score_scaled'] = (
    full_mod_boot_ints_group['int_95_score']/full_mod_boot_ints_group['actual']
)
full_mod_boot_ints_group['int_80_score_scaled'] = (
    full_mod_boot_ints_group['int_80_score']/full_mod_boot_ints_group['actual']
)

In [54]:
# Print the scaled interval score averages
full_mod_boot_ints_group[['int_80_score_scaled', 'int_95_score_scaled']].mean()

int_80_score_scaled    0.625180
int_95_score_scaled    1.166136
dtype: float64

In [55]:
# Save to csv file
full_mod_boot_ints.to_csv("Results/Global/LightGBM Default/Full/test_pred_intervals.csv", index=False)

# Train and Test on Random Clusters

In [56]:
# Read in cluster assignments for random clusters
rand_clust = pd.read_csv("Results/Clustering/Random/random_clustering_assign.csv")

In [57]:
# Create a field called cluster with the cluster assignments (for simplicity later on)
rand_clust['cluster'] = rand_clust['random_clust_assign']

In [58]:
# Function to train a LightGBM model on data with a cluster assignment
def train_lgbm_clust(data, cluster_no, lag_n):
    """Fit a LightGBM regressor on the rows of one cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Lag embedded training frame whose column 0 is the target, whose columns 1..lag_n are
        the lagged features, and which contains a ``cluster`` column.
    cluster_no : int
        The cluster whose rows are used for training.
    lag_n : int
        Number of lag columns to use as features.

    Returns
    -------
    LGBMRegressor
        The fitted model.
    """
    # Filter to the cluster, keep the target plus the first lag_n lag columns and drop rows
    # with missing values. This is done once and shared by X and y instead of repeating the
    # whole query/copy/dropna chain for each of them.
    clust_lagged = data.query("cluster==@cluster_no").iloc[:, 0:(lag_n+1)].dropna()
    X_train = clust_lagged.iloc[:, 1:]
    y_train = clust_lagged.iloc[:, 0]

    # Create and train the model
    mod = LGBMRegressor(boosting_type='goss', random_state=54321)
    mod.fit(X_train, y_train)

    # Return the fitted model
    return mod

In [59]:
# Merge the training and test data with the cluster assignments (essentially join) on ts_index.
# validate="many_to_one" makes pandas raise a MergeError if any ts_index appears more than once
# in rand_clust, instead of silently duplicating every row of that time series.
train_val_df_full_rand_clust = train_val_df_full.merge(rand_clust, on="ts_index", validate="many_to_one")
test_df_full_rand_clust = test_df_full.merge(rand_clust, on="ts_index", validate="many_to_one")

In [60]:
# Train one model per random cluster using 2 parallel workers.
# Cluster numbers run from 1 up to the number of distinct cluster labels
with tqdm_joblib(tqdm(desc="Random Cluster LGBM Models", 
                      total=len(rand_clust.cluster.unique()))) as progress_bar:
    rand_clust_model_jobs = (
        delayed(train_lgbm_clust)(train_val_df_full_rand_clust, i, lag_n)
        for i in range(1, len(rand_clust.cluster.unique())+1)
    )
    rand_clust_mods = Parallel(n_jobs=2)(rand_clust_model_jobs)

Random Cluster LGBM Models: 100%|█████████████████| 4/4 [00:36<00:00,  9.09s/it]


In [61]:
# Save the models to the appropriate directory using joblib.dump
for clust_no in range(1, len(rand_clust.cluster.unique())+1):
    filename = f'Results/Global/LightGBM Default/Random Cluster/model_{clust_no}'
    joblib.dump(rand_clust_mods[clust_no-1], filename)

In [62]:
# Load saved models into a list so that we do not need to retrain them if we are working with them at a later time

# Create an empty list
rand_clust_mods = list()

# For each cluster, load that cluster's model and append to list
for mod_no in range(1, len(rand_clust.cluster.unique())+1):
    rand_clust_mods.append(joblib.load(f'Results/Global/LightGBM Default/Random Cluster/model_{mod_no}'))

In [63]:
# Function to compute model residuals for clustered data
def compute_lgbm_resid_clust(mod, cluster_no, data, lag_n):
    """Compute the in-sample residuals of a cluster's model.

    Parameters
    ----------
    mod : fitted model exposing ``predict(X)``
    cluster_no : value compared against the ``cluster`` column of ``data``
    data : pandas DataFrame whose first column is the target and whose next
        ``lag_n`` columns are the features; must contain a ``cluster`` column
    lag_n : int, number of feature columns following the target column

    Returns
    -------
    list
        ``y - prediction`` for every row of the cluster that has no missing
        value in the target or feature columns.
    """

    # Filter to the cluster and drop incomplete rows ONCE, then split into
    # target and features (previously the whole pipeline was evaluated twice)
    model_frame = data.query("cluster==@cluster_no").iloc[:, 0:(lag_n+1)].dropna()
    y_train = model_frame.iloc[:, 0]
    X_train = model_frame.iloc[:, 1:]

    # Make predictions
    pred = mod.predict(X_train)

    # Compute residuals and return them as a plain list
    return (y_train - pred).to_list()

In [64]:
# Compute the training residuals of every cluster model in parallel.
# Model i-1 in the list belongs to cluster i.
rand_cluster_ids = range(1, len(rand_clust.cluster.unique()) + 1)
with tqdm_joblib(tqdm(desc="Random Cluster LGBM Residuals",
                      total=len(rand_cluster_ids))) as progress_bar:
    rand_clust_residuals = Parallel(n_jobs=2)(
        delayed(compute_lgbm_resid_clust)(rand_clust_mods[cluster_id - 1],
                                          cluster_id,
                                          train_val_df_full_rand_clust,
                                          lag_n)
        for cluster_id in rand_cluster_ids
    )

Random Cluster LGBM Residuals: 100%|██████████████| 4/4 [00:11<00:00,  2.95s/it]


In [65]:
# One row per cluster: the cluster number (1-based) and that cluster's list of residuals
rand_clust_res_df = pd.DataFrame({
    'cluster': list(range(1, len(rand_clust_residuals) + 1)),
    'residual': list(rand_clust_residuals),
})

# Save the residuals to file
residual_csv_path = "Results/Global/LightGBM Default/Random Cluster/residual.csv"
rand_clust_res_df.to_csv(residual_csv_path, index=False)

In [66]:
# Function to compute clustered test preds
def compute_lgbm_test_preds_clust(mod, cluster_no, data, lag_n):
    """Predict on the test rows of one cluster, one time series at a time.

    Parameters
    ----------
    mod : fitted model exposing ``predict(X)``
    cluster_no : value compared against the ``cluster`` column of ``data``
    data : pandas DataFrame with ``cluster`` and ``ts_index`` columns; columns
        at positions 1..lag_n are used as the model features
    lag_n : int, number of feature columns

    Returns
    -------
    pandas.DataFrame
        Columns ``ts_index`` and ``test_preds``. Series appear in order of
        first appearance in ``data``; each series keeps its own 0-based index.
        An empty DataFrame is returned when the cluster has no rows.
    """

    # Subset the test data to the provided cluster number
    cluster_data = data.query("cluster==@cluster_no")

    # Collect one small frame per time series and concatenate once at the end.
    # (DataFrame.append was removed in pandas 2.0 and re-copies the whole
    # accumulated frame on every iteration.)
    pred_frames = []

    # sort=False keeps the series in order of first appearance, as unique() would
    for ts_idx, ts_rows in cluster_data.groupby("ts_index", sort=False):

        # Feature columns for this time series
        X = ts_rows.iloc[:, 1:(lag_n+1)]

        # Compute predictions for that time series
        preds = mod.predict(X)

        # Store the results together with the series identifier
        pred_frames.append(pd.DataFrame({"ts_index": ts_idx, "test_preds": preds}))

    # pd.concat raises on an empty list, so handle the empty cluster explicitly
    if not pred_frames:
        return pd.DataFrame()

    # Return the data frame of model predictions
    return pd.concat(pred_frames)

In [67]:
# Compute the test predictions of every cluster with the function above.
# joblib runs the clusters in parallel and tqdm_joblib shows a progress bar.
rand_cluster_ids = range(1, len(rand_clust.cluster.unique()) + 1)
with tqdm_joblib(tqdm(desc="Random Cluster LGBM Test Preds",
                      total=len(rand_cluster_ids))) as progress_bar:
    rand_clust_test_preds = Parallel(n_jobs=2)(
        delayed(compute_lgbm_test_preds_clust)(rand_clust_mods[cluster_id - 1],
                                               cluster_id,
                                               test_df_full_rand_clust,
                                               lag_n)
        for cluster_id in rand_cluster_ids
    )

Random Cluster LGBM Test Preds: 100%|█████████████| 4/4 [00:04<00:00,  1.09s/it]


In [68]:
# Combine the per-cluster prediction data frames into a single data frame.
# pd.concat does this in one pass; DataFrame.append (used here previously) was
# removed in pandas 2.0 and copied the accumulated frame on every iteration.
rand_clust_test_preds_df = pd.concat(rand_clust_test_preds)

# Compute model performance for the clustered predictions
rand_clust_test_perf = compute_lgbm_test_perf(rand_clust_test_preds_df,
                                              test_df_full_rand_clust)

In [69]:
rand_clust_test_perf.head()

Unnamed: 0,rmse,mae,mean
0,25.586654,18.727047,259.136161
1,25.986096,17.794514,195.970982
2,24.460902,17.215643,207.25
3,38.779658,24.815757,395.383929
4,45.574871,30.017939,445.119048


In [70]:
# Scale RMSE and MAE by the 'mean' column to get normalized performance columns
rand_clust_test_perf['nrmse'] = rand_clust_test_perf['rmse'].div(rand_clust_test_perf['mean'])
rand_clust_test_perf['smae'] = rand_clust_test_perf['mae'].div(rand_clust_test_perf['mean'])

In [71]:
# Print the means
rand_clust_test_perf.mean()

rmse      31.510284
mae       20.919646
mean     265.435072
nrmse      0.142858
smae       0.096716
dtype: float64

In [72]:
# Parallel loop through clusters and compute bootstrap PIs - save dfs to a list.
# Entry i-1 of the predictions/residuals lists belongs to cluster i.
# (The progress-bar label previously read "Residuals", a copy-paste leftover.)
with tqdm_joblib(tqdm(desc="Random Cluster LGBM PI", 
                      total=len(rand_clust.cluster.unique()))) as progress_bar:
    rand_clust_test_pred_int = Parallel(n_jobs=4)(delayed(compute_lgbm_boostrap_int)(rand_clust_test_preds[i-1], 
                                                                                     rand_clust_residuals[i-1],
                                                                                     1000) for i in range(1, len(rand_clust.cluster.unique())+1))

Random Cluster LGBM Residuals: 100%|██████████████| 4/4 [03:17<00:00, 49.28s/it]


In [73]:
# For each cluster
for n in range(1, len(rand_clust_test_pred_int)+1):
    
    # Get the test rows and the actual y values (first column) for that cluster
    actual_rows = test_df_full_rand_clust.query("cluster==@n")
    y_actual_sub = actual_rows.iloc[:,0].to_list()
    
    # The actuals are attached by position, so make sure the PI rows and the
    # test rows list the time series in exactly the same order
    assert np.array_equal(rand_clust_test_pred_int[n-1]['ts_index'].to_numpy(),
                          actual_rows['ts_index'].to_numpy()), \
        f"PI rows and test rows are not aligned for cluster {n}"
    
    # Add the actual values to the data frame of PIs
    rand_clust_test_pred_int[n-1]['actual'] = y_actual_sub

In [74]:
# Add the 95% and 80% interval scores as new columns of every cluster's PI data frame
for pi_df in rand_clust_test_pred_int:
    pi_df['int_95_score'] = interval_score(pi_df['actual'], pi_df['lo_95'], pi_df['hi_95'], 0.95)
    pi_df['int_80_score'] = interval_score(pi_df['actual'], pi_df['lo_80'], pi_df['hi_80'], 0.80)

In [75]:
# Combine the list of per-cluster PI data frames into one data frame with a single pd.concat call
rand_clust_test_pred_int_df = pd.concat(rand_clust_test_pred_int)

In [76]:
rand_clust_test_pred_int_df.mean()

ts_index         38.500000
test_preds      265.759651
lo_95           206.032728
hi_95           327.081947
lo_80           236.327738
hi_80           296.040683
actual          265.435072
int_95_score    226.430963
int_80_score    124.122031
dtype: float64

In [77]:
# Per time series: mean 95% score, mean 80% score and mean actual value
rand_clust_test_pred_int_df_grouped = (
    rand_clust_test_pred_int_df
    .groupby("ts_index")
    .agg({'int_95_score':'mean', 'int_80_score':'mean', 'actual':'mean'})
    .reset_index()
)

# Scale each series' mean interval score by that series' mean actual value
rand_clust_test_pred_int_df_grouped['int_95_score_scaled'] = (
    rand_clust_test_pred_int_df_grouped['int_95_score']
    .div(rand_clust_test_pred_int_df_grouped['actual'])
)
rand_clust_test_pred_int_df_grouped['int_80_score_scaled'] = (
    rand_clust_test_pred_int_df_grouped['int_80_score']
    .div(rand_clust_test_pred_int_df_grouped['actual'])
)

In [78]:
# Print the scaled interval scores
rand_clust_test_pred_int_df_grouped[['int_80_score_scaled', 'int_95_score_scaled']].mean()

int_80_score_scaled    0.603182
int_95_score_scaled    1.079982
dtype: float64

In [79]:
# Save PI df to csv
rand_clust_test_pred_int_df.to_csv("Results/Global/LightGBM Default/Random Cluster/test_pred_intervals.csv", 
                                   index=False)

# Train and Test per Highway System

In [80]:
# Drop the names from the random-cluster section that are no longer needed
del (
    rand_clust_test_pred_int_df,
    rand_clust_test_pred_int,
    y_actual_sub,
    rand_clust_test_perf,
    rand_clust_test_preds,
    rand_clust_residuals,
    rand_clust_mods,
    train_val_df_full_rand_clust,
    test_df_full_rand_clust,
    rand_clust,
    resid_df,
    rand_clust_res_df,
)

In [81]:
# Garbage collect
gc.collect()

781

In [82]:
# Assign every time series (ts_index 1..76) to a highway-system cluster, based on
# the number of files we have for each highway system: 38, 19 and 19
files_per_system = {1: 38, 2: 19, 3: 19}
highway_system_clust = pd.DataFrame({
    "ts_index": np.arange(1, 77),
    "cluster": [system for system, n_files in files_per_system.items() for _ in range(n_files)],
})

In [83]:
# Join the highway-system cluster assignments onto the training and test data via ts_index
train_val_df_full_highway_clust = pd.merge(train_val_df_full, highway_system_clust, on="ts_index")
test_df_full_highway_clust = pd.merge(test_df_full, highway_system_clust, on="ts_index")

In [84]:
# Train one model per highway-system cluster in parallel and keep the fitted models in a list
# (list position i-1 holds the model for cluster i)
highway_cluster_ids = range(1, len(highway_system_clust.cluster.unique()) + 1)
with tqdm_joblib(tqdm(desc="Highway System LGBM Models",
                      total=len(highway_cluster_ids))) as progress_bar:
    highway_clust_mods = Parallel(n_jobs=3)(
        delayed(train_lgbm_clust)(train_val_df_full_highway_clust, cluster_id, lag_n)
        for cluster_id in highway_cluster_ids
    )

Highway System LGBM Models: 100%|█████████████████| 3/3 [00:35<00:00, 11.82s/it]


In [85]:
# Write each fitted highway-system model to its own file with joblib.dump
for list_pos, clust_no in enumerate(range(1, len(highway_system_clust.cluster.unique()) + 1)):
    filename = f'Results/Global/LightGBM Default/Highway System/model_{clust_no}'
    joblib.dump(highway_clust_mods[list_pos], filename)

In [86]:
# Read the saved models back with joblib.load; the list is ordered by cluster number
highway_clust_mods = [
    joblib.load(f'Results/Global/LightGBM Default/Highway System/model_{clust_no}')
    for clust_no in range(1, len(highway_system_clust.cluster.unique()) + 1)
]

In [87]:
# Compute the training residuals of every highway-system model in parallel - save to a list
highway_cluster_ids = range(1, len(highway_system_clust.cluster.unique()) + 1)
with tqdm_joblib(tqdm(desc="Highway System LGBM Residuals",
                      total=len(highway_cluster_ids))) as progress_bar:
    highway_clust_residuals = Parallel(n_jobs=3)(
        delayed(compute_lgbm_resid_clust)(highway_clust_mods[cluster_id - 1],
                                          cluster_id,
                                          train_val_df_full_highway_clust,
                                          lag_n)
        for cluster_id in highway_cluster_ids
    )

Highway System LGBM Residuals: 100%|██████████████| 3/3 [00:09<00:00,  3.26s/it]


In [88]:
# One row per cluster: the cluster number (1-based) and that cluster's list of residuals
highway_res_df = pd.DataFrame({
    'cluster': list(range(1, len(highway_clust_residuals) + 1)),
    'residual': list(highway_clust_residuals),
})

# Save the residuals to file
residual_csv_path = "Results/Global/LightGBM Default/Highway System/residual.csv"
highway_res_df.to_csv(residual_csv_path, index=False)

In [89]:
highway_res_df.head()

Unnamed: 0,cluster,residual
0,1,"[-15.010870531107003, -1.5236122590959553, -49..."
1,2,"[28.376722820189144, -13.392492828881444, 19.8..."
2,3,"[-47.25459788313378, -15.637389705863654, -0.1..."


In [90]:
# Compute the test predictions of every highway-system cluster in parallel; the
# result is a list with one data frame of predictions per cluster
highway_cluster_ids = range(1, len(highway_system_clust.cluster.unique()) + 1)
with tqdm_joblib(tqdm(desc="Highway System LGBM Test Preds",
                      total=len(highway_cluster_ids))) as progress_bar:
    highway_clust_test_preds = Parallel(n_jobs=3)(
        delayed(compute_lgbm_test_preds_clust)(highway_clust_mods[cluster_id - 1],
                                               cluster_id,
                                               test_df_full_highway_clust,
                                               lag_n)
        for cluster_id in highway_cluster_ids
    )

Highway System LGBM Test Preds: 100%|█████████████| 3/3 [00:03<00:00,  1.12s/it]


In [91]:
# Concat all data frames of test preds into one
highway_clust_test_preds_df = pd.concat(highway_clust_test_preds)

# Compute test set model performance
highway_clust_test_perf = compute_lgbm_test_perf(highway_clust_test_preds_df,
                                                 test_df_full_highway_clust)

In [92]:
# Add the scaled metrics as new df columns: RMSE and MAE divided by the 'mean' column
highway_clust_test_perf['nrmse'] = highway_clust_test_perf['rmse'].div(highway_clust_test_perf['mean'])
highway_clust_test_perf['smae'] = highway_clust_test_perf['mae'].div(highway_clust_test_perf['mean'])

In [93]:
# Print the mean of model performance
highway_clust_test_perf.mean()

rmse      31.133296
mae       20.659435
mean     265.435072
nrmse      0.141038
smae       0.095516
dtype: float64

In [94]:
# Compute the test set PIs of every cluster in parallel, saving to a list of dataframes.
# Position p of the predictions list is paired with position p of the residuals list.
highway_cluster_positions = range(len(highway_system_clust.cluster.unique()))
with tqdm_joblib(tqdm(desc="Highway System LGBM PI",
                      total=len(highway_cluster_positions))) as progress_bar:
    highway_clust_test_pred_int = Parallel(n_jobs=3)(
        delayed(compute_lgbm_boostrap_int)(highway_clust_test_preds[pos],
                                           highway_clust_residuals[pos],
                                           1000)
        for pos in highway_cluster_positions
    )

Highway System LGBM PI: 100%|████████████████████| 3/3 [11:55<00:00, 238.48s/it]


In [95]:
# For each cluster
for n in range(1, len(highway_clust_test_pred_int)+1):
    
    # Get the test rows and the true values (first column) for that cluster
    actual_rows = test_df_full_highway_clust.query("cluster==@n")
    y_actual_sub = actual_rows.iloc[:,0].to_list()
    
    # The true values are attached by position, so make sure the PI rows and the
    # test rows list the time series in exactly the same order
    assert np.array_equal(highway_clust_test_pred_int[n-1]['ts_index'].to_numpy(),
                          actual_rows['ts_index'].to_numpy()), \
        f"PI rows and test rows are not aligned for cluster {n}"
    
    # Add these as a column to the corresponding df of test pred PIs
    highway_clust_test_pred_int[n-1]['actual'] = y_actual_sub

In [96]:
# Compute the 95% and 80% interval scores as new columns of every cluster's PI data frame
for pi_df in highway_clust_test_pred_int:
    pi_df['int_95_score'] = interval_score(pi_df['actual'], pi_df['lo_95'], pi_df['hi_95'], 0.95)
    pi_df['int_80_score'] = interval_score(pi_df['actual'], pi_df['lo_80'], pi_df['hi_80'], 0.80)

In [97]:
# Create one data frame from all test pred PI data frames
highway_clust_test_pred_int_df = pd.concat(highway_clust_test_pred_int)

In [98]:
# Print the mean interval scores
highway_clust_test_pred_int_df.mean()

ts_index         38.500000
test_preds      265.698815
lo_95           205.355121
hi_95           328.253474
lo_80           236.322297
hi_80           296.228465
actual          265.435072
int_95_score    221.270179
int_80_score    122.291010
dtype: float64

In [99]:
# Per time series: mean 95% score, mean 80% score and mean true value
highway_clust_test_pred_int_df_grouped = (
    highway_clust_test_pred_int_df
    .groupby("ts_index")
    .agg({'int_95_score':'mean', 'int_80_score':'mean', 'actual':'mean'})
    .reset_index()
)

# Scale each series' mean interval score by that series' mean true value
highway_clust_test_pred_int_df_grouped['int_95_score_scaled'] = (
    highway_clust_test_pred_int_df_grouped['int_95_score']
    .div(highway_clust_test_pred_int_df_grouped['actual'])
)
highway_clust_test_pred_int_df_grouped['int_80_score_scaled'] = (
    highway_clust_test_pred_int_df_grouped['int_80_score']
    .div(highway_clust_test_pred_int_df_grouped['actual'])
)

In [100]:
# Print the scaled interval scores
highway_clust_test_pred_int_df_grouped[['int_80_score_scaled', 'int_95_score_scaled']].mean()

int_80_score_scaled    0.589854
int_95_score_scaled    1.058165
dtype: float64

In [101]:
# Save the PI data frame to a file
highway_clust_test_pred_int_df.to_csv("Results/Global/LightGBM Default/Highway System/test_pred_intervals.csv", 
                                      index=False)

# Train and Test - Catch22 KMeans Clusters

In [102]:
# Drop the names from the highway-system section that are no longer needed, to save on RAM
del (
    highway_clust_test_pred_int_df,
    highway_clust_test_pred_int,
    highway_res_df,
    y_actual_sub,
    highway_clust_test_perf,
    highway_clust_test_preds,
    highway_clust_residuals,
    highway_clust_mods,
    train_val_df_full_highway_clust,
    test_df_full_highway_clust,
    highway_system_clust,
)

In [103]:
# Force a garbage collection pass now that the variables above have been deleted
gc.collect()

3

In [104]:
# Read in cluster assignments from Catch22-based clusters
catch22_clust = pd.read_csv("Results/Clustering/KMeans/kmeans_catch22_clustering_assign.csv")
# Copy the assignment field into a new "cluster" column to match expectations from above functions
# (this adds a column; the original "kmeans_catch22_clust_assign" column is kept as well)
catch22_clust['cluster'] = catch22_clust['kmeans_catch22_clust_assign']

In [105]:
# Join the Catch22 cluster assignments onto the training and test data via ts_index
train_val_df_full_catch22_clust = pd.merge(train_val_df_full, catch22_clust, on="ts_index")
test_df_full_catch22_clust = pd.merge(test_df_full, catch22_clust, on="ts_index")

In [106]:
# Train one light gbm model per Catch22 cluster in parallel - save to list of models
# (list position i-1 holds the model for cluster i)
catch22_cluster_ids = range(1, len(catch22_clust.cluster.unique()) + 1)
with tqdm_joblib(tqdm(desc="Catch22 LGBM Models",
                      total=len(catch22_cluster_ids))) as progress_bar:
    catch22_clust_mods = Parallel(n_jobs=3)(
        delayed(train_lgbm_clust)(train_val_df_full_catch22_clust, cluster_id, lag_n)
        for cluster_id in catch22_cluster_ids
    )

Catch22 LGBM Models: 100%|████████████████████████| 5/5 [00:36<00:00,  7.32s/it]


In [107]:
# Write each fitted Catch22 cluster model to its own file with joblib.dump for future use
for list_pos, clust_no in enumerate(range(1, len(catch22_clust.cluster.unique()) + 1)):
    filename = f'Results/Global/LightGBM Default/Catch22 KMeans/model_{clust_no}'
    joblib.dump(catch22_clust_mods[list_pos], filename)

In [108]:
# Read the saved models back with joblib.load; the list is ordered by cluster number
catch22_clust_mods = [
    joblib.load(f'Results/Global/LightGBM Default/Catch22 KMeans/model_{clust_no}')
    for clust_no in range(1, len(catch22_clust.cluster.unique()) + 1)
]

In [109]:
# Compute the training residuals of every Catch22 cluster model in parallel.
# catch22_clust_residuals holds one list of residuals per model.
catch22_cluster_ids = range(1, len(catch22_clust.cluster.unique()) + 1)
with tqdm_joblib(tqdm(desc="Catch22 LGBM Residuals",
                      total=len(catch22_cluster_ids))) as progress_bar:
    catch22_clust_residuals = Parallel(n_jobs=3)(
        delayed(compute_lgbm_resid_clust)(catch22_clust_mods[cluster_id - 1],
                                          cluster_id,
                                          train_val_df_full_catch22_clust,
                                          lag_n)
        for cluster_id in catch22_cluster_ids
    )

Catch22 LGBM Residuals: 100%|█████████████████████| 5/5 [00:09<00:00,  1.97s/it]


In [110]:
# Build the data frame of residuals: each row is a cluster number (1-based) and the
# list of model residuals for that cluster's model
catch22_res_df = pd.DataFrame({
    'cluster': list(range(1, len(catch22_clust_residuals) + 1)),
    'residual': list(catch22_clust_residuals),
})

# Examine the residual data frame
catch22_res_df.head()

Unnamed: 0,cluster,residual
0,1,"[0.9643008794417725, -1.2605923422304315, -7.1..."
1,2,"[3.664500161840323, -19.625317840160847, -7.62..."
2,3,"[-0.10184140214194581, -1.1317853952585195, -0..."
3,4,"[-6.5148478239201495, 23.22633531123006, -38.5..."
4,5,"[19.04927975710811, 19.1956187718352, 9.408641..."


In [111]:
# Save the residual data frame to a file
catch22_res_df.to_csv("Results/Global/LightGBM Default/Catch22 KMeans/residual.csv", index=False)

In [112]:
# Compute the test set predictions of every Catch22 cluster model in parallel; the
# result is a list with one df of preds per cluster
catch22_cluster_ids = range(1, len(catch22_clust.cluster.unique()) + 1)
with tqdm_joblib(tqdm(desc="Catch22 LGBM Test Preds",
                      total=len(catch22_cluster_ids))) as progress_bar:
    catch22_clust_test_preds = Parallel(n_jobs=3)(
        delayed(compute_lgbm_test_preds_clust)(catch22_clust_mods[cluster_id - 1],
                                               cluster_id,
                                               test_df_full_catch22_clust,
                                               lag_n)
        for cluster_id in catch22_cluster_ids
    )

Catch22 LGBM Test Preds: 100%|████████████████████| 5/5 [00:03<00:00,  1.46it/s]


In [113]:
# Create a data frame of test preds by concatenating all data frames in the above list into one
catch22_clust_test_preds_df = pd.concat(catch22_clust_test_preds)

# Use this data frame of all test preds to compute test pred performance
catch22_clust_test_perf = compute_lgbm_test_perf(catch22_clust_test_preds_df,
                                                 test_df_full_catch22_clust)

In [114]:
# Add normalized rmse and scaled mae columns: RMSE and MAE divided by the 'mean' column
catch22_clust_test_perf['nrmse'] = catch22_clust_test_perf['rmse'].div(catch22_clust_test_perf['mean'])
catch22_clust_test_perf['smae'] = catch22_clust_test_perf['mae'].div(catch22_clust_test_perf['mean'])

In [115]:
# Print the means of the performance metrics
catch22_clust_test_perf.mean()

rmse      31.290422
mae       20.745887
mean     265.435072
nrmse      0.140943
smae       0.095138
dtype: float64

In [116]:
# Create bootstrap prediction intervals for each cluster in parallel, pairing position p of
# the test preds list with position p of the residuals list.
# Each result is a df of intervals, so the output here is a list of those data frames.
catch22_cluster_positions = range(len(catch22_clust.cluster.unique()))
with tqdm_joblib(tqdm(desc="Catch22 LGBM PI",
                      total=len(catch22_cluster_positions))) as progress_bar:
    catch22_clust_test_pred_int = Parallel(n_jobs=3)(
        delayed(compute_lgbm_boostrap_int)(catch22_clust_test_preds[pos],
                                           catch22_clust_residuals[pos],
                                           1000)
        for pos in catch22_cluster_positions
    )

Catch22 LGBM PI: 100%|███████████████████████████| 5/5 [11:15<00:00, 135.19s/it]


In [117]:
# Looping through each individual cluster
for n in range(1, len(catch22_clust_test_pred_int)+1):
    # Extract the test rows and the true values of the target variable (first column) for that cluster
    actual_rows = test_df_full_catch22_clust.query("cluster==@n")
    y_actual_sub = actual_rows.iloc[:,0].to_list()
    # The true values are attached by position, so make sure the PI rows and the
    # test rows list the time series in exactly the same order
    assert np.array_equal(catch22_clust_test_pred_int[n-1]['ts_index'].to_numpy(),
                          actual_rows['ts_index'].to_numpy()), \
        f"PI rows and test rows are not aligned for cluster {n}"
    # Add those to the data frame of prediction intervals for that cluster
    catch22_clust_test_pred_int[n-1]['actual'] = y_actual_sub

In [118]:
# Compute the 95% and 80% PI scores as new columns of every cluster's PI data frame
for pi_df in catch22_clust_test_pred_int:
    pi_df['int_95_score'] = interval_score(pi_df['actual'], pi_df['lo_95'], pi_df['hi_95'], 0.95)
    pi_df['int_80_score'] = interval_score(pi_df['actual'], pi_df['lo_80'], pi_df['hi_80'], 0.80)

In [119]:
# Append all PI data frames into one data frame
catch22_clust_test_pred_int_df = pd.concat(catch22_clust_test_pred_int)

In [120]:
# Print the means of the PI scores
catch22_clust_test_pred_int_df.mean()

ts_index         38.500000
test_preds      265.589308
lo_95           207.437175
hi_95           325.279309
lo_80           236.197707
hi_80           295.728206
actual          265.435072
int_95_score    217.156534
int_80_score    120.536336
dtype: float64

In [121]:
# Per time series: mean 95% score, mean 80% score and mean true value
catch22_clust_test_pred_int_df_grouped = (
    catch22_clust_test_pred_int_df
    .groupby("ts_index")
    .agg({'int_95_score': 'mean', 'int_80_score':'mean', 'actual':'mean'})
    .reset_index()
)

# Scale each series' mean interval score by that series' mean true value
catch22_clust_test_pred_int_df_grouped['int_95_score_scaled'] = (
    catch22_clust_test_pred_int_df_grouped['int_95_score']
    .div(catch22_clust_test_pred_int_df_grouped['actual'])
)
catch22_clust_test_pred_int_df_grouped['int_80_score_scaled'] = (
    catch22_clust_test_pred_int_df_grouped['int_80_score']
    .div(catch22_clust_test_pred_int_df_grouped['actual'])
)

In [122]:
# Print the average scaled interval score
catch22_clust_test_pred_int_df_grouped[['int_80_score_scaled', 'int_95_score_scaled']].mean()

int_80_score_scaled    0.548497
int_95_score_scaled    0.959561
dtype: float64

In [123]:
# Save the prediction interval df to a csv file
catch22_clust_test_pred_int_df.to_csv("Results/Global/LightGBM Default/Catch22 KMeans/test_pred_intervals.csv", 
                                      index=False)

# Train and Test - TSFeat KMeans Clusters

In [124]:
# Drop the names from the Catch22 section that are no longer needed
del (
    catch22_clust_test_pred_int_df,
    catch22_clust_test_pred_int,
    y_actual_sub,
    catch22_clust_test_perf,
    catch22_clust_test_preds,
    catch22_clust_residuals,
    catch22_clust_mods,
    train_val_df_full_catch22_clust,
    test_df_full_catch22_clust,
    catch22_clust,
    catch22_res_df,
)

In [125]:
# Garbage collect
gc.collect()

0

In [126]:
# Read in cluster assignments from tsfeat-based clusters and copy the cluster assignment field
# into a new 'cluster' column (the original 'kmeans_tsfeat_clust_assign' column is kept as well)
tsfeat_clust = pd.read_csv("Results/Clustering/KMeans/kmeans_tsfeat_clustering_assign.csv")
tsfeat_clust['cluster'] = tsfeat_clust['kmeans_tsfeat_clust_assign']

In [127]:
# Join the tsfeat cluster assignments onto the training and test data via ts_index
train_val_df_full_tsfeat_clust = pd.merge(train_val_df_full, tsfeat_clust, on="ts_index")
test_df_full_tsfeat_clust = pd.merge(test_df_full, tsfeat_clust, on="ts_index")

In [128]:
# Train one light gbm model per tsfeat cluster in parallel.
# Trained models are saved into the tsfeat_clust_mods list (position i-1 holds cluster i).
tsfeat_cluster_ids = range(1, len(tsfeat_clust.cluster.unique()) + 1)
with tqdm_joblib(tqdm(desc="tsfeat LGBM Models",
                      total=len(tsfeat_cluster_ids))) as progress_bar:
    tsfeat_clust_mods = Parallel(n_jobs=2)(
        delayed(train_lgbm_clust)(train_val_df_full_tsfeat_clust, cluster_id, lag_n)
        for cluster_id in tsfeat_cluster_ids
    )

tsfeat LGBM Models: 100%|█████████████████████████| 2/2 [00:32<00:00, 16.22s/it]


In [129]:
# Write each fitted tsfeat cluster model to its own file with joblib.dump
for list_pos, clust_no in enumerate(range(1, len(tsfeat_clust.cluster.unique()) + 1)):
    filename = f'Results/Global/LightGBM Default/TSFeat KMeans/model_{clust_no}'
    joblib.dump(tsfeat_clust_mods[list_pos], filename)

In [130]:
# Load the models from file with joblib.load; the list is ordered by cluster number
tsfeat_clust_mods = [
    joblib.load(f'Results/Global/LightGBM Default/TSFeat KMeans/model_{clust_no}')
    for clust_no in range(1, len(tsfeat_clust.cluster.unique()) + 1)
]

In [131]:
# Compute the training residuals of every tsfeat cluster model in parallel.
# tsfeat_clust_residuals holds one list of residuals per model.
tsfeat_cluster_ids = range(1, len(tsfeat_clust.cluster.unique()) + 1)
with tqdm_joblib(tqdm(desc="tsfeat LGBM Residuals",
                      total=len(tsfeat_cluster_ids))) as progress_bar:
    tsfeat_clust_residuals = Parallel(n_jobs=2)(
        delayed(compute_lgbm_resid_clust)(tsfeat_clust_mods[cluster_id - 1],
                                          cluster_id,
                                          train_val_df_full_tsfeat_clust,
                                          lag_n)
        for cluster_id in tsfeat_cluster_ids
    )

tsfeat LGBM Residuals: 100%|██████████████████████| 2/2 [00:12<00:00,  6.34s/it]


In [132]:
# Build the data frame of residuals: each row is a cluster number (1-based) and the
# list of residuals of that cluster's model
tsfeat_clust_res_df = pd.DataFrame({
    'cluster': list(range(1, len(tsfeat_clust_residuals) + 1)),
    'residual': list(tsfeat_clust_residuals),
})

# Inspect the data frame of residuals
tsfeat_clust_res_df.head()

Unnamed: 0,cluster,residual
0,1,"[-10.803316036628644, 14.275223393875137, -39...."
1,2,"[35.3488709348197, -13.231145805264703, 16.757..."


In [133]:
# Save the residual df to a file
tsfeat_clust_res_df.to_csv("Results/Global/LightGBM Default/TSFeat KMeans/residual.csv", index=False)

In [134]:
# Compute the set of test predictions for each tsfeat cluster in parallel.
# Each entry of the tsfeat_clust_test_preds list is the df of test preds for one cluster.
tsfeat_cluster_ids = range(1, len(tsfeat_clust.cluster.unique()) + 1)
with tqdm_joblib(tqdm(desc="tsfeat LGBM Test Preds",
                      total=len(tsfeat_cluster_ids))) as progress_bar:
    tsfeat_clust_test_preds = Parallel(n_jobs=2)(
        delayed(compute_lgbm_test_preds_clust)(tsfeat_clust_mods[cluster_id - 1],
                                               cluster_id,
                                               test_df_full_tsfeat_clust,
                                               lag_n)
        for cluster_id in tsfeat_cluster_ids
    )

tsfeat LGBM Test Preds: 100%|█████████████████████| 2/2 [00:05<00:00,  2.96s/it]


In [135]:
# Stack the per-cluster test prediction entries row-wise into a single data frame
tsfeat_clust_test_preds_df = pd.concat(tsfeat_clust_test_preds, axis=0)

# Score the combined predictions against the clustered test data
tsfeat_clust_test_perf = compute_lgbm_test_perf(
    tsfeat_clust_test_preds_df,
    test_df_full_tsfeat_clust,
)

In [136]:
# Divide RMSE and MAE by the 'mean' column to obtain the normalized (nrmse) and scaled (smae) metrics
tsfeat_clust_test_perf['nrmse'] = tsfeat_clust_test_perf['rmse'].div(tsfeat_clust_test_perf['mean'])
tsfeat_clust_test_perf['smae'] = tsfeat_clust_test_perf['mae'].div(tsfeat_clust_test_perf['mean'])

In [137]:
# Display the column-wise mean of every performance metric (raw and normalized/scaled),
# averaged over all rows of the tsfeat performance data frame
tsfeat_clust_test_perf.mean()

rmse      31.225672
mae       20.743977
mean     265.435072
nrmse      0.141735
smae       0.096075
dtype: float64

In [138]:
# Pair each cluster's test predictions with that cluster's residuals and compute bootstrap
# prediction intervals using two parallel workers. Each result becomes one entry of
# tsfeat_clust_test_pred_int.
# NOTE(review): 1000 is presumably the number of bootstrap replicates - confirm against
# compute_lgbm_boostrap_int
with tqdm_joblib(
    tqdm(desc="tsfeat LGBM PI", total=len(tsfeat_clust.cluster.unique()))
) as progress_bar:
    tsfeat_clust_test_pred_int = Parallel(n_jobs=2)(
        delayed(compute_lgbm_boostrap_int)(
            tsfeat_clust_test_preds[pos],
            tsfeat_clust_residuals[pos],
            1000,
        )
        for pos in range(len(tsfeat_clust.cluster.unique()))
    )

tsfeat LGBM PI: 100%|████████████████████████████| 2/2 [21:41<00:00, 650.85s/it]


In [139]:
# Attach an 'actual' column to every cluster's prediction-interval data frame. The values are
# taken from the first column of the test rows assigned to that cluster (treated here as the
# true target values).
# NOTE(review): relies on the filtered test rows being in the same order as the rows of the
# prediction-interval data frame - confirm
for n in range(1, len(tsfeat_clust_test_pred_int) + 1):
    y_actual_sub = (
        test_df_full_tsfeat_clust
        .query("cluster==@n")
        .copy()
        .iloc[:, 0]
        .to_list()
    )
    tsfeat_clust_test_pred_int[n - 1]['actual'] = y_actual_sub

In [140]:
# Score the 95% and 80% prediction intervals of every cluster. interval_score receives the
# actual values, the lower bound, the upper bound and the interval level, in that order
for m in range(len(tsfeat_clust_test_pred_int)):
    tsfeat_clust_test_pred_int[m]['int_95_score'] = interval_score(
        *(tsfeat_clust_test_pred_int[m][col] for col in ('actual', 'lo_95', 'hi_95')),
        0.95,
    )

    tsfeat_clust_test_pred_int[m]['int_80_score'] = interval_score(
        *(tsfeat_clust_test_pred_int[m][col] for col in ('actual', 'lo_80', 'hi_80')),
        0.80,
    )

In [141]:
# Stack the per-cluster prediction-interval data frames row-wise into one data frame
tsfeat_clust_test_pred_int_df = pd.concat(tsfeat_clust_test_pred_int, axis=0)

In [142]:
# Display the column-wise mean of the combined prediction-interval data frame
# (predictions, interval bounds, actual values and both interval scores)
tsfeat_clust_test_pred_int_df.mean()

ts_index         38.500000
test_preds      265.658464
lo_95           204.233591
hi_95           329.589478
lo_80           235.936280
hi_80           296.449757
actual          265.435072
int_95_score    224.571612
int_80_score    122.694939
dtype: float64

In [143]:
# Average the interval scores and the actual values per time series (ts_index), then divide each
# averaged interval score by the averaged actual value to get the scaled interval scores
tsfeat_clust_test_pred_int_df_grouped = (
    tsfeat_clust_test_pred_int_df
    .groupby("ts_index")
    .agg({"int_95_score": "mean", "int_80_score": "mean", "actual": "mean"})
    .reset_index()
    .assign(
        int_95_score_scaled=lambda per_ts: per_ts['int_95_score'] / per_ts['actual'],
        int_80_score_scaled=lambda per_ts: per_ts['int_80_score'] / per_ts['actual'],
    )
)

In [144]:
# Display the mean of the 80% and 95% scaled interval scores over the per-time-series rows
tsfeat_clust_test_pred_int_df_grouped[['int_80_score_scaled', 'int_95_score_scaled']].mean()

int_80_score_scaled    0.581382
int_95_score_scaled    1.051411
dtype: float64

In [145]:
# Persist the combined prediction-interval data frame as a csv file (row index omitted)
tsfeat_clust_test_pred_int_df.to_csv(
    "Results/Global/LightGBM Default/TSFeat KMeans/test_pred_intervals.csv",
    index=False,
)

# Train and Test - DTW Clusters

In [146]:
# Drop the tsfeat-cluster objects that are no longer referenced below so their memory can be reclaimed
del (
    tsfeat_clust_test_pred_int_df,
    tsfeat_clust_test_pred_int,
    y_actual_sub,
    tsfeat_clust_test_perf,
    tsfeat_clust_test_preds,
    tsfeat_clust_residuals,
    tsfeat_clust_mods,
    train_val_df_full_tsfeat_clust,
    test_df_full_tsfeat_clust,
    tsfeat_clust,
)

In [147]:
# Run a garbage collection pass now that the tsfeat-cluster variables have been deleted;
# the displayed value is whatever gc.collect() returns
gc.collect()

0

In [148]:
# Load the DTW cluster assignments and expose them under a generic 'cluster' column
# (a copy of 'dtw_clust_assign'), which is the column name the cells below work with
dtw_clust = (
    pd.read_csv("Results/Clustering/DTW/dtw_clustering_assign.csv")
    .assign(cluster=lambda assignments: assignments['dtw_clust_assign'])
)

In [149]:
# Join the DTW cluster assignments onto the train/validation data and onto the test data,
# matching rows on the time series index
train_val_df_full_dtw_clust = pd.merge(train_val_df_full, dtw_clust, on="ts_index")
test_df_full_dtw_clust = pd.merge(test_df_full, dtw_clust, on="ts_index")

In [150]:
# Train one LightGBM model per DTW cluster using two parallel workers. Clusters are numbered
# from 1, and the model for cluster k ends up at position k-1 of dtw_clust_mods
with tqdm_joblib(
    tqdm(desc="dtw LGBM Models", total=len(dtw_clust["cluster"].unique()))
) as progress_bar:
    dtw_clust_mods = Parallel(n_jobs=2)(
        delayed(train_lgbm_clust)(
            train_val_df_full_dtw_clust,
            clust_id,
            lag_n,
        )
        for clust_id in range(1, len(dtw_clust["cluster"].unique()) + 1)
    )

dtw LGBM Models: 100%|████████████████████████████| 2/2 [00:34<00:00, 17.46s/it]


In [151]:
# Persist every DTW cluster model to its own file, named after the 1-based cluster number.
# NOTE(review): this needs the joblib module itself in scope - confirm it is imported earlier
# in the notebook
for clust_no in range(1, len(dtw_clust["cluster"].unique()) + 1):
    filename = f'Results/Global/LightGBM Default/DTW/model_{clust_no}'
    joblib.dump(
        dtw_clust_mods[clust_no - 1],
        filename,
    )

In [152]:
# Rebuild the model list from the files on disk: start from an empty list and load the model
# files in cluster order, so that cluster k is again at position k-1
dtw_clust_mods = []

for clust_no in range(1, len(dtw_clust["cluster"].unique()) + 1):
    filename = f'Results/Global/LightGBM Default/DTW/model_{clust_no}'
    loaded_model = joblib.load(filename)
    dtw_clust_mods.append(loaded_model)
    del loaded_model

In [153]:
# Compute the model residuals for every DTW cluster using two parallel workers.
# Each call gets the cluster's model (0-based list, 1-based cluster number) and its result
# becomes one entry of the dtw_clust_residuals list
with tqdm_joblib(
    tqdm(desc="dtw LGBM Residuals", total=len(dtw_clust["cluster"].unique()))
) as progress_bar:
    dtw_clust_residuals = Parallel(n_jobs=2)(
        delayed(compute_lgbm_resid_clust)(
            dtw_clust_mods[clust_id - 1],
            clust_id,
            train_val_df_full_dtw_clust,
            lag_n,
        )
        for clust_id in range(1, len(dtw_clust["cluster"].unique()) + 1)
    )

dtw LGBM Residuals: 100%|█████████████████████████| 2/2 [00:12<00:00,  6.02s/it]


In [154]:
# Build a two-column data frame with one row per cluster: the 1-based cluster number and the
# residuals returned for that cluster's model (entry k of the residual list belongs to cluster k+1).
# The cluster numbers and residuals are taken directly from the list instead of round-tripping
# through a temporary dict that was previously built twice.
dtw_clust_res_df = pd.DataFrame({
    'cluster': list(range(1, len(dtw_clust_residuals) + 1)),
    'residual': list(dtw_clust_residuals),
})

# Inspect the data frame of residuals
dtw_clust_res_df.head()

Unnamed: 0,cluster,residual
0,1,"[6.975373456564654, 13.106412046579209, -27.45..."
1,2,"[-3.0769047407929975, 22.192841270775546, -35...."


In [155]:
# Persist the per-cluster residual data frame as a csv file (row index omitted)
dtw_clust_res_df.to_csv(
    "Results/Global/LightGBM Default/DTW/residual.csv",
    index=False,
)

In [156]:
# Produce the test-set predictions for every DTW cluster using two parallel workers.
# Each call gets the cluster's model (0-based list, 1-based cluster number) and its result
# becomes one entry of the dtw_clust_test_preds list
with tqdm_joblib(
    tqdm(desc="dtw LGBM Test Preds", total=len(dtw_clust["cluster"].unique()))
) as progress_bar:
    dtw_clust_test_preds = Parallel(n_jobs=2)(
        delayed(compute_lgbm_test_preds_clust)(
            dtw_clust_mods[clust_id - 1],
            clust_id,
            test_df_full_dtw_clust,
            lag_n,
        )
        for clust_id in range(1, len(dtw_clust["cluster"].unique()) + 1)
    )

dtw LGBM Test Preds: 100%|████████████████████████| 2/2 [00:05<00:00,  2.54s/it]


In [157]:
# Stack the per-cluster test prediction entries row-wise into a single data frame
dtw_clust_test_preds_df = pd.concat(dtw_clust_test_preds, axis=0)

# Score the combined predictions against the clustered test data
dtw_clust_test_perf = compute_lgbm_test_perf(
    dtw_clust_test_preds_df,
    test_df_full_dtw_clust,
)

In [158]:
# Divide RMSE and MAE by the 'mean' column to obtain the normalized (nrmse) and scaled (smae) metrics
dtw_clust_test_perf['nrmse'] = dtw_clust_test_perf['rmse'].div(dtw_clust_test_perf['mean'])
dtw_clust_test_perf['smae'] = dtw_clust_test_perf['mae'].div(dtw_clust_test_perf['mean'])

In [159]:
# Display the column-wise mean of every performance metric (raw and normalized/scaled),
# averaged over all rows of the DTW performance data frame
dtw_clust_test_perf.mean()

rmse      30.910959
mae       20.525132
mean     265.435072
nrmse      0.139256
smae       0.094090
dtype: float64

In [160]:
# Pair each cluster's test predictions with that cluster's residuals and compute bootstrap
# prediction intervals using two parallel workers. Each result becomes one entry of
# dtw_clust_test_pred_int.
# NOTE(review): 1000 is presumably the number of bootstrap replicates - confirm against
# compute_lgbm_boostrap_int
with tqdm_joblib(
    tqdm(desc="dtw LGBM PI", total=len(dtw_clust["cluster"].unique()))
) as progress_bar:
    dtw_clust_test_pred_int = Parallel(n_jobs=2)(
        delayed(compute_lgbm_boostrap_int)(
            dtw_clust_test_preds[pos],
            dtw_clust_residuals[pos],
            1000,
        )
        for pos in range(len(dtw_clust["cluster"].unique()))
    )

dtw LGBM PI: 100%|███████████████████████████████| 2/2 [20:50<00:00, 625.24s/it]


In [161]:
# Attach an 'actual' column to every cluster's prediction-interval data frame. The values are
# taken from the first column of the test rows assigned to that cluster (treated here as the
# true target values).
# NOTE(review): relies on the filtered test rows being in the same order as the rows of the
# prediction-interval data frame - confirm
for n in range(1, len(dtw_clust_test_pred_int) + 1):
    y_actual_sub = (
        test_df_full_dtw_clust
        .query("cluster==@n")
        .copy()
        .iloc[:, 0]
        .to_list()
    )
    dtw_clust_test_pred_int[n - 1]['actual'] = y_actual_sub

In [162]:
# Score the 95% and 80% prediction intervals of every cluster. interval_score receives the
# actual values, the lower bound, the upper bound and the interval level, in that order
for m in range(len(dtw_clust_test_pred_int)):
    dtw_clust_test_pred_int[m]['int_95_score'] = interval_score(
        *(dtw_clust_test_pred_int[m][col] for col in ('actual', 'lo_95', 'hi_95')),
        0.95,
    )

    dtw_clust_test_pred_int[m]['int_80_score'] = interval_score(
        *(dtw_clust_test_pred_int[m][col] for col in ('actual', 'lo_80', 'hi_80')),
        0.80,
    )

In [163]:
# Stack the per-cluster prediction-interval data frames row-wise into one data frame
# (a single concat over the whole list, rather than appending frame by frame)
dtw_clust_test_pred_int_df = pd.concat(dtw_clust_test_pred_int, axis=0)

In [164]:
# Display the column-wise mean of the combined prediction-interval data frame
# (predictions, interval bounds, actual values and both interval scores)
dtw_clust_test_pred_int_df.mean()

ts_index         38.500000
test_preds      265.789873
lo_95           207.626934
hi_95           326.228252
lo_80           235.786707
hi_80           296.796925
actual          265.435072
int_95_score    200.024829
int_80_score    115.749048
dtype: float64

In [165]:
# Average the interval scores and the actual values per time series (ts_index), then divide each
# averaged interval score by the averaged actual value to get the scaled interval scores
dtw_clust_test_pred_int_df_grouped = (
    dtw_clust_test_pred_int_df
    .groupby("ts_index")
    .agg({'int_95_score': 'mean', 'int_80_score': 'mean', 'actual': 'mean'})
    .reset_index()
    .assign(
        int_95_score_scaled=lambda per_ts: per_ts['int_95_score'] / per_ts['actual'],
        int_80_score_scaled=lambda per_ts: per_ts['int_80_score'] / per_ts['actual'],
    )
)

In [166]:
# Display the mean of the 80% and 95% scaled interval scores over the per-time-series rows
dtw_clust_test_pred_int_df_grouped[['int_80_score_scaled', 'int_95_score_scaled']].mean()

int_80_score_scaled    0.554662
int_95_score_scaled    0.953065
dtype: float64

In [167]:
# Persist the combined prediction-interval data frame as a csv file (row index omitted)
dtw_clust_test_pred_int_df.to_csv(
    "Results/Global/LightGBM Default/DTW/test_pred_intervals.csv",
    index=False,
)