In [1]:
import os
import requests

def detect_environment():
    """Return the name of the runtime the notebook is executing in.

    Returns:
        'kaggle' when the KAGGLE_KERNEL_RUN_TYPE environment variable is set,
        'aws' when the link-local metadata endpoint answers with HTTP 200
        within two seconds, otherwise 'local'. The detected environment is
        also printed.
    """
    # Kaggle kernels are recognised by an environment variable.
    if os.environ.get('KAGGLE_KERNEL_RUN_TYPE') is not None:
        print("Environment: Kaggle")
        return 'kaggle'

    # AWS: probe the metadata endpoint; any request failure means "not AWS".
    metadata_ok = False
    try:
        metadata_reply = requests.get('http://169.254.169.254/latest/meta-data/', timeout=2)
    except requests.exceptions.RequestException:
        pass
    else:
        metadata_ok = metadata_reply.status_code == 200

    if metadata_ok:
        print("Environment: AWS")
        return 'aws'

    # Nothing matched: assume a local machine.
    print("Environment: Local")
    return 'local'

In [2]:
# Settings to adjust before each run.
# CODE_ENV is detected automatically ('kaggle', 'aws' or 'local') and selects the directory layout.
CODE_ENV = detect_environment()
# STATUS is compared against 'training' and 'production' by the split logic in later cells.
STATUS = 'training'

Environment: Local


In [3]:
# keras_self_attention is installed on the fly only on Kaggle; the import cell imports it
# unconditionally, so the other environments must already have it installed.
# NOTE(review): the install is not version-pinned.
if CODE_ENV == 'kaggle':
    !pip install keras_self_attention

In [4]:
#Import data handling libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import shutil
import glob
import math
from pathlib import Path
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Input, LSTM, Dense, Embedding, Dropout, Reshape, 
                                     concatenate, Flatten, Bidirectional, GlobalAveragePooling1D)
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.losses import Huber, MeanAbsoluteError
from tensorflow.keras.regularizers import l2
from tensorflow.keras.initializers import GlorotNormal
from tensorflow.keras.callbacks import Callback, EarlyStopping
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from keras_self_attention import SeqSelfAttention
from keras_tuner import HyperModel
from keras_tuner.tuners import RandomSearch, BayesianOptimization

In [5]:
# Check if a GPU is available: print the number of physical GPU devices TensorFlow can see
# and whether this TensorFlow build was compiled with CUDA support.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print(tf.test.is_built_with_cuda())

Num GPUs Available:  0
False


In [6]:
# Specify directories for the detected environment.
# Exactly one branch runs; an unexpected CODE_ENV fails here with a clear message
# instead of leaving every path undefined and failing later with a NameError.
if CODE_ENV == 'local':
    # NOTE(review): machine-specific absolute path; adjust when running on another machine.
    parent_dir = '/Users/mf/Desktop/CS/Studies/7_Final_Project/Kaggle_M5PointPrediction'

    res_dir = parent_dir + '/res/'
    src_dir = parent_dir + '/src/'
    prc_dir = src_dir + 'processed_data/'  # Processed data directory with pickled dataframes
    sub_dir = src_dir + 'submissions/'     # Directory to save submission files
    # Note: unlike the other environments, local logging lives under 'models/'.
    LOGGING_DIR = src_dir + 'models/hyperparameter_tuning'
elif CODE_ENV == 'kaggle':
    res_dir = '/kaggle/input/m5-forecasting-accuracy/'
    prc_dir = '/kaggle/input/processed-data/'
    src_dir = '/kaggle/working/'
    sub_dir = src_dir + 'submissions/'
    LOGGING_DIR = src_dir + 'hyperparameter_tuning'
elif CODE_ENV == 'aws':
    parent_dir = '/home/ubuntu/projects/Kaggle_M5PointPrediction'
    res_dir = parent_dir + '/res/'
    src_dir = parent_dir + '/src/'
    prc_dir = src_dir + 'processed_data/'  # Processed data directory with pickled dataframes
    sub_dir = src_dir + 'submissions/'     # Directory to save submission files
    LOGGING_DIR = src_dir + 'hyperparameter_tuning'
else:
    raise ValueError(f"Unsupported CODE_ENV {CODE_ENV!r}; expected 'local', 'kaggle' or 'aws'")

# The results file lives inside the logging directory in every environment.
CSV_PATH = LOGGING_DIR + '/hyperparameter_search_results.csv'

In [7]:
# Paths of the pickled input frames and global constants shared by the cells below.
VALIDATION_DATA  = prc_dir +'df_1.pkl' # Validation data (not referenced by the cells shown in this notebook)
BASE      = prc_dir +'df_2.pkl' # Base data, read by get_whole_data()
CALENDAR  = prc_dir +'df_3.pkl' # Calendar data, concatenated column-wise to the base data
# NUM_ITEMS = 30490 # Number of items per each day

DAYS_PER_SEQUENCE = 28  # Length (in days) of each input sequence
MAX_BATCH_SIZE = 900 # Maximum number of ids to be used in each batch to avoid memory issues and curse of dimensionality (not referenced by the cells shown here)


TARGET_COL = 'sales_amount' # Column used as the prediction target in create_x_y()
# REPEATED_FEATURES = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'sales_amount', 'sell_price', 'is_available'] # List to hold all feature columns that are used for each item
REPEATED_FEATURES = ['sales_amount', 'sell_price', 'is_available',
                     'sales_amount_moving_avg_7', 'sales_amount_moving_avg_28', 'sales_amount_lag_1',
                     'zero_sales_available', 'consecutive_zero_sales'] # Feature columns that are repeated once per item within each day
SALES_AMOUNT_COLS = ['sales_amount', 'sales_amount_moving_avg_7', 'sales_amount_moving_avg_28', 'sales_amount_lag_1'] # Columns that prepare_df() transforms with log1p
# ONCE_ONLY_FEATURES = ['d', 'wday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'mday', 'week', 'month', 'year', 'snap_CA', 'snap_TX', 'snap_WI'] # List to hold feature columns that are not repeated for each item
ONCE_ONLY_FEATURES = ['snap_CA', 'snap_TX', 'snap_WI', 'mday_normalized', 'day_continuous_normalized',
                      'month_sin', 'month_cos', 'wday_sin', 'wday_cos', 'week_sin', 'week_cos',
                      'year_normalized'] # Day-level feature columns that are taken once per day, not once per item
EVENT_COLS = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2'] # Event columns; label-encoded in prepare_df() and fed to embedding inputs
EVENT_LEN = len(EVENT_COLS)
NOT_NEEDED_COLS = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'] # Columns dropped by filter_df() after filtering

In [8]:
# Last day index of the test window.
# NOTE(review): both branches currently use 1969; the trailing "#1941" suggests the training
# value was changed at some point - confirm which one is intended. Any other STATUS value
# leaves TEST_END undefined.
if STATUS=='production':
    TEST_END = 1969
elif STATUS=='training':
    TEST_END = 1969 #1941

# Splitting the data in train, validation and test set; days are now 0 based, so have to shift by 1
# Define duration in days of each set
VAL_DUR   = 28
TEST_DUR  = 28

# Define the last day of each set, working backwards from TEST_END
VAL_END   = TEST_END - TEST_DUR
TRAIN_END = VAL_END - VAL_DUR # 1913 with TEST_END = 1969 (1885 with TEST_END = 1941) -> training stops before the validation window

# Finally define duration in days for the train set
TRAIN_DUR = TRAIN_END - DAYS_PER_SEQUENCE  # Training days minus one sequence length; not referenced by the cells shown here

In [9]:
# Load the pickled base and calendar frames and join them column-wise
def get_whole_data():
    """Read the BASE and CALENDAR pickles and concatenate them side by side.

    Returns:
        A single DataFrame holding the columns of both pickles (axis=1 concat,
        aligned on the index).
    """
    base_frame = pd.read_pickle(BASE)
    calendar_frame = pd.read_pickle(CALENDAR)
    return pd.concat([base_frame, calendar_frame], axis=1)

In [10]:
# pd.set_option('display.max_rows', None)
# df_all_data = get_whole_data()
# df_all_data.head()

Unnamed: 0,item_id,dept_id,cat_id,store_id,state_id,sales_amount,sell_price,is_available,sales_amount_lag_1,sales_amount_moving_avg_7,...,snap_WI,month_sin,month_cos,wday_sin,wday_cos,week_sin,week_cos,mday_normalized,day_continuous_normalized,year_normalized
0,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0.0,0.0,0,0.0,0.0,...,0,0.5,0.866211,0.781738,0.623535,0.456629,0.889657,0.933333,0.000508,0.0
1,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0.0,0.0,0,0.0,0.0,...,0,0.5,0.866211,0.781738,0.623535,0.456629,0.889657,0.933333,0.000508,0.0
2,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0.0,0.0,0,0.0,0.0,...,0,0.5,0.866211,0.781738,0.623535,0.456629,0.889657,0.933333,0.000508,0.0
3,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0.0,0.0,0,0.0,0.0,...,0,0.5,0.866211,0.781738,0.623535,0.456629,0.889657,0.933333,0.000508,0.0
4,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0.0,0.0,0,0.0,0.0,...,0,0.5,0.866211,0.781738,0.623535,0.456629,0.889657,0.933333,0.000508,0.0


In [11]:
# get all ['store_id','dept_id'] combinations from df_all_data and count the occurences
# df_combinations = df_all_data[(df_all_data['d']==1)].groupby(['store_id', 'dept_id']).size().reset_index(name='count')  


In [12]:
# pd.set_option('display.max_rows', None)
# df_combinations

Unnamed: 0,store_id,dept_id,count
0,CA_1,FOODS_1,216
1,CA_1,FOODS_2,398
2,CA_1,FOODS_3,823
3,CA_1,HOBBIES_1,416
4,CA_1,HOBBIES_2,149
5,CA_1,HOUSEHOLD_1,532
6,CA_1,HOUSEHOLD_2,515
7,CA_2,FOODS_1,216
8,CA_2,FOODS_2,398
9,CA_2,FOODS_3,823


In [13]:
#pd.set_option('display.max_rows', None)
# df_combinations[df_combinations['store_id'] == 'TX_1'].sort_values('count').tail(50)

In [14]:
# Return a df with all unique combinations of store_id and dept_id
def get_combinations(df_all_data):
    """Collect the distinct (store_id, dept_id) pairs found in the data.

    Args:
        df_all_data: DataFrame with 'store_id' and 'dept_id' columns.

    Returns:
        Tuple of (DataFrame with one row per distinct pair, in order of first
        appearance and indexed 0..n-1; number of pairs).
    """
    pair_columns = ['store_id', 'dept_id']
    unique_pairs = df_all_data.loc[:, pair_columns].drop_duplicates(ignore_index=True)
    return unique_pairs, unique_pairs.shape[0]

In [15]:
# Filter df down to only the current store_id and dept_id combination
def filter_df(df_combinations_store_dep, df_all_data, i):
    """Select the rows of one store/department block and derive its model dimensions.

    Args:
        df_combinations_store_dep: DataFrame of (store_id, dept_id) pairs, as
            returned by get_combinations().
        df_all_data: Full data frame; must contain 'store_id', 'dept_id', 'id'
            and the columns listed in NOT_NEEDED_COLS.
        i: Index label of the pair to select in df_combinations_store_dep.

    Returns:
        Tuple (filtered_df, num_block_items, num_features, input_shape) where
        filtered_df has a fresh 0..n-1 index and no NOT_NEEDED_COLS,
        num_block_items is the number of distinct ids in the block,
        num_features is the width of one time step, and input_shape is
        (DAYS_PER_SEQUENCE, num_features).
    """
    store_id = df_combinations_store_dep.loc[i, 'store_id']
    dept_id = df_combinations_store_dep.loc[i, 'dept_id']

    # Build the row mask once; it is evaluated over the whole (large) frame.
    in_block = (df_all_data['store_id'] == store_id) & (df_all_data['dept_id'] == dept_id)
    filtered_df = df_all_data[in_block].reset_index(drop=True)

    # Number of distinct items in this block
    num_block_items = len(filtered_df['id'].drop_duplicates())

    # Remove all unused columns
    filtered_df = filtered_df.drop(columns=NOT_NEEDED_COLS)

    # Day-level features appear once, item-level features once per item
    num_features = len(ONCE_ONLY_FEATURES) + len(REPEATED_FEATURES) * num_block_items

    # Input shape used later on for the model
    input_shape = (DAYS_PER_SEQUENCE, num_features)

    return filtered_df, num_block_items, num_features, input_shape

In [16]:
def calc_vocab_size(filtered_df, embedding_dims_max=50):
    """Compute vocabulary sizes and embedding widths for the four event columns.

    The vocabulary size of a column is its number of distinct values. The
    column with the largest vocabulary gets `embedding_dims_max` dimensions;
    the others are scaled proportionally (truncated), but never below 1 so
    that no embedding ends up with a zero-width output.

    Args:
        filtered_df: DataFrame containing 'event_name_1', 'event_type_1',
            'event_name_2' and 'event_type_2'.
        embedding_dims_max: Embedding width assigned to the largest vocabulary.

    Returns:
        Tuple (vocab_size, embedding_dims), two lists in the column order above.

    Raises:
        ValueError: If the frame has no rows, so no vocabulary can be derived.
    """
    event_columns = ('event_name_1', 'event_type_1', 'event_name_2', 'event_type_2')

    # Count the unique entries of each event column
    vocab_size = [len(filtered_df[col].unique()) for col in event_columns]

    largest_vocab = max(vocab_size)
    if largest_vocab == 0:
        raise ValueError("Cannot derive event vocabulary sizes from an empty DataFrame")

    # Scale each embedding width relative to the largest vocabulary, at least 1
    embedding_dims = [
        max(1, int(embedding_dims_max * (size / largest_vocab)))
        for size in vocab_size
    ]

    return vocab_size, embedding_dims

In [17]:
# Encode categorical/event columns and scale the numerical ones
def prepare_df(df_all_data):
    """Encode and scale the columns of a block frame in place and return it.

    Steps, in order:
      1. 'id' is converted to its category codes.
      2. Every column in EVENT_COLS is cast to str, label-encoded with its own
         LabelEncoder and stored as int8.
      3. 'sell_price' is cast to float32 and min-max scaled (the scaler is fit
         on the frame passed in).
      4. Every column in SALES_AMOUNT_COLS is transformed with log1p.

    Args:
        df_all_data: DataFrame to transform; it is modified in place.

    Returns:
        The same DataFrame object after the transformations.
    """
    code_columns = ['id']  # other candidates: 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'snap_CA', 'snap_TX', 'snap_WI', 'is_available'
    scaled_columns = ['sell_price']

    # Replace categorical values by their integer category codes
    for name in code_columns:
        as_category = df_all_data[name].astype('category')
        df_all_data[name] = as_category.cat.codes

    # Label-encode each event column with an encoder of its own
    for name in EVENT_COLS:
        event_encoder = LabelEncoder()
        event_labels = df_all_data[name].astype(str)
        df_all_data[name] = event_encoder.fit_transform(event_labels).astype('int8')

    # Min-max scale the numerical columns
    price_scaler = MinMaxScaler()
    prices = df_all_data[scaled_columns].astype(np.float32)
    df_all_data[scaled_columns] = price_scaler.fit_transform(prices)

    # Sales columns are log1p-transformed instead of min-max scaled
    df_all_data[SALES_AMOUNT_COLS] = df_all_data[SALES_AMOUNT_COLS].apply(np.log1p)

    return df_all_data

In [18]:
def train_test_split(df_all_data):
    """Split a block frame by day index according to the global STATUS.

    'training':   train = days <= TRAIN_END; validation = the DAYS_PER_SEQUENCE
                  days before TRAIN_END's successor up to VAL_END; test = None.
    'production': train = days <= VAL_END; test = the DAYS_PER_SEQUENCE days
                  before VAL_END's successor up to TEST_END; validation = None.

    The validation/test frames start DAYS_PER_SEQUENCE days early because each
    target day needs a full input sequence in front of it.

    Args:
        df_all_data: DataFrame with a day-index column 'd'.

    Returns:
        Tuple (df_train, df_val, df_test); the frame not used by the current
        STATUS is None. All returned frames have a fresh 0..n-1 index.

    Raises:
        ValueError: If STATUS is neither 'training' nor 'production'.
    """
    day = df_all_data['d']
    df_train = df_val = df_test = None

    if STATUS == 'training':
        df_train = df_all_data[day <= TRAIN_END].reset_index(drop=True)
        val_rows = (day > TRAIN_END - DAYS_PER_SEQUENCE) & (day <= VAL_END)
        df_val = df_all_data[val_rows].reset_index(drop=True)
    elif STATUS == 'production':
        df_train = df_all_data[day <= VAL_END].reset_index(drop=True)
        test_rows = (day > VAL_END - DAYS_PER_SEQUENCE) & (day <= TEST_END)
        df_test = df_all_data[test_rows].reset_index(drop=True)
    else:
        raise ValueError(f"Unknown STATUS {STATUS!r}; expected 'training' or 'production'")

    # Note: deleting the local name df_all_data here would not free memory,
    # because the caller still holds a reference to the frame.
    return df_train, df_val, df_test

In [19]:
### Create x and y in one go without the generator version autogeneration ###
def create_x_y(df, num_block_items):
    """Build sliding-window model inputs and next-day targets from a block frame.

    The row arithmetic below treats the frame as consecutive groups of
    `num_block_items` rows, one group per day.
    NOTE(review): this assumes rows are sorted by day and then by item, with
    every item present on every day - confirm against the upstream notebook.

    Args:
        df: Block frame containing ONCE_ONLY_FEATURES, EVENT_COLS,
            REPEATED_FEATURES and TARGET_COL.
        num_block_items: Number of rows (items) per day.

    Returns:
        Tuple (train_x, train_y). train_x is a list of five arrays: the
        numerical sequences followed by one array per event column
        (columns 0..3 of EVENT_COLS). train_y holds, per sequence, the
        TARGET_COL values of the day following the sequence.
    """
    length_days = len(df) // num_block_items
    x = []
    y = []
    events = []

    # One sequence per start day; the last DAYS_PER_SEQUENCE days cannot start a sequence
    for i in range(0, length_days - DAYS_PER_SEQUENCE):
        start_ind = i * num_block_items
        end_ind = start_ind + num_block_items * (DAYS_PER_SEQUENCE)  # exclusive end of the window = first row of the target day

        # Extract once-only features: one row per day (stride num_block_items) for all days in the sequence
        once_features = df.iloc[start_ind:end_ind:num_block_items][ONCE_ONLY_FEATURES].to_numpy()

        # Get event columns, again one row per day
        event_features = df.iloc[start_ind:end_ind:num_block_items][EVENT_COLS].to_numpy()

        # Extract repeated features for all items and days at once: (DAYS_PER_SEQUENCE * num_block_items) rows
        repeated_features_stack = df.iloc[start_ind:end_ind][REPEATED_FEATURES].to_numpy()

        # Reshape to a 3D array: (days, items per day, repeated features)
        reshaped_3d = repeated_features_stack.reshape(DAYS_PER_SEQUENCE, num_block_items, len(REPEATED_FEATURES))

        # Flatten items and features into one axis: (days, items * repeated features)
        final_array = reshaped_3d.reshape(DAYS_PER_SEQUENCE, -1)

        # Combine once-only and repeated features
        batch_sequences = np.concatenate((once_features, final_array), axis=1)

        # Extract targets: TARGET_COL of every item on the day right after the window
        batch_targets = df.iloc[end_ind:end_ind + num_block_items][[TARGET_COL]].to_numpy().flatten()

        # Append to x, y and events
        x.append(batch_sequences)
        events.append(event_features)
        y.append(batch_targets)

    train_x = np.array(x)
    train_event_x = np.array(events)
    train_y = np.array(y)

    # Split the event array into one input per event column (four columns are hard-coded here)
    train_x = [train_x, train_event_x[:,:,0], train_event_x[:,:,1], train_event_x[:,:,2], train_event_x[:,:,3]]

    return train_x, train_y


In [20]:
# Get the training data and labels array for the LSTM model
def get_x_and_y(df_train, df_val, df_test, num_block_items):
    """Create model inputs/targets for training and, when training, for validation.

    Args:
        df_train: Training frame, passed to create_x_y().
        df_val: Validation frame; only used when STATUS == 'training'.
        df_test: Accepted for interface compatibility; not used here.
        num_block_items: Number of items (rows) per day in the frames.

    Returns:
        Tuple (train_x, train_y, val_x, val_y). val_x and val_y are None
        unless STATUS == 'training'.
    """
    train_x, train_y = create_x_y(df_train, num_block_items)

    # Validation arrays only exist in training mode; default to None otherwise
    val_x, val_y = None, None
    if STATUS == 'training':
        val_x, val_y = create_x_y(df_val, num_block_items)

    return train_x, train_y, val_x, val_y

In [21]:
# Custom RMSE loss function
def rmse(y_true, y_pred):
    """Root mean squared error over all elements of the two tensors.

    Uses TensorFlow ops directly; the Keras backend alias `K` is not imported
    anywhere in this notebook.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predictions, broadcastable against y_true.

    Returns:
        Scalar tensor with the square root of the mean squared difference.
    """
    return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))

In [22]:
class ResetStatesCallback(Callback):
    """Keras callback that calls `reset_states()` on the model at the end of every epoch.

    Not referenced by the cells shown in this notebook.
    """
    def on_epoch_end(self, epoch, logs=None):
        # Clear the model's recurrent state so the next epoch starts fresh
        self.model.reset_states()

In [23]:
# Perform feature engineering
"""
##### These columns have to be updated in the next notebook based on the predictions made by the model #####
- 1-day lag #float 64
- moving averages over 7 and 28 days #float 64
- is there a price reduction?
- is there a price increase?
- adjust for inflation?
- consumer sentiment
- holiday
- weather
- 
"""
def feature_engineering(df, num_block_items):
    """Add lag, moving-average and zero-sales features to a sales frame.

    Columns added (the frame is modified in place and returned):
      - sales_amount_lag_1: 'sales_amount' shifted down by `num_block_items`
        rows. NOTE(review): this equals the previous day's value only if the
        frame holds exactly `num_block_items` rows per day in a stable item
        order - confirm against the caller.
      - sales_amount_moving_avg_7 / _28: per-id rolling mean; the leading
        NaNs of each id are back-filled from that id's own first complete
        window (never from another id).
      - zero_sales_available: 1 where sales_amount == 0 and is_available == 1.
      - consecutive_zero_sales: position within the current run of
        zero-sales-while-available days of an id, 0 on all other days. A run
        that follows a non-zero day counts 1, 2, 3, ...; a run that starts on
        the very first row of an id counts 0, 1, 2, ...

    Args:
        df: DataFrame with 'id', 'sales_amount' and 'is_available' columns.
        num_block_items: Row offset used for the one-day lag.

    Returns:
        The DataFrame with the feature columns added.
    """
    ################## lag 1 day sales amount ###############################
    # The first day's values are NaN after shifting
    df['sales_amount_lag_1'] = df['sales_amount'].shift(num_block_items)

    ################## moving average 7 and 28 days #########################
    for window in (7, 28):
        avg_col = f'sales_amount_moving_avg_{window}'
        df[avg_col] = df.groupby('id')['sales_amount'].transform(
            lambda sales, w=window: sales.rolling(window=w).mean())
        # Back-fill inside each id so the first incomplete windows take that
        # id's own first average rather than a neighbouring id's value.
        df[avg_col] = df.groupby('id')[avg_col].bfill()

    ################## zero sales while available and run length ############
    # Step 1: mark zero-sales days where the item is available
    is_zero_sale = (df['sales_amount'] == 0) & (df['is_available'] == 1)
    df['zero_sales_available'] = is_zero_sale.astype(np.int8)

    # Step 2: every non-zero-sale row starts a new block within its id
    block_id = (~is_zero_sale).groupby(df['id']).cumsum()

    # Step 3: position of each row inside its (id, block) group
    position_in_block = df.groupby([df['id'], block_id], sort=False, observed=True).cumcount()

    # Only zero-sale rows keep their position; every other row is reset to 0
    df['consecutive_zero_sales'] = np.where(is_zero_sale, position_in_block, 0).astype(np.int16)

    return df
########################################################################################################################

In [35]:
# For each store_id / dept_id combination: filter the data, prepare it and run the hyperparameter search
def lstm_pipeline(verbose, num_loop_start, num_loop_end):
    """Run the hyperparameter search for a range of store/department combinations.

    The logging directory (LOGGING_DIR) is deleted first so that no tuner
    state from earlier runs is reused.

    Args:
        verbose: Currently unused; kept for interface compatibility.
        num_loop_start: Index of the first combination to process (inclusive).
        num_loop_end: Index after the last combination to process (exclusive).
            Values beyond the number of available combinations are clamped.

    Returns:
        None. Results are written to CSV_PATH by start_search().
    """
    # Delete directory for a clean new run and logging
    remove_directory(LOGGING_DIR)

    # Get all data
    df_all_data = get_whole_data()

    # Get all store_id and dept_id combinations
    df_combinations_store_dep, num_combinations = get_combinations(df_all_data)

    # Clamp the requested range so that out-of-range indices cannot raise a KeyError
    first_combination = max(num_loop_start, 0)
    last_combination = min(num_loop_end, num_combinations)

    # Loop over the requested store_id and dept_id combinations and run the search for each
    for counter in range(first_combination, last_combination):
        ############## For debugging purposes ##############
        store_id = df_combinations_store_dep.loc[counter, "store_id"]
        dept_id = df_combinations_store_dep.loc[counter, "dept_id"]

        print(f'Processing {counter+1} of {num_combinations}: store_id {store_id} and dept_id {dept_id}')
        ####################################################

        # Filter df down to only the current store_id and dept_id combination
        filtered_df, num_block_items, num_features, input_shape = filter_df(df_combinations_store_dep, df_all_data, counter)

        ############## For debugging purposes ##############
        print(f'Number of ids in this batch: {num_block_items}')
        ####################################################

        # Calculate the vocab size for the embedding layers later when the model is defined.
        # Only works when num_batches is 1; otherwise a more complex calculation inside the loop is needed.
        vocab_sizes, embedding_dims = calc_vocab_size(filtered_df)

        # Prepare the data for training
        filtered_df = prepare_df(filtered_df)

        # Split the data into train, validation and test set
        df_train, df_val, df_test = train_test_split(filtered_df)

        # Create training and validation data arrays from the dataframes
        train_x, train_y, val_x, val_y = get_x_and_y(df_train, df_val, df_test, num_block_items)

        # Run the hyperparameter search and log the results to file
        start_search(train_x, train_y,
                     val_x, val_y,
                     input_shape, num_block_items, vocab_sizes,
                     embedding_dims, counter, store_id, dept_id,
                     csv_path=CSV_PATH)

In [25]:
# Remove the logging directory so that no old tuner state is picked up by a new search run
def remove_directory(dir_path):
    """Recursively delete `dir_path` if it is an existing directory.

    Does nothing when the path does not exist or is not a directory.

    Args:
        dir_path: Path of the directory to remove.
    """
    target = Path(dir_path)
    if not target.is_dir():
        return
    shutil.rmtree(target)

In [26]:
# Cleanup of the (large) checkpoint files after each run
def delete_checkpoint_files(checkpoint_dir):
    """Delete every file matching 'checkpoint.data*' at any depth below `checkpoint_dir`.

    Args:
        checkpoint_dir: Root directory that is searched recursively.
    """
    # '**' with recursive=True matches zero or more directory levels
    checkpoint_glob = os.path.join(checkpoint_dir, "**", "checkpoint.data*")
    stale_checkpoints = glob.glob(checkpoint_glob, recursive=True)
    for stale_path in stale_checkpoints:
        os.remove(stale_path)

In [27]:
class TweedieLoss(tf.keras.losses.Loss):
    """Tweedie loss with power parameter `p`, averaged over all elements.

    Predictions are clamped to at least 1e-8 before being raised to a power.
    The formula divides by (1 - p) and (2 - p), so p must be neither 1 nor 2.
    """

    def __init__(self, p, name="TweedieLoss"):
        """Store the power parameter `p` and register the loss under `name`."""
        super().__init__(name=name)
        self.p = p

    def call(self, y_true, y_pred):
        """Return the mean Tweedie loss for the given targets and predictions."""
        power = self.p

        # Keep predictions strictly positive
        positive_pred = tf.maximum(y_pred, 1e-8)

        # The two terms of the Tweedie loss
        target_term = -y_true * tf.pow(positive_pred, 1 - power) / (1 - power)
        prediction_term = tf.pow(positive_pred, 2 - power) / (2 - power)

        return tf.reduce_mean(target_term + prediction_term)

In [28]:
def quantile_loss(q, y_true, y_pred):
    """Quantile loss for quantile `q`, averaged over the last axis.

    Args:
        q: Quantile that scales the error (q for one sign, q - 1 for the other).
        y_true: Tensor of target values.
        y_pred: Tensor of predictions.

    Returns:
        Tensor with the mean of max(q * e, (q - 1) * e) over the last axis,
        where e = y_true - y_pred.
    """
    residual = y_true - y_pred
    weighted_positive = q * residual
    weighted_negative = (q - 1) * residual
    return tf.reduce_mean(tf.maximum(weighted_positive, weighted_negative), axis=-1)

In [29]:
def custom_loss_wrapper(p_value, q_value, loss_choice):
    """Build a Keras-compatible loss function for the chosen loss.

    Args:
        p_value: Power parameter passed to TweedieLoss ('tweedie' only).
        q_value: Quantile passed to quantile_loss ('quantile_loss' only).
        loss_choice: One of 'tweedie', 'mean_absolute_error', 'huber' or
            'quantile_loss'.

    Returns:
        A function `custom_loss(y_true, y_pred)` computing the selected loss.

    Raises:
        ValueError: If `loss_choice` is not one of the supported names. This is
            checked up front; otherwise the returned loss would silently
            return None.
    """
    supported_losses = ('tweedie', 'mean_absolute_error', 'huber', 'quantile_loss')
    if loss_choice not in supported_losses:
        raise ValueError(
            f"Unsupported loss_choice {loss_choice!r}; expected one of {supported_losses}")

    def custom_loss(y_true, y_pred):
        if loss_choice == 'tweedie':
            return TweedieLoss(p_value)(y_true, y_pred)
        if loss_choice == 'mean_absolute_error':
            return MeanAbsoluteError()(y_true, y_pred)
        if loss_choice == 'huber':
            return Huber()(y_true, y_pred)
        # Only 'quantile_loss' is left after the validation above
        return quantile_loss(q_value, y_true, y_pred)

    return custom_loss

In [30]:
# Prepare model and model params for hyperparameter tuning with Keras Tuner
class LSTMTuningModel(HyperModel):
    """Keras Tuner hypermodel: event embeddings + bidirectional LSTM + self-attention.

    The built model takes one numerical sequence input plus one integer
    sequence input per event column, and predicts one value per item of the
    block (`num_block_items` outputs).

    Hyperparameter naming: the dropout after the bidirectional layer is called
    'dropout_bidirectional'. It must not share a name with the per-layer
    'dropout_{i}' parameters of the stacked LSTM layers, otherwise Keras Tuner
    treats both as one hyperparameter and the two rates can no longer be tuned
    independently.

    NOTE(review): the output layer is linear while TweedieLoss clamps
    predictions to a small positive value - confirm this is intended.
    """

    def __init__(self, input_shape, num_block_items, vocab_sizes, embedding_dims, metrics, batch_size):
        """Store the model dimensions and training settings.

        Args:
            input_shape: Shape (time steps, features) of the numerical input.
            num_block_items: Number of outputs of the final Dense layer.
            vocab_sizes: Vocabulary size per event column (Embedding input_dim).
            embedding_dims: Embedding width per event column (Embedding output_dim).
            metrics: Metric passed to `model.compile`.
            batch_size: Batch size used for training; scales the learning rate
                and the decay steps.
        """
        super().__init__()
        self.input_shape = input_shape
        self.num_block_items = num_block_items
        self.vocab_sizes = vocab_sizes
        self.embedding_dims = embedding_dims
        self.metrics = metrics
        self.batch_size = batch_size

    def build(self, hp):
        """Build and compile the model for one set of hyperparameters.

        Args:
            hp: Keras Tuner HyperParameters object used to sample the values.

        Returns:
            The compiled Keras model.
        """
        ################################ Hyperparameters ################################
        # Give global min max values for layers
        num_lstm_layers_min = 0
        num_lstm_layers_max = 3

        lstm_units_min      = 32
        lstm_units_max      = 256
        lstm_step_size      = 32

        activation_choice   = ['relu', 'tanh', 'sigmoid']
        activation_choice_bidirectional = ['tanh', 'sigmoid']
        self_att_activation = ['sigmoid', 'softmax', 'tanh', 'linear']

        recurr_dropout_min  = 0.1
        recurr_dropout_max  = 0.5
        recurr_dropout_step = 0.1

        l2_reg_min          = 1e-4
        l2_reg_max          = 1e-2

        dropout_rate_min    = 0.1
        dropout_rate_max    = 0.5
        dropout_rate_step   = 0.1

        # Learning rate, scaled up with the batch size (factor 1 for batch size 32)
        initial_learning_rate = hp.Float('initial_learning_rate', 1e-4, 1e-2, sampling='log') * (math.sqrt(self.batch_size / 32) * 2 - 1)

        # Loss
        p_value               = hp.Float('p_value', 1.1, 1.2, step=0.1)
        q_value               = hp.Float('quantile', 0.8, 0.95, step=0.05)
        loss_choice           = hp.Choice('loss_function', values=['tweedie']) #'quantile_loss', 'mean_absolute_error', 'huber',
        ####################################################################################

        ############################# Model architecture #####################################
        # Inputs: one numerical sequence plus one integer sequence per event column
        num_event_inputs = len(self.vocab_sizes)
        numerical_input = Input(shape=self.input_shape, name='numerical_input')
        event_input = [Input(shape=(DAYS_PER_SEQUENCE,), name=f'event_input_{i}')
                       for i in range(1, num_event_inputs + 1)]

        initializer = GlorotNormal(seed=42)

        # Embeddings, one per event input
        cat_embeddings = [Embedding(input_dim=self.vocab_sizes[i],
                                    output_dim=self.embedding_dims[i],
                                    input_length=DAYS_PER_SEQUENCE,
                                    embeddings_initializer=initializer)(event_input[i])
                          for i in range(num_event_inputs)]

        combined_input = concatenate([numerical_input] + cat_embeddings)

        # Define the layers, at least 1 Bidirectional layer
        lstm_out = Bidirectional(
                    LSTM(units=hp.Int('lstm_units_bidirectional', lstm_units_min, lstm_units_max, lstm_step_size),
                        activation=hp.Choice('activation_bidirectional', values=activation_choice_bidirectional),
                        return_sequences=True,
                        recurrent_dropout=hp.Float('recurrent_dropout_bidirectional', recurr_dropout_min, recurr_dropout_max, recurr_dropout_step),
                        kernel_regularizer=l2(hp.Float('l2_reg_bidirectional', l2_reg_min, l2_reg_max, sampling='log')),
                        kernel_initializer=initializer))(combined_input)
        # Own name so it cannot collide with 'dropout_1' of the stacked layers below
        lstm_out = Dropout(hp.Float('dropout_bidirectional', dropout_rate_min, dropout_rate_max, dropout_rate_step))(lstm_out)

        # Varying numbers of following LSTM layers possible
        for i in range(0, hp.Int('num_lstm_layers', num_lstm_layers_min, num_lstm_layers_max)):
            lstm_out = LSTM(
                units=hp.Int(f'lstm_units_layer_{i}', lstm_units_min, lstm_units_max, lstm_step_size),
                activation=hp.Choice(f'activation_lstm_layer_{i}', values=activation_choice),
                return_sequences=True,
                recurrent_dropout=hp.Float(f'lstm_dropout_layer_{i}', recurr_dropout_min, recurr_dropout_max, recurr_dropout_step),
                kernel_regularizer=l2(hp.Float(f'l2_reg_lstm_layer_{i}', l2_reg_min, l2_reg_max, sampling='log')),
                kernel_initializer=initializer)(lstm_out)
            lstm_out = Dropout(hp.Float(f'dropout_{i}', dropout_rate_min, dropout_rate_max, dropout_rate_step))(lstm_out)

        # Attention layer, then average over the time axis
        attention_out = SeqSelfAttention(
                            attention_activation=hp.Choice('activation_self_att',
                            values=self_att_activation),
                            kernel_initializer=initializer)(lstm_out)
        pooled_output = GlobalAveragePooling1D()(attention_out)

        # One output per item of the block
        output = Dense(
                    self.num_block_items,
                    kernel_regularizer=l2(hp.Float('l2_reg_dense', l2_reg_min, l2_reg_max, sampling='log')),
                    kernel_initializer=initializer)(pooled_output)

        model = Model(inputs=[numerical_input] + event_input, outputs=output)

        ####################################################################################

        # Exponentially decaying learning rate; decay steps grow with the batch size
        lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=initial_learning_rate,
            decay_steps=int(1000*math.sqrt(self.batch_size / 32)),
            decay_rate=0.9)

        model.compile(
            optimizer=Adam(learning_rate=lr_schedule, clipvalue=0.5),
            loss=custom_loss_wrapper(p_value, q_value, loss_choice),
            metrics=[self.metrics])

        return model

In [31]:
def create_tuner(input_shape, num_block_items, vocab_sizes, embedding_dims, counter, store_id, dept_id, batch_size, directory, max_trials=100):
    """Create a Bayesian-optimization tuner for one combination and batch size.

    Args:
        input_shape: Shape of the numerical model input.
        num_block_items: Number of model outputs (items in the block).
        vocab_sizes: Vocabulary size per event column.
        embedding_dims: Embedding width per event column.
        counter: Index of the store/department combination (used in the project name).
        store_id: Store identifier (used in the project name).
        dept_id: Department identifier (used in the project name).
        batch_size: Training batch size; forwarded to the hypermodel and the project name.
        directory: Directory in which the tuner stores its state.
        max_trials: Maximum number of hyperparameter combinations to test.

    Returns:
        A configured keras_tuner BayesianOptimization tuner with objective 'val_loss'.
    """
    hypermodel = LSTMTuningModel(
        input_shape=input_shape,
        num_block_items=num_block_items,
        vocab_sizes=vocab_sizes,
        embedding_dims=embedding_dims,
        metrics=MeanAbsoluteError(),
        batch_size=batch_size)

    # One tuner project per combination and batch size keeps their states separate
    project_name = (f'tuning_combination_{counter}_storeId_{store_id}'
                    f'_deptId_{dept_id}_batch_size_{batch_size}')

    # Set up the tuner (Bayesian optimization over the hypermodel's search space)
    tuner = BayesianOptimization(
        hypermodel,
        objective='val_loss',
        max_trials=max_trials,
        executions_per_trial=1,
        directory=directory,
        project_name=project_name)

    return tuner

In [32]:
def start_search(train_x, train_y, val_x, val_y, input_shape, num_block_items, vocab_sizes, embedding_dims, counter, store_id, dept_id, csv_path):
    """Run one tuner search per batch size and log the three best trials of each.

    For every batch size in [32, 64, 128, 256] a fresh tuner is created and
    searched; the three best trials (rank, identifiers, batch size, score and
    hyperparameter values) are collected and checkpoint files below LOGGING_DIR
    are deleted. The collected rows are written to `csv_path` at the end - also
    when a later batch size fails, so finished searches are not lost.

    Args:
        train_x, train_y: Training inputs and targets.
        val_x, val_y: Validation inputs and targets.
        input_shape, num_block_items, vocab_sizes, embedding_dims: Model
            dimensions forwarded to create_tuner().
        counter, store_id, dept_id: Identify the combination; used for the
            tuner project name and the result rows.
        csv_path: CSV file the result rows are appended to.
    """
    batch_sizes = [32, 64, 128, 256]
    search_results = []

    # NOTE(review): min_delta=0.4 is an absolute change in val_loss - confirm it
    # matches the scale of the loss, otherwise training stops after `patience` epochs.
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=3,
        min_delta=0.4,      # Minimum change to qualify as an improvement
        restore_best_weights=True)

    try:
        for batch_size in batch_sizes:
            # Create and configure the hypermodel and tuner for the current combination
            tuner = create_tuner(input_shape,
                                 num_block_items,
                                 vocab_sizes,
                                 embedding_dims,
                                 counter,
                                 store_id,
                                 dept_id,
                                 batch_size=batch_size,
                                 directory=LOGGING_DIR)
            # Start the search
            tuner.search(
                train_x,
                train_y,
                epochs=25,
                batch_size=batch_size,
                validation_data=(val_x, val_y),
                callbacks=[early_stopping],
                verbose=1)

            # Record the best 3 trials together with their rank
            best_trials = tuner.oracle.get_best_trials(num_trials=3)
            for rank, trial in enumerate(best_trials, start=1):
                result = {
                    'rank': rank,
                    'store_id': store_id,
                    'dept_id': dept_id,
                    'batch_size': batch_size,
                    'val_loss': trial.score
                }
                # Add each hyperparameter of the trial to the result row
                result.update(trial.hyperparameters.values)
                search_results.append(result)

            delete_checkpoint_files(LOGGING_DIR)
    finally:
        # Write whatever has been collected, even if a search above raised
        if search_results:
            append_search_results_to_csv(search_results, csv_path)

In [33]:
def append_search_results_to_csv(search_results, csv_path):
    """Append search result rows to a CSV file, keeping columns aligned.

    Trials can carry different sets of hyperparameters, so the columns of a
    new batch of rows may differ from the file's header. When they match, the
    rows are appended as-is; when they differ, the file is rewritten with the
    union of the columns so that no value ends up under the wrong header.

    Args:
        search_results: List of dicts, one per result row.
        csv_path: Path of the CSV file; its parent directory is created if needed.
    """
    new_rows = pd.DataFrame(search_results)
    if new_rows.empty:
        return

    # The logging directory may have been removed before the run
    parent_dir = os.path.dirname(csv_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Read only the header of an existing file to compare column layouts
    existing_columns = None
    if os.path.exists(csv_path):
        try:
            existing_columns = list(pd.read_csv(csv_path, nrows=0).columns)
        except pd.errors.EmptyDataError:
            existing_columns = None

    if not existing_columns:
        # Create a new file with header
        new_rows.to_csv(csv_path, mode='w', header=True, index=False)
    elif list(new_rows.columns) == existing_columns:
        # Same layout: append without writing the header
        new_rows.to_csv(csv_path, mode='a', header=False, index=False)
    else:
        # Different layout: merge on column names and rewrite the whole file.
        # Existing cells are read as plain text so they are written back unchanged.
        existing_rows = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
        combined = pd.concat([existing_rows, new_rows], ignore_index=True, sort=False)
        combined.to_csv(csv_path, mode='w', header=True, index=False)

In [36]:
# Run the hyperparameter search for combinations 0..69.
# Note: lstm_pipeline() has no return statement, so forecast_df is None; the
# results are written to CSV_PATH instead.
forecast_df = lstm_pipeline(verbose=0, num_loop_start=0, num_loop_end=70)

In [None]:
# os.remove(src_dir + 'state.db')