In [1]:
# Settings to adjust before each run:
MODEL_NAME = 'V3_ohne_Cat_features_block_items'  # Label for this model variant (not referenced in the visible code)
CODE_ENV = 'local' #'kaggle', 'aws', 'local' -> selects which directory layout is configured below
STATUS = 'training' #'training', 'production' -> selects train/validation vs. train/test handling

In [2]:
#Import data handling libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (Input, LSTM, Dense, Embedding, Dropout, Reshape, 
                                     concatenate, Flatten, Bidirectional, GlobalAveragePooling1D)
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.initializers import GlorotNormal
from tensorflow.keras.callbacks import Callback
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from keras_self_attention import SeqSelfAttention

In [3]:
# Check if a GPU is visible to TensorFlow and whether this build has CUDA support.
# Uses the stable tf.config.list_physical_devices API instead of the `experimental` alias.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print(tf.test.is_built_with_cuda())

Num GPUs Available:  0
False


In [4]:
# Specify directories for the selected execution environment (CODE_ENV)
if CODE_ENV in ('local', 'aws'):
    # The local machine and the AWS instance share one project layout; only the root differs
    if CODE_ENV == 'local':
        parent_dir = '/Users/mf/Desktop/CS/Studies/7_Final_Project/Kaggle_M5PointPrediction'
    else:
        parent_dir = '/home/ubuntu/projects/Kaggle_M5PointPrediction'

    # Directory resources
    res_dir = parent_dir + '/res/'
    src_dir = parent_dir + '/src/'
    prc_dir = src_dir + 'processed_data/' # Processed data directory with pickled dataframes
    sub_dir = src_dir + 'submissions/' # Directory to save submission files

elif CODE_ENV == 'kaggle':
    ###On Kaggle###
    res_dir = '/kaggle/input/m5-forecasting-accuracy/'
    prc_dir = '/kaggle/input/processed-data/'
    src_dir = '/kaggle/working/'
    sub_dir = src_dir + 'submissions/'

else:
    # Fail here with a clear message instead of a NameError on the first use of prc_dir/sub_dir
    raise ValueError(f"Unknown CODE_ENV {CODE_ENV!r}; expected 'local', 'kaggle' or 'aws'")

In [5]:
# Paths of the pickled input frames (all located in the processed-data directory)
VALIDATION_DATA  = prc_dir +'df_1.pkl' # Validation data (prepended to the forecast in write_to_csv)
BASE      = prc_dir +'df_2.pkl' # Base data
CALENDAR  = prc_dir +'df_3.pkl' # Calendar data
# NUM_ITEMS = 30490 # Number of items per each day

DAYS_PER_SEQUENCE = 1  # Length of each input sequence, in days
MAX_BATCH_SIZE = 900 # Maximum number of ids to be used in each batch to avoid memory issues and curse of dimensionality


TARGET_COL = 'sales_amount'  # Column that is predicted and overwritten during the rolling forecast
# REPEATED_FEATURES = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'sales_amount', 'sell_price', 'is_available'] # Earlier variant including the categorical id columns
REPEATED_FEATURES = ['sales_amount', 'sell_price', 'is_available',
                     'sales_amount_moving_avg_7', 'sales_amount_moving_avg_28', 'sales_amount_lag_1',
                     'zero_sales_available', 'consecutive_zero_sales'] # Feature columns that are taken once per item and day
# Columns that prepare_df transforms with log1p
SALES_AMOUNT_COLS = ['sales_amount', 'sales_amount_moving_avg_7', 'sales_amount_moving_avg_28', 'sales_amount_lag_1']
# ONCE_ONLY_FEATURES = ['d', 'wday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'mday', 'week', 'month', 'year', 'snap_CA', 'snap_TX', 'snap_WI'] # Earlier variant using the raw calendar columns
ONCE_ONLY_FEATURES = ['snap_CA', 'snap_TX', 'snap_WI', 'mday_normalized', 'day_continuous_normalized',
                      'month_sin', 'month_cos', 'wday_sin', 'wday_cos', 'week_sin', 'week_cos', 
                      'year_normalized', ] # Feature columns that are taken once per day (not repeated for each item)
# Event columns: label-encoded in prepare_df and returned as a separate array by create_x_y
EVENT_COLS = ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']
EVENT_LEN = len(EVENT_COLS)

In [6]:
# TEST_END is used as an exclusive upper bound on the day index 'd' (see train_test_split):
# 1969 in production, 1941 in training
if STATUS=='production':
    TEST_END = 1969
elif STATUS=='training':
    TEST_END = 1941

# Splitting the data into train, validation and test sets; days are 0-based, so they have to be shifted by 1
# Duration in days of the validation and test sets
VAL_DUR   = 28
TEST_DUR  = 28

# End days (exclusive) of the validation and training sets, derived backwards from TEST_END
VAL_END   = TEST_END - TEST_DUR
TRAIN_END = VAL_END - VAL_DUR # 1885 in training mode -> train only until the 28 days before the validation window

# Number of training sequences; in the visible code only referenced by the commented-out generator-based fit call
TRAIN_DUR = TRAIN_END - DAYS_PER_SEQUENCE

In [7]:
# Load the pickled base and calendar frames and join them column-wise
def get_whole_data():
    """Read the BASE and CALENDAR pickles and return them concatenated along the columns.

    NOTE(review): pd.read_pickle executes arbitrary code from the file; only use
    pickles produced by this project.
    """
    base_frame = pd.read_pickle(BASE)
    calendar_frame = pd.read_pickle(CALENDAR)
    return pd.concat([base_frame, calendar_frame], axis=1)

In [8]:
# df_all_data = get_whole_data()

In [9]:
# item_id = "HOUSEHOLD_1_474"
# store_id = "CA_2"

# df_all_data[(df_all_data['item_id']==item_id) & 
#             (df_all_data['store_id']==store_id)].head(30)

In [10]:
# Return a df with all unique combinations of store_id and dept_id
def get_combinations(df_all_data):
    """Return the distinct (store_id, dept_id) pairs of ``df_all_data``.

    The pairs keep their order of first appearance and the result carries a
    fresh 0..n-1 index.
    """
    store_dept_pairs = df_all_data.loc[:, ['store_id', 'dept_id']]
    return store_dept_pairs.drop_duplicates().reset_index(drop=True)

In [11]:
# Filter df down to only the current store_id and dept_id combination
def filter_df(df_combinations_store_dep, df_all_data, i):
    """Select the rows of one store/department combination.

    Parameters
    ----------
    df_combinations_store_dep : DataFrame with 'store_id' and 'dept_id' columns.
    df_all_data : DataFrame with at least 'store_id', 'dept_id' and 'id' columns.
    i : index label of the combination to select in ``df_combinations_store_dep``.

    Returns
    -------
    filtered_df : matching rows of ``df_all_data`` with a fresh 0..n-1 index.
    ids : array of the distinct 'id' values among those rows (first-appearance order).
    num_batches : number of batches needed to cover ``ids`` at MAX_BATCH_SIZE ids per batch.
    """
    store_id = df_combinations_store_dep.loc[i, 'store_id']
    dept_id = df_combinations_store_dep.loc[i, 'dept_id']

    # Build the boolean mask once; it scans the whole frame
    mask = (df_all_data['store_id'] == store_id) & (df_all_data['dept_id'] == dept_id)
    filtered_df = df_all_data[mask].reset_index(drop=True)
    ids = filtered_df['id'].drop_duplicates().values

    # Calculate number of batches
    num_batches = int(np.ceil(len(ids) / MAX_BATCH_SIZE))

    return filtered_df, ids, num_batches

In [12]:
def calc_vocab_size(filtered_df, embedding_dims_max=50):
    """Derive vocabulary sizes and embedding widths for the four event columns.

    Parameters
    ----------
    filtered_df : DataFrame containing 'event_name_1', 'event_type_1',
        'event_name_2' and 'event_type_2'.
    embedding_dims_max : embedding width assigned to the column with the largest
        vocabulary; the other columns are scaled proportionally.

    Returns
    -------
    vocab_size : list with the number of distinct values per event column
        (missing values count as one value, as with ``Series.unique``).
    embedding_dims : list with the embedding width per event column, never below 1.
    """
    event_columns = ('event_name_1', 'event_type_1', 'event_name_2', 'event_type_2')

    # Count the unique entries of every event column
    vocab_size = [len(filtered_df[column].unique()) for column in event_columns]
    largest_vocab = max(vocab_size)

    # Scale the width with the relative vocabulary size. Clamp to 1 because the
    # truncation can otherwise produce 0, which is not a valid embedding width.
    embedding_dims = [max(1, int(embedding_dims_max * (size / largest_vocab)))
                      for size in vocab_size]

    return vocab_size, embedding_dims

In [13]:
def filtered_df_batches(filtered_df, ids, num_batches, counter):
    """Cut out the ``counter``-th batch of ids and derive the model input dimensions.

    Returns a tuple ``(filtered_df_batch, num_block_items, num_features, input_shape, ids_batch)``:
    the rows belonging to the batch (fresh index), the number of ids in the batch,
    the width of one time step, the ``(DAYS_PER_SEQUENCE, num_features)`` input
    shape, and the ids of the batch.
    """
    # Every batch but the last is capped at MAX_BATCH_SIZE ids; the last one takes the remainder
    first = counter * MAX_BATCH_SIZE
    stop = (counter + 1) * MAX_BATCH_SIZE if counter < num_batches - 1 else None
    ids_batch = ids[first:stop]

    # Keep only the rows whose id belongs to this batch
    in_batch = filtered_df['id'].isin(ids_batch)
    filtered_df_batch = filtered_df[in_batch].reset_index(drop=True)

    # One time step holds the once-only features plus the repeated features of every item
    num_block_items = len(ids_batch)
    num_features = len(ONCE_ONLY_FEATURES) + len(REPEATED_FEATURES) * num_block_items

    return (filtered_df_batch, num_block_items, num_features,
            (DAYS_PER_SEQUENCE, num_features), ids_batch)

In [14]:
# create a dataframe that stores only th 5 first items for each day
# indices = np.array([np.arange(start, start + num_block_items) for start in range(0, TEST_END * NUM_ITEMS, NUM_ITEMS)]).flatten()
# df_all_data = df_all_data.iloc[indices]
# df_all_data.reset_index(drop=True, inplace=True)

In [15]:
# all_df = get_whole_data()
# show dataframe with all columns from all_df

In [16]:
# pd.set_option('display.max_columns', None)
# all_df[all_df['d']== 1530].head()

In [17]:
# Encode categorical columns, scale the price and log-transform the sales columns
def prepare_df(df_all_data):
    """Prepare the feature columns of ``df_all_data`` for the model.

    The frame is modified in place and also returned. Steps, in order:
    1. the id/flag/day columns are replaced by their pandas category codes,
    2. every column in EVENT_COLS is cast to str and label-encoded,
    3. 'sell_price' is min-max scaled (fitted on all rows that were passed in),
    4. every column in SALES_AMOUNT_COLS is transformed with log1p.
    """
    code_encoded_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'is_available',
                         'd', 'snap_CA', 'snap_TX', 'snap_WI']
    scaled_cols = ['sell_price']

    # 1. Category codes for the categorical columns
    for column in code_encoded_cols:
        df_all_data[column] = df_all_data[column].astype('category').cat.codes

    # 2. A separate label encoder per event column
    for column in EVENT_COLS:
        df_all_data[column] = LabelEncoder().fit_transform(df_all_data[column].astype(str))

    # 3. Min-max scaling of the numerical columns
    price_values = df_all_data[scaled_cols].astype(np.float32)
    df_all_data[scaled_cols] = MinMaxScaler().fit_transform(price_values)

    # 4. log1p instead of the formerly used target scaler
    df_all_data[SALES_AMOUNT_COLS] = df_all_data[SALES_AMOUNT_COLS].apply(np.log1p)

    return df_all_data

In [18]:
def train_test_split(df_all_data):
    """Split the frame by day index 'd' according to the global STATUS.

    * 'training': train on d < TRAIN_END, validate on the days up to VAL_END;
      no test frame is produced.
    * 'production': train on d < VAL_END, test on the days up to TEST_END;
      no validation frame is produced.

    The validation/test frame starts DAYS_PER_SEQUENCE days early so that the
    first predicted day has a full input sequence in front of it.

    Returns
    -------
    (df_train, df_val, df_test) -- the frame that does not apply to the current
    STATUS is None. Each returned frame has a fresh 0..n-1 index.

    Raises
    ------
    ValueError if STATUS is neither 'training' nor 'production'.
    """
    day = df_all_data['d']

    if STATUS == 'training':
        train_end, holdout_end = TRAIN_END, VAL_END
    elif STATUS == 'production':
        train_end, holdout_end = VAL_END, TEST_END
    else:
        # Previously this fell through to an UnboundLocalError at the return statement
        raise ValueError(f"Unknown STATUS {STATUS!r}; expected 'training' or 'production'")

    df_train = df_all_data[day < train_end].reset_index(drop=True)
    # More than 28 days because of the time_steps shift
    holdout_rows = (day >= train_end - DAYS_PER_SEQUENCE) & (day < holdout_end)
    df_holdout = df_all_data[holdout_rows].reset_index(drop=True)

    if STATUS == 'training':
        return df_train, df_holdout, None
    return df_train, None, df_holdout

In [19]:
### Create x, events and y in one go (non-generator version) ###
def create_x_y(df, num_block_items):
    """Build the sliding-window arrays for the model.

    ``df`` is expected to hold ``num_block_items`` consecutive rows per day,
    day after day. For every window of DAYS_PER_SEQUENCE days the function emits

    * x: per day the ONCE_ONLY_FEATURES of the day's first row followed by the
      REPEATED_FEATURES of all items, flattened item by item,
    * events: per day the EVENT_COLS of the day's first row,
    * y: the TARGET_COL values of all items on the day right after the window.

    Returns
    -------
    (x, events, y) as numpy arrays with one leading entry per window. If there are
    no complete windows, the three arrays are empty.
    """
    length_days = len(df) // num_block_items
    rows_per_window = num_block_items * DAYS_PER_SEQUENCE
    num_repeated = len(REPEATED_FEATURES)

    # Column selection and conversion do not depend on the window, so they are done
    # once instead of three times per iteration (rolling_forecast calls this
    # function once per forecast day).
    once_all = df[ONCE_ONLY_FEATURES].to_numpy()
    events_all = df[EVENT_COLS].to_numpy()
    repeated_all = df[REPEATED_FEATURES].to_numpy()
    target_all = df[[TARGET_COL]].to_numpy().ravel()

    x = []
    y = []
    events = []

    for day in range(0, length_days - DAYS_PER_SEQUENCE):
        start_ind = day * num_block_items
        end_ind = start_ind + rows_per_window  # first row of the day to predict

        # The first row of every day carries the once-only and event features
        once_features = once_all[start_ind:end_ind:num_block_items]
        event_features = events_all[start_ind:end_ind:num_block_items]

        # (days * items, features) -> (days, items, features) -> (days, items * features)
        repeated_features = repeated_all[start_ind:end_ind].reshape(
            DAYS_PER_SEQUENCE, num_block_items, num_repeated
        ).reshape(DAYS_PER_SEQUENCE, -1)

        x.append(np.concatenate((once_features, repeated_features), axis=1))
        events.append(event_features)
        y.append(target_all[end_ind:end_ind + num_block_items])

    return np.array(x), np.array(events), np.array(y)

In [20]:
### Use for batch generation input to model ###
def lstm_data_generator(df, num_block_items):
    """Endlessly yield ``(sequence, targets)`` pairs, one window at a time.

    ``sequence`` has shape (1, DAYS_PER_SEQUENCE, features): per day the
    ONCE_ONLY_FEATURES of the day's first row followed by the REPEATED_FEATURES of
    all items. ``targets`` holds the TARGET_COL values of all items on the day
    after the window. After the last window the generator starts over.
    """
    total_days = len(df) // num_block_items
    rows_per_window = num_block_items * DAYS_PER_SEQUENCE
    flat_width = num_block_items * len(REPEATED_FEATURES)

    while True:
        for first_day in range(total_days - DAYS_PER_SEQUENCE):
            lo = first_day * num_block_items
            hi = lo + rows_per_window  # first row of the day to predict
            window = df.iloc[lo:hi]

            # Once-only features come from the first row of every day in the window
            shared = window.iloc[::num_block_items][ONCE_ONLY_FEATURES].to_numpy()

            # (days * items, features) -> (days, items * features)
            per_item = window[REPEATED_FEATURES].to_numpy().reshape(DAYS_PER_SEQUENCE, flat_width)

            # Add the leading batch axis expected by the LSTM input
            sequence = np.concatenate((shared, per_item), axis=1)[np.newaxis, ...]

            targets = df.iloc[hi:hi + num_block_items][[TARGET_COL]].to_numpy().flatten()

            yield sequence, targets

In [21]:
# Build the model input and label arrays for the LSTM model
def get_x_and_y(df_train, df_val, df_test, num_block_items):
    """Create the x / event / y arrays for training and for the hold-out set.

    In 'training' status the validation arrays are built and the test arrays are
    None; in 'production' status it is the other way round.

    Returns a 9-tuple:
    (train_x, train_event_x, train_y, val_x, val_event_x, val_y, test_x, test_event_x, test_y)
    """
    # The generator variant (lstm_data_generator) is not used here; everything is built in one go
    train_arrays = create_x_y(df_train, num_block_items)

    if STATUS == 'training':
        val_arrays = create_x_y(df_val, num_block_items)
        test_arrays = (None, None, None)
    elif STATUS == 'production':
        test_arrays = create_x_y(df_test, num_block_items)
        val_arrays = (None, None, None)

    return (*train_arrays, *val_arrays, *test_arrays)

In [22]:
# Custom RMSE loss function
def rmse(y_true, y_pred):
    """Root mean squared error over all elements of the batch.

    Implemented with TensorFlow ops directly: the Keras backend alias ``K`` used
    before is never imported in this notebook, so the old version raised a
    NameError as soon as it was called.
    """
    return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))

In [23]:
class ResetStatesCallback(Callback):
    """Keras callback that calls ``reset_states()`` on the attached model after every epoch.

    In the visible code it is only referenced by the commented-out
    generator-based ``fit`` call in ``model_training``.
    """
    def on_epoch_end(self, epoch, logs=None):
        # `epoch` and `logs` are part of the callback signature and are not used here
        self.model.reset_states()

In [24]:
# For transfer learning the model is recompiled with a new (lower) learning rate
def prepare_model_tl(model, new_learning_rate, frozen_layers):
    """Optionally freeze layers and recompile ``model`` with a fresh Adam optimizer.

    Parameters
    ----------
    model : model to recompile (modified in place and returned).
    new_learning_rate : learning rate of the new optimizer.
    frozen_layers : indices into ``model.layers`` to set non-trainable; may be empty.

    The loss and metrics are taken from the module-level ``loss`` and ``metrics``.
    """
    fresh_optimizer = Adam(learning_rate=new_learning_rate)

    # An empty/None selection leaves every layer trainable
    for layer_index in (frozen_layers or ()):
        model.layers[layer_index].trainable = False

    model.compile(optimizer=fresh_optimizer, loss=loss, metrics=metrics)
    return model

In [25]:
def model_training(model, _train_x, train_event_x, train_y, _val_x, val_event_x, val_y, epochs, use_embeddings_events):
    """Fit ``model`` on the complete training arrays in one ``fit`` call.

    Parameters
    ----------
    model : compiled Keras model.
    _train_x, train_event_x, train_y : numerical inputs, event inputs and labels
        of the training set.
    _val_x, val_event_x, val_y : the same for the validation set; only used when
        STATUS is 'training'.
    epochs : number of epochs to train.
    use_embeddings_events : if True, the model is fed the numerical input plus one
        input per event column; otherwise only the numerical input.

    The batch size is taken from the module-level ``batch_size``.

    Returns
    -------
    (model, history) -- the fitted model and the object returned by ``fit``.

    Raises
    ------
    ValueError if STATUS is neither 'training' nor 'production'.
    """
    if STATUS not in ('training', 'production'):
        # Previously this surfaced as an UnboundLocalError on `history`
        raise ValueError(f"Unknown STATUS {STATUS!r}; expected 'training' or 'production'")

    fit_kwargs = dict(
        x=_assemble_model_inputs(_train_x, train_event_x, use_embeddings_events),  # Entire training dataset
        y=train_y,  # Corresponding training labels
        epochs=epochs,
        batch_size=batch_size,
    )

    # Validation data only exists in training mode
    if STATUS == 'training':
        val_inputs = _assemble_model_inputs(_val_x, val_event_x, use_embeddings_events)
        fit_kwargs['validation_data'] = (val_inputs, val_y)

    history = model.fit(**fit_kwargs)

    return model, history


def _assemble_model_inputs(numeric_x, event_x, use_embeddings_events):
    """Return the model input: the numerical array alone, or followed by one array per event column."""
    if not use_embeddings_events:
        return numeric_x
    # One (samples, days) slice per event column, in column order
    return [numeric_x] + [event_x[:, :, column] for column in range(event_x.shape[-1])]

In [26]:
# Evaluation for generator batches
def test_eval(val_generator, model, scaler_target):
    x, y = next(val_generator)
    
    prediction_original = model.predict(x)

    true_array = scaler_target.inverse_transform(y).flatten()
    predicted_array = scaler_target.inverse_transform(prediction_original)[0]
    
    d = {"true_array": true_array, "predicted_array": predicted_array}
    df = pd.DataFrame(d)
    df['predicted_array_rounded'] = df['predicted_array'].round().astype(int)
    df['Difference'] = df['true_array'] - df['predicted_array']

    print(df)

In [27]:
def eval(predictions_original, predictions_normalized, val_y, ids_batch):
    """Build a long-format comparison of predictions and actual values.

    NOTE(review): the name shadows the built-in ``eval``; it is kept because
    ``lstm_pipeline`` calls it under this name.

    Parameters
    ----------
    predictions_original : (days, items) predictions on the original sales scale.
    predictions_normalized : (days, items) predictions on the log1p scale.
    val_y : (days, items) actual values on the log1p scale.
    ids_batch : array of the item ids, in the column order of the arrays above.

    Returns
    -------
    DataFrame with one row per (day, id): the id, the 1-based day, the actual and
    predicted value on the log1p scale and on the original scale, plus their
    differences. TEST_DUR days are expected.
    """
    # Read the number of items from the array shape; indexing row 1 (as before)
    # fails when there is only a single forecast day.
    items_per_day = np.shape(predictions_normalized)[1]

    df_eval = pd.DataFrame({
        # All ids repeated once per day ...
        'id': ids_batch.tolist() * TEST_DUR,
        # ... and every day repeated once per item
        'day': np.repeat(np.arange(1, TEST_DUR + 1, dtype=np.int64), items_per_day),
        'actual_normalized': val_y.flatten(),
        'pred_normalized': predictions_normalized.flatten(),
    })
    df_eval['difference_norm'] = df_eval['actual_normalized'] - df_eval['pred_normalized']

    # Undo the log1p transform of the labels
    df_eval['actual_inv'] = np.expm1(val_y).round(0).astype(int).flatten()
    df_eval['pred_inv'] = predictions_original.flatten()
    df_eval['difference_inv'] = df_eval['actual_inv'] - df_eval['pred_inv']

    return df_eval

In [28]:
################################### Function to forecast the next 28 days (This function for case all data in one batch) ###################################
def rolling_forecast(model, df_test, df_val, test_x, test_event_x, test_y , val_x, val_y, val_event_x, num_features, num_block_items): #scaler_target
    """Forecast TEST_DUR days one day at a time, feeding each prediction back into the data.

    Depending on STATUS the test set ('production') or the validation set
    ('training') is copied and used. For every forecast day the model predicts all
    items, the prediction is written into TARGET_COL of the copied frame, and the
    input arrays are rebuilt from that frame with ``create_x_y`` so that later
    windows see the predicted values.

    The model is always called with the numerical input plus EVENT_LEN event
    inputs, i.e. it must be a model with embedding inputs.

    Returns
    -------
    predictions_original : integer array, ``expm1`` of the predictions, rounded and
        clipped at 0.
    predictions_normalized : the predictions on the scale the model outputs them.

    NOTE(review): only TARGET_COL is overwritten with predictions; other columns in
    REPEATED_FEATURES that look derived from sales (moving averages, lag) are not
    recomputed here -- confirm this is intended.
    """
    # Set the df_copy, x_copy and y_copy to the correct dataset
    if STATUS=='production':
        df_copy = df_test.copy()
        x_copy = test_x.copy()
        y_copy = test_y.copy()
        events_copy = test_event_x.copy()

    elif STATUS=='training':
        df_copy = df_val.copy()
        x_copy = val_x.copy()
        y_copy = val_y.copy()
        events_copy = val_event_x.copy()

    # return x_copy, events_copy --> (28,28,1259); (28, 28, 4)

    # Predict the next 28 days
    for i in range(TEST_DUR):
        # Inputs: window i of the numerical features plus one (1, DAYS_PER_SEQUENCE) array per event column
        prediction_normalized = model.predict([x_copy[i].reshape(1, DAYS_PER_SEQUENCE, num_features)] + 
                                              [events_copy[i][:,j].reshape(1, DAYS_PER_SEQUENCE) for j in range(EVENT_LEN)], verbose=0).flatten() 

        # Impractical to adjust the prepared array, so we update the frame copy and use it to create a new array with the updated prediction values
        # Rows of the day that was just predicted: the first DAYS_PER_SEQUENCE days are input only
        start_idx = DAYS_PER_SEQUENCE*num_block_items+(i*num_block_items)
        # .loc slices include the end label, hence the -1 (assumes the default 0..n-1 index -- TODO confirm)
        end_idx = start_idx + num_block_items - 1
        df_copy.loc[start_idx:end_idx, TARGET_COL] = prediction_normalized

        # Rebuild x and the event array from the updated frame (labels are discarded)
        x_copy, events_copy, _ = create_x_y(df_copy, num_block_items)

        # Update the y array with the new prediction
        y_copy[i] = prediction_normalized

    # y_copy now holds the predictions; invert the log1p transform for the original scale
    predictions_normalized = y_copy
    # predictions_original = scaler_target.inverse_transform(y_copy).round(0).astype(int)
    predictions_original = np.expm1(y_copy).round(0).astype(int)

    # Make sure no negative values are returned
    predictions_original[predictions_original < 0] = 0

    return predictions_original, predictions_normalized
#########################################################################################################

In [29]:
# Append the forecast of one batch to the submission-format DataFrame
def prepare_fc_to_file(forecast_df, forecast_array, ids):
    """Append one batch of forecasts to ``forecast_df`` in sample-submission layout.

    Parameters
    ----------
    forecast_df : DataFrame with the columns 'id', 'F1', ..., 'F<horizon>'.
    forecast_array : (days, items) array of forecasts.
    ids : numpy array with the item ids, in the column order of ``forecast_array``.

    Returns
    -------
    ``forecast_df`` extended by one row per id, with a fresh 0..n-1 index.
    """
    # Transpose predictions to match the sample submission format: one row per item
    forecast_array = forecast_array.T

    # Number of forecast days comes from the data instead of a hard-coded 28
    horizon = forecast_array.shape[1]

    # Put the id in front of every forecast row
    forecast_array = np.concatenate((ids.reshape(len(ids), 1), forecast_array), axis=1)

    forecast_tmp_df = pd.DataFrame(forecast_array, columns=['id'] + [f'F{day + 1}' for day in range(horizon)])

    # Concatenate forecast to forecast_df
    forecast_df = pd.concat([forecast_df, forecast_tmp_df], axis=0, ignore_index=True)

    return forecast_df

In [30]:
def write_to_csv(forecast_df, dir):
    """Write the validation rows followed by ``forecast_df`` to the CSV file ``dir``.

    The validation rows are read from the VALIDATION_DATA pickle. ``dir`` is the
    full path of the target file; the index is not written.
    """
    frames = [pd.read_pickle(VALIDATION_DATA), forecast_df]
    pd.concat(frames, axis=0, ignore_index=True).to_csv(dir, index=False)

In [31]:
# def tweedie_loss_func(p):
#     def tweedie_loglikelihood(y, y_hat):
#         loss = - y * tf.pow(y_hat, 1 - p) / (1 - p) + \
#                tf.pow(y_hat, 2 - p) / (2 - p)
#         return tf.reduce_mean(loss)
#     return tweedie_loglikelihood


def tweedie_loss_func(p):
    """Create a Tweedie loss function with power parameter ``p``.

    Parameters
    ----------
    p : power parameter. The formula divides by ``1 - p`` and ``2 - p``, so
        ``p == 1`` and ``p == 2`` are rejected.

    Returns
    -------
    A function ``loss(y_true, y_pred)`` returning the mean Tweedie loss as a tensor.

    Raises
    ------
    ValueError if ``p`` is 1 or 2.
    """
    if p == 1 or p == 2:
        # Without this check the loss silently evaluates to inf/NaN
        raise ValueError(f"Tweedie power p must not be 1 or 2 (division by zero), got {p!r}")

    def loss(y_true, y_pred):
        # Ensure predictions are strictly positive
        epsilon = 1e-10
        y_pred = tf.maximum(y_pred, epsilon)

        # Tweedie loss calculation
        per_element = -y_true * tf.pow(y_pred, 1-p) / (1-p) + \
                      tf.pow(y_pred, 2-p) / (2-p)
        return tf.reduce_mean(per_element)
    return loss

In [32]:
# New: architecture for predicting single day steps ahead without a RepeatVector
def create_lstm_model(input_shape, num_block_items):
    """Build and compile the plain (non-embedding) stacked LSTM model.

    Parameters
    ----------
    input_shape : (time steps, features) shape of one input sequence.
    num_block_items : number of items predicted per step; the output has shape
        (num_block_items, 1) per sample.

    The loss and metrics are taken from the module-level ``loss`` and ``metrics``.
    Every model gets its own optimizer, cloned from the module-level
    ``initial_optimizer``; compiling several models with the same instance would
    carry the step counter (and with it the learning-rate schedule) and the slot
    variables from one model to the next.

    NOTE(review): the output is reshaped to (num_block_items, 1) while create_x_y
    produces labels of shape (num_block_items,) per sample -- confirm the shapes
    line up before using this model.
    """
    model = Sequential([
        LSTM(units=80, activation='relu', return_sequences=True, recurrent_dropout=0.1, input_shape=input_shape),
        Dropout(0.3),
        LSTM(units=40, activation='relu', return_sequences=False, recurrent_dropout=0.1),
        Dropout(0.3),
        # LSTM(units=40, activation='tanh', return_sequences=False, recurrent_dropout=0.1),
        # Dropout(0.1),
        Dense(units=num_block_items, activation='relu'), # Final Dense layer for output
        Reshape((num_block_items, 1))]) # Reshape the output to be (number of items, 1)

    # Fresh optimizer with the same configuration as initial_optimizer
    optimizer = type(initial_optimizer).from_config(initial_optimizer.get_config())
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # For tracking purposes: check the models parameters
    # model.summary()

    return model

In [33]:
# For each store_id and dept_id combination: filter the data, train a model and forecast
def lstm_pipeline(verbose, use_embeddings_events):
    """Run the complete load -> prepare -> train -> forecast pipeline.

    Parameters
    ----------
    verbose : 1 processes only the first store/department combination and returns
        the evaluation frame built by ``eval``; 0 processes all combinations,
        writes the submission CSV and returns the forecast frame. Any other value
        processes all combinations and returns None.
    use_embeddings_events : forwarded to ``model_training`` to decide whether the
        event inputs are fed to the model. The model itself is always built by
        ``create_lstm_model_embedding`` and ``rolling_forecast`` always passes the
        event inputs.
    """
    df_all_data = get_whole_data()

    # Get all store_id and dept_id combinations
    df_combinations_store_dep = get_combinations(df_all_data)

    # Create empty dataframe to store the forecast
    forecast_df = pd.DataFrame(columns=['id'] + [f'F{i+1}' for i in range(TEST_DUR)])

    # define the number of loops
    num_loop = 1 if verbose == 1 else len(df_combinations_store_dep)

    # Loop over all store_id and dept_id combinations, create a model, train it, create the prediction and save it to a file
    for i in range(0, num_loop):
        print(f'Processing {i+1} of {len(df_combinations_store_dep)}: store_id {df_combinations_store_dep.loc[i, "store_id"]} and dept_id {df_combinations_store_dep.loc[i, "dept_id"]}')
        # Filter df down to only the current store_id and dept_id combination
        filtered_df, ids, num_batches = filter_df(df_combinations_store_dep, df_all_data, i)

        # Calculate the vocab size for the embedding layers later when model is defined
        vocab_sizes, embedding_dims = calc_vocab_size(filtered_df) # Only works if num_batches is 1; otherwise a more complex calculation has to happen inside the loop

        # Loop over all batches
        # NOTE: currently only the first batch is processed; the full range is commented out
        for counter in range(1):#num_batches):
            print(f'Processing batch {counter+1} of {num_batches}')

            # Create batches for the current store_id and dept_id combination to avoid memory issues and curse of dimensionality
            filtered_df_batch, num_block_items, num_features, input_shape, ids_batch = filtered_df_batches(filtered_df, ids, num_batches, counter)

            print(f'Number of ids in this batch: {len(ids_batch)}')

            # Prepare the data for training
            filtered_df_batch = prepare_df(filtered_df_batch) #filtered_df_batch, scaler_target

            # return filtered_df_batch


            # Split the data into train, validation and test set
            df_train, df_val, df_test = train_test_split(filtered_df_batch)

            # return df_train

            # Create training, validation and test data arrays from the dataframes
            train_x, train_event_x, train_y, val_x, val_event_x, val_y, test_x, test_event_x, test_y = get_x_and_y(df_train, df_val, df_test, num_block_items)
            # --> train event and val event return (1857, 28, 4) vs (28, 28, 4)         

            # A new model is created for the first batch and also for the last batch;
            # batches in between retrain the current model with a smaller learning rate
            if counter == 0 or counter == num_batches - 1:
                # Create the model
                model = create_lstm_model_embedding(input_shape, num_block_items, vocab_sizes, embedding_dims)

                # model = create_lstm_model(input_shape, num_block_items)
                epochs = initial_epochs
            else:
                model = prepare_model_tl(model, subsequent_lr, [])
                epochs = subsequent_epochs

            # return train_x, train_event_x, train_y

            # Training the model
            model_trained, history = model_training(model, train_x, train_event_x, train_y, 
                                                    val_x, val_event_x, val_y, 
                                                    epochs, use_embeddings_events)

            # Create the forecast
            # x, events = rolling_forecast(model_trained, df_test, df_val, test_x, test_event_x, test_y, val_x, val_y, val_event_x, scaler_target, num_features, num_block_items)
            predictions_original, predictions_normalized = rolling_forecast(model_trained, df_test, df_val, test_x, test_event_x, test_y, val_x, val_y, val_event_x, num_features, num_block_items)

            # Testing the model
            if verbose == 1:
                # Call eval function to get the evaluation dataframe and some feeling for the results
                # df_eval = eval(val_x, val_event_x, val_y, model_trained, num_features)#, scaler_target)
                df_eval = eval(predictions_original, predictions_normalized, val_y, ids_batch)#, scaler_target)

                # Test output for generator
                # test_data = test_eval(val_generator, model_trained, scaler_target)

            forecast_df = prepare_fc_to_file(forecast_df, predictions_original, ids_batch)
            print("####################################################\n")

    # verbose == 0: write the submission file and return the forecast frame
    if verbose == 0:
        write_to_csv(forecast_df, sub_dir + 'sample_submission.csv')
        return forecast_df

    # verbose == 1: return the evaluation frame of the single processed combination
    if verbose == 1:
        return df_eval

In [34]:
# Model parameters
initial_epochs = 6      # Epochs when a model is newly created
subsequent_epochs = 6   # Epochs when an existing model is retrained on a further batch
batch_size = 500        # Batch size passed to model.fit in model_training
# Learning rate schedule
initial_lr = 0.01
decay_steps = 1000
alpha = 0.001  # Floor of the schedule as a fraction of initial_lr (not an absolute learning rate)
lr_schedule = tf.keras.experimental.CosineDecay(
    initial_learning_rate=initial_lr,
    decay_steps=decay_steps,
    alpha=alpha  # Minimum learning rate value as a fraction of initial_learning_rate.
)
subsequent_lr = 0.005 # Constant learning rate for transfer learning (half of initial_lr)

clipvalue = 0.5  # Passed to the optimizer as clipvalue
initializer = GlorotNormal(seed=42)  # Shared by the layers built in create_lstm_model_embedding

# Model compile parameters (read as globals by the create_* and prepare_model_tl functions)
loss = tweedie_loss_func(p=1.2) #tf.keras.losses.MeanAbsoluteError() #rmse
initial_optimizer = Adam(learning_rate=lr_schedule, clipvalue=clipvalue)
metrics = tf.keras.metrics.MeanAbsoluteError()

In [35]:
# Use the functional API to create a model with embedded event inputs
def create_lstm_model_embedding(numerical_input_shape, num_block_items, vocab_sizes, embedding_dims): 
    """Build and compile the bidirectional LSTM model with event embeddings.

    Parameters
    ----------
    numerical_input_shape : (time steps, features) shape of the numerical input.
    num_block_items : number of items predicted per step (width of the output layer).
    vocab_sizes : vocabulary size per event column.
    embedding_dims : embedding width per event column, same length as ``vocab_sizes``.

    The model takes the numerical input plus one integer input per event column,
    named 'event_input_1', 'event_input_2', ... in the order of ``vocab_sizes``.
    The loss and metrics are taken from the module-level ``loss`` and ``metrics``.

    Every model gets its own optimizer, cloned from the module-level
    ``initial_optimizer``. The pipeline builds one model per store/department
    combination; compiling all of them with the same optimizer instance would
    carry the step counter -- and therefore the position in the learning-rate
    schedule -- and the slot variables from one model to the next.
    """
    num_event_inputs = len(vocab_sizes)

    numerical_input = Input(shape=numerical_input_shape, name='numerical_input')
    event_input = [Input(shape=(DAYS_PER_SEQUENCE,), name=f'event_input_{i}') for i in range(1, num_event_inputs + 1)]

    cat_embeddings = [Embedding(input_dim=vocab_sizes[i], output_dim=embedding_dims[i], input_length=DAYS_PER_SEQUENCE, embeddings_initializer=initializer)(event_input[i]) for i in range(0, num_event_inputs)]

    # Combine numerical input and embeddings
    combined_input = concatenate([numerical_input] + cat_embeddings)

    # LSTM layer
    lstm_out = Bidirectional(LSTM(units=32, activation='tanh', return_sequences=True, recurrent_dropout=0.2, kernel_regularizer=l2(0.1), kernel_initializer=initializer))(combined_input)
    # dropout = Dropout(0.2)(lstm_out)
    # lstm_out = Bidirectional(LSTM(units=16, activation='tanh', return_sequences=True, recurrent_dropout=0.1, kernel_regularizer=l2(0.01)))(dropout)
    # dropout = Dropout(0.2)(lstm_out)
    attention_out = SeqSelfAttention(attention_activation='sigmoid', kernel_initializer=initializer)(lstm_out)

    # Aggregate sequence information
    pooled_output = GlobalAveragePooling1D()(attention_out)

    # Output layer: one linear unit per item
    output = Dense(num_block_items, kernel_regularizer=l2(0.1), kernel_initializer=initializer)(pooled_output)

    # Create and compile the model
    model = Model(inputs=[numerical_input] + event_input, outputs=output)

    # Fresh optimizer with the same configuration as initial_optimizer
    optimizer = type(initial_optimizer).from_config(initial_optimizer.get_config())
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    return model

In [36]:
# NOTE: with verbose=1 lstm_pipeline returns the evaluation frame (df_eval) for the first
# store/department combination, not a submission-format forecast, although the result is
# stored under the name forecast_df.
forecast_df = lstm_pipeline(verbose=1, use_embeddings_events=True)

Processing 1 of 70: store_id CA_1 and dept_id HOBBIES_1
Processing batch 1 of 1
Number of ids in this batch: 416
Epoch 1/6


2024-02-26 16:15:33.917469: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
####################################################



In [37]:
# Inspect the evaluation rows of a single item id (one row per forecast day)
forecast_df[forecast_df['id']=='HOBBIES_1_005_CA_1_evaluation'].head(30)
# forecast_df.iloc[4::MAX_BATCH_SIZE,]

Unnamed: 0,id,day,actual_normalized,pred_normalized,difference_norm,actual_inv,pred_inv,difference_inv
4,HOBBIES_1_005_CA_1_evaluation,1,0.693147,0.24403,0.449117,1,0,1
420,HOBBIES_1_005_CA_1_evaluation,2,0.0,0.244013,-0.244013,0,0,0
836,HOBBIES_1_005_CA_1_evaluation,3,1.609438,0.244115,1.365323,4,0,4
1252,HOBBIES_1_005_CA_1_evaluation,4,1.609438,0.244671,1.364767,4,0,4
1668,HOBBIES_1_005_CA_1_evaluation,5,0.0,0.245054,-0.245054,0,0,0
2084,HOBBIES_1_005_CA_1_evaluation,6,0.693147,0.244986,0.448161,1,0,1
2500,HOBBIES_1_005_CA_1_evaluation,7,1.609438,0.245732,1.363706,4,0,4
2916,HOBBIES_1_005_CA_1_evaluation,8,0.0,0.244995,-0.244995,0,0,0
3332,HOBBIES_1_005_CA_1_evaluation,9,0.693147,0.245212,0.447935,1,0,1
3748,HOBBIES_1_005_CA_1_evaluation,10,0.0,0.24545,-0.24545,0,0,0
