In [1]:
# Settings to adjust before each run:
# MODEL_NAME: file stem used by the (currently commented-out) model save/load cells.
MODEL_NAME = 'V3_ohne_Cat_features_block_items'
# CODE_ENV: selects which directory layout is configured in the directory cell.
CODE_ENV = 'local' #'kaggle', 'aws', 'local'
# STATUS: 'training' holds out a validation window; 'production' trains up to the forecast window.
STATUS = 'training' #'production' 

In [2]:
#Import data handling libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Input, LSTM, Dense, Masking, RepeatVector, Dropout, Reshape
from keras.optimizers import Adam
from keras.metrics import RootMeanSquaredError
from keras import backend as K
from keras.callbacks import Callback
import tensorflow as tf

In [3]:
# Check if GPU is available (stable API; the `experimental` alias is a legacy name)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print(tf.test.is_built_with_cuda())

Num GPUs Available:  0
False


In [4]:
# Specify directories for the selected environment.
# Every branch must define res_dir, prc_dir and sub_dir: prc_dir is read when the
# pickle paths are built and sub_dir is read when the submission file is written.
if CODE_ENV == 'local':
    ###local###
    # Project root on the developer machine
    parent_dir = '/Users/mf/Desktop/CS/Studies/7_Final_Project/Kaggle_M5PointPrediction'

    #Directory resources
    res_dir = parent_dir + '/res/'
    src_dir = parent_dir + '/src/'
    prc_dir = src_dir + 'processed_data/' # Processed data directory with pickled dataframes
    sub_dir = src_dir + 'submissions/' # Directory to save submission files

elif CODE_ENV == 'kaggle':
    ###On Kaggle###
    res_dir = '/kaggle/input/m5-forecasting-accuracy/'
    prc_dir = '/kaggle/input/processed-data-v3/'
    # Previously missing: without it the pipeline raised a NameError only after all
    # models had been trained, when it tried to write the submission file.
    sub_dir = '/kaggle/working/'

elif CODE_ENV == 'aws':
    parent_dir = '/home/ubuntu/projects/Kaggle_M5PointPrediction'
    res_dir = parent_dir + '/res/'
    src_dir = parent_dir + '/src/'
    prc_dir = src_dir + 'processed_data/' # Processed data directory with pickled dataframes
    sub_dir = src_dir + 'submissions/' # Directory to save submission files

else:
    # Fail immediately instead of with an unrelated NameError in a later cell
    raise ValueError(f"Unknown CODE_ENV {CODE_ENV!r}; expected 'local', 'kaggle' or 'aws'")

In [5]:
# Create variables
# Paths to the pickled input frames inside the processed-data directory
VALIDATION_DATA  = prc_dir +'df_1.pkl' # Validation data (prepended to the forecast in write_to_csv)
BASE      = prc_dir +'df_2.pkl' # Base data
CALENDAR  = prc_dir +'df_3.pkl' # Calendar data
NUM_ITEMS = 30490 # Number of items per each day
DAYS_PER_SEQUENCE = 10  # Length (in days) of each input sequence fed to the LSTM
MAX_BATCH_SIZE = 100 # Maximum number of ids to be used in each batch to avoid memory issues and curse of dimensionality
TARGET_COL = 'sales_amount' # Column that is predicted
# Previously used feature set, kept for reference:
# REPEATED_FEATURES = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'sales_amount', 'sell_price', 'is_available'] # List to hold all feature columns that are used for each item
REPEATED_FEATURES = ['sales_amount', 'sell_price', 'is_available'] # List to hold all feature columns that are used for each item
# Previously used feature set, kept for reference:
# ONCE_ONLY_FEATURES = ['d', 'wday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 'mday', 'week', 'month', 'year', 'snap_CA', 'snap_TX', 'snap_WI'] # List to hold feature columns that are not repeated for each item
ONCE_ONLY_FEATURES = ['snap_CA', 'snap_TX', 'snap_WI', 'mday_normalized', 'month_sin', 'month_cos', 'wday_sin', 'wday_cos', 'week_sin', 'week_cos', 'year_normalized'] # List to hold feature columns that are not repeated for each item

In [6]:
# TEST_END is the exclusive upper day bound of the run: 1969 in production, 1941 in training
if STATUS == 'production':
    TEST_END = 1969
elif STATUS == 'training':
    TEST_END = 1941
else:
    # Fail here rather than with a NameError on TEST_END a few lines below
    raise ValueError(f"Unknown STATUS {STATUS!r}; expected 'training' or 'production'")

# Splitting the data in train, validation and test set; days are now 0 based, so have to shift by 1
# Define duration in days of each set
VAL_DUR = 28
TEST_DUR = 28

# Define end days (exclusive) of the validation and training windows
VAL_END = TEST_END - TEST_DUR
TRAIN_END = VAL_END - VAL_DUR  # 1885 in training mode -> train only until 56 days before TEST_END

# Number of training sequences: every sequence needs DAYS_PER_SEQUENCE days of history
TRAIN_DUR = TRAIN_END - DAYS_PER_SEQUENCE

In [7]:
# Load the pickled base and calendar frames and join them side by side
def get_whole_data():
    """Read the BASE and CALENDAR pickles and concatenate them column-wise.

    Returns:
        A single DataFrame holding the columns of both frames; rows are aligned
        on the index because the concatenation runs along axis=1.
    """
    frames = [pd.read_pickle(path) for path in (BASE, CALENDAR)]
    return pd.concat(frames, axis=1)

In [8]:
# Return a df with all unique combinations of store_id and dept_id
def get_combinations(df_all_data):
    """List every distinct (store_id, dept_id) pair found in ``df_all_data``.

    Args:
        df_all_data: DataFrame with at least the columns 'store_id' and 'dept_id'.

    Returns:
        DataFrame with the two key columns, one row per pair in order of first
        appearance, re-indexed from 0.
    """
    key_columns = ['store_id', 'dept_id']
    unique_pairs = df_all_data[key_columns].drop_duplicates()
    return unique_pairs.reset_index(drop=True)

In [9]:
# Filter df down to only the current store_id and dept_id combination
def filter_df(df_combinations_store_dep, df_all_data, i):
    """Select the rows of one store/department pair and plan its batches.

    Args:
        df_combinations_store_dep: Frame of (store_id, dept_id) pairs, see get_combinations.
        df_all_data: Full data frame with 'store_id', 'dept_id' and 'id' columns.
        i: Index label of the pair to process in ``df_combinations_store_dep``.

    Returns:
        Tuple ``(filtered_df, ids, num_batches)``: the matching rows re-indexed
        from 0, the distinct item ids in order of first appearance, and the
        number of batches of at most MAX_BATCH_SIZE ids needed to cover them.
    """
    store_id = df_combinations_store_dep.loc[i, 'store_id']
    dept_id = df_combinations_store_dep.loc[i, 'dept_id']

    # Build the row mask once; it scans the complete data set
    in_group = (df_all_data['store_id'] == store_id) & (df_all_data['dept_id'] == dept_id)
    filtered_df = df_all_data[in_group].reset_index(drop=True)
    ids = filtered_df['id'].drop_duplicates().values

    # Calculate number of batches
    num_batches = int(np.ceil(len(ids) / MAX_BATCH_SIZE))

    return filtered_df, ids, num_batches

In [10]:
def filtered_df_batches(filtered_df, ids, num_batches, counter):
    """Cut out the rows of one id batch and derive the model dimensions for it.

    Args:
        filtered_df: Rows of a single store/department pair (needs an 'id' column).
        ids: Distinct item ids of that pair.
        num_batches: Total number of batches planned for the pair.
        counter: Zero-based number of the batch to extract.

    Returns:
        Tuple ``(filtered_df_batch, num_block_items, num_features, input_shape, ids_batch)``.
    """
    first = counter * MAX_BATCH_SIZE
    is_last_batch = not (counter < num_batches - 1)
    # The last batch takes whatever ids remain, all others take a full slice
    ids_batch = ids[first:] if is_last_batch else ids[first:(counter + 1) * MAX_BATCH_SIZE]

    # Keep only the rows that belong to the ids of this batch
    batch_rows = filtered_df['id'].isin(ids_batch)
    filtered_df_batch = filtered_df[batch_rows].reset_index(drop=True)

    # One block of repeated features per item plus the shared once-only features
    num_block_items = len(ids_batch)
    num_features = len(REPEATED_FEATURES) * num_block_items + len(ONCE_ONLY_FEATURES)

    # Shape of a single model input: (time steps, features)
    input_shape = (DAYS_PER_SEQUENCE, num_features)

    return filtered_df_batch, num_block_items, num_features, input_shape, ids_batch

In [11]:
# create a dataframe that stores only the first 5 items for each day
# indices = np.array([np.arange(start, start + num_block_items) for start in range(0, TEST_END * NUM_ITEMS, NUM_ITEMS)]).flatten()
# df_all_data = df_all_data.iloc[indices]
# df_all_data.reset_index(drop=True, inplace=True)

In [12]:
# Encode the categorical columns and scale the numerical columns and the target
def prepare_df(df_all_data):
    """Integer-encode categorical columns and min-max scale price and target.

    The frame passed in is modified in place and also returned. Both scalers are
    fitted on every row of the frame that is passed in.

    Args:
        df_all_data: DataFrame containing all columns listed below plus TARGET_COL.

    Returns:
        Tuple ``(df_all_data, scaler_target)``; the fitted target scaler is needed
        later to convert predictions back to the original scale.
    """
    numerical_cols = ['sell_price']
    categorical_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'is_available',
                        'd', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
                        'snap_CA', 'snap_TX', 'snap_WI']

    # Replace every categorical column by its integer category codes
    for column in categorical_cols:
        as_category = df_all_data[column].astype('category')
        df_all_data[column] = as_category.cat.codes

    # Scale the numerical feature columns
    scaler_numerical = MinMaxScaler()
    numeric_values = df_all_data[numerical_cols].astype(np.float32)
    df_all_data[numerical_cols] = scaler_numerical.fit_transform(numeric_values)

    # Scale the target with its own scaler so it can be inverted on its own
    scaler_target = MinMaxScaler()
    target_values = df_all_data[[TARGET_COL]].astype(np.float64)
    df_all_data[TARGET_COL] = scaler_target.fit_transform(target_values)

    return df_all_data, scaler_target

In [13]:
def train_test_split(df_all_data):
    """Split the prepared frame by day into train and validation/test parts.

    In 'training' mode the result is (train, validation, None); in 'production'
    mode it is (train, None, test). The held-out frame starts DAYS_PER_SEQUENCE
    days before its window so that the first target day has a full input sequence.

    Args:
        df_all_data: Prepared DataFrame with a numeric day column 'd'.

    Returns:
        Tuple ``(df_train, df_val, df_test)``; each frame is re-indexed from 0.

    Raises:
        ValueError: If STATUS is neither 'training' nor 'production'.
    """
    day = df_all_data['d']

    if STATUS == 'training':
        train_end, holdout_end = TRAIN_END, VAL_END
    elif STATUS == 'production':
        train_end, holdout_end = VAL_END, TEST_END
    else:
        raise ValueError(f"Unknown STATUS {STATUS!r}; expected 'training' or 'production'")

    df_train = df_all_data[day < train_end].reset_index(drop=True)
    # More than 28 days because of the time_steps shift
    in_holdout = (day >= train_end - DAYS_PER_SEQUENCE) & (day < holdout_end)
    df_holdout = df_all_data[in_holdout].reset_index(drop=True)

    if STATUS == 'training':
        return df_train, df_holdout, None
    return df_train, None, df_holdout

In [14]:
### Create x and y in one go without the generator version autogeneration ###
def create_x_y(df, num_block_items):
    """Build all sliding-window inputs and next-day targets of one frame.

    The frame must be ordered day by day with exactly ``num_block_items`` rows
    per day. Window ``i`` covers days ``i .. i + DAYS_PER_SEQUENCE - 1``; its
    target is TARGET_COL of every item on the day right after the window.

    Args:
        df: Prepared DataFrame containing ONCE_ONLY_FEATURES, REPEATED_FEATURES
            and TARGET_COL.
        num_block_items: Number of rows (items) per day.

    Returns:
        Tuple ``(x, y)`` of arrays with shapes
        ``(windows, DAYS_PER_SEQUENCE, len(ONCE_ONLY_FEATURES) + num_block_items * len(REPEATED_FEATURES))``
        and ``(windows, num_block_items)``. Both are empty 1-D arrays when the
        frame holds no complete window plus target day.
    """
    # Convert the needed columns once; doing it per window dominated the runtime
    once_all = df[ONCE_ONLY_FEATURES].to_numpy()
    repeated_all = df[REPEATED_FEATURES].to_numpy()
    target_all = df[[TARGET_COL]].to_numpy()

    length_days = len(df) // num_block_items
    rows_per_window = num_block_items * DAYS_PER_SEQUENCE
    repeated_width = num_block_items * len(REPEATED_FEATURES)

    x = []
    y = []
    for i in range(0, length_days - DAYS_PER_SEQUENCE):
        start_ind = i * num_block_items
        end_ind = start_ind + rows_per_window  # first row of the day to predict

        # Once-only features: taken from the first item row of every day in the window
        once_features = once_all[start_ind:end_ind:num_block_items]

        # Repeated features: all items of a day laid out side by side -> (days, items * features)
        per_day = repeated_all[start_ind:end_ind].reshape(DAYS_PER_SEQUENCE, repeated_width)

        # Combine once-only and repeated features
        x.append(np.concatenate((once_features, per_day), axis=1))

        # Targets: one value per item for the day after the window
        y.append(target_all[end_ind:end_ind + num_block_items].flatten())

    return np.array(x), np.array(y)

In [15]:
### Use for batch generation input to model ###
def lstm_data_generator(df, num_block_items):
    """Endlessly yield one (input window, next-day targets) pair at a time.

    The frame must be ordered day by day with exactly ``num_block_items`` rows
    per day. After the last window the generator starts over with the first one.

    Args:
        df: Prepared DataFrame containing ONCE_ONLY_FEATURES, REPEATED_FEATURES
            and TARGET_COL.
        num_block_items: Number of rows (items) per day.

    Yields:
        Tuple ``(batch_sequences, batch_targets)`` with shapes
        ``(1, DAYS_PER_SEQUENCE, features)`` and ``(num_block_items,)``.

    Raises:
        ValueError: On the first ``next()`` if the frame holds no complete window
            plus target day; the endless loop would otherwise spin without ever
            yielding.
    """
    length_days = len(df) // num_block_items
    num_sequences = length_days - DAYS_PER_SEQUENCE
    if num_sequences <= 0:
        raise ValueError(
            f'Need more than {DAYS_PER_SEQUENCE} days of data to build a sequence, got {length_days}'
        )

    # Convert the needed columns once instead of on every yielded window
    once_all = df[ONCE_ONLY_FEATURES].to_numpy()
    repeated_all = df[REPEATED_FEATURES].to_numpy()
    target_all = df[[TARGET_COL]].to_numpy()

    rows_per_window = num_block_items * DAYS_PER_SEQUENCE
    repeated_width = num_block_items * len(REPEATED_FEATURES)

    while True:
        for i in range(num_sequences):
            start_ind = i * num_block_items
            end_ind = start_ind + rows_per_window  # first row of the day to predict

            # Once-only features: taken from the first item row of every day in the window
            once_features = once_all[start_ind:end_ind:num_block_items]

            # Repeated features: all items of a day laid out side by side -> (days, items * features)
            per_day = repeated_all[start_ind:end_ind].reshape(DAYS_PER_SEQUENCE, repeated_width)

            # Combine both parts and add the leading batch dimension expected by the LSTM
            batch_sequences = np.concatenate((once_features, per_day), axis=1)
            batch_sequences = batch_sequences.reshape(1, DAYS_PER_SEQUENCE, -1)

            # Targets: one value per item for the day after the window
            batch_targets = target_all[end_ind:end_ind + num_block_items].flatten()

            yield batch_sequences, batch_targets

In [16]:
# Build the model input and label arrays for every data split
def get_x_and_y(df_train, df_val, df_test, num_block_items):
    """Turn the split frames into (x, y) arrays via create_x_y.

    Args:
        df_train: Training frame.
        df_val: Validation frame (used when STATUS is 'training').
        df_test: Test frame (used when STATUS is 'production').
        num_block_items: Number of rows (items) per day.

    Returns:
        Tuple ``(train_x, train_y, val_x, val_y, test_x, test_y)``; the pair that
        does not apply to the current STATUS is ``None, None``.
    """
    train_x, train_y = create_x_y(df_train, num_block_items)

    if STATUS == 'training':
        held_out = create_x_y(df_val, num_block_items)
        val_x, val_y = held_out
        test_x = test_y = None
    elif STATUS == 'production':
        held_out = create_x_y(df_test, num_block_items)
        test_x, test_y = held_out
        val_x = val_y = None

    return train_x, train_y, val_x, val_y, test_x, test_y

In [17]:
# Custom RMSE loss function
def rmse(y_true, y_pred):
    """Root mean squared error between targets and predictions, built from Keras backend ops."""
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

In [18]:
class ResetStatesCallback(Callback):
    """Keras callback that calls ``reset_states()`` on the model after every epoch.

    In this file it is only referenced from the commented-out generator-based
    ``model.fit`` call in ``model_training``.
    """
    def on_epoch_end(self, epoch, logs=None):
        # Epoch-end hook: reset the model's states before the next epoch starts
        self.model.reset_states()

In [19]:
def model_training(model, train_x, train_y, val_x, val_y, epochs):
    """Fit the model on the complete training arrays in a single ``fit`` call.

    Validation data is passed to ``fit`` only when STATUS is 'training'; in
    'production' the model is fitted on the training arrays alone.

    Args:
        model: Compiled Keras model.
        train_x, train_y: Training inputs and labels.
        val_x, val_y: Validation inputs and labels (ignored in 'production').
        epochs: Number of training epochs.

    Returns:
        Tuple ``(model, history)`` with the fitted model and the object returned by ``fit``.
    """
    # NOTE(review): no batch size is passed to fit here - confirm whether the
    # notebook-level `batch_size` setting was meant to be used.
    fit_kwargs = {'x': train_x, 'y': train_y, 'epochs': epochs}

    if STATUS == 'training':
        history = model.fit(validation_data=(val_x, val_y), **fit_kwargs)
    elif STATUS == 'production':
        history = model.fit(**fit_kwargs)

    return model, history

In [20]:
def eval(val_x, val_y, model, num_features, scaler_target):
    """Predict every validation window and tabulate prediction against actual value.

    Note: the function name shadows the built-in ``eval`` inside this notebook.

    Args:
        val_x: Input windows, one per day, each reshapeable to
            ``(1, DAYS_PER_SEQUENCE, num_features)``.
        val_y: Actual (scaled) targets per day, one value per item.
        model: Trained model offering ``predict``.
        num_features: Number of features per time step.
        scaler_target: Fitted target scaler offering ``inverse_transform``.

    Returns:
        DataFrame with one row per (day, item) and the columns 'day', 'prediction',
        'actual', 'difference', 'actual_inv', 'prediction_inv', 'difference_inv'.
        With no validation windows, an empty frame holding only the first three columns.
    """
    daily_frames = []
    for i in range(0, len(val_x)):
        window = val_x[i].reshape(1, DAYS_PER_SEQUENCE, num_features)
        prediction = model.predict(window, verbose=0).flatten()
        daily_frames.append(pd.DataFrame({'day': i, 'prediction': prediction, 'actual': val_y[i]}))

    if not daily_frames:
        return pd.DataFrame(columns=['day', 'prediction', 'actual'])

    # Concatenate once and derive the extra columns once, instead of on every iteration
    df_eval = pd.concat(daily_frames, axis=0, ignore_index=True)

    # Difference on the scaled values
    df_eval['difference'] = df_eval['actual'] - df_eval['prediction']

    # Back to original units; round before the int cast so that floating point
    # noise (e.g. 28.999999) is not truncated to the next lower integer
    df_eval['actual_inv'] = scaler_target.inverse_transform(df_eval[['actual']]).round(0).astype(int).ravel()
    df_eval['prediction_inv'] = scaler_target.inverse_transform(df_eval[['prediction']]).round(0).astype(int).ravel()

    # Difference in original units
    df_eval['difference_inv'] = df_eval['actual_inv'] - df_eval['prediction_inv']
    return df_eval

In [21]:
# Evaluation for generator batches
def test_eval(val_generator, model, scaler_target):
    x, y = next(val_generator)
    
    prediction_original = model.predict(x)

    true_array = scaler_target.inverse_transform(y).flatten()
    predicted_array = scaler_target.inverse_transform(prediction_original)[0]
    
    d = {"true_array": true_array, "predicted_array": predicted_array}
    df = pd.DataFrame(d)
    df['predicted_array_rounded'] = df['predicted_array'].round().astype(int)
    df['Difference'] = df['true_array'] - df['predicted_array']

    print(df)

In [22]:
################################### Function to forecast the next 28 days (This function for case all data in one batch) ###################################
def rolling_forecast(model, df_test, df_val, test_x, test_y, val_x, val_y, scaler_target, num_features, num_block_items):
    """Forecast TEST_DUR days one day at a time, feeding each prediction back in.

    For every day the model predicts all items from the current input window, the
    prediction is written into a copy of the held-out frame as that day's target
    value, and the input windows are rebuilt from the updated frame so that the
    next prediction sees the values just predicted.

    Args:
        model: Trained model offering ``predict``.
        df_test, test_x, test_y: Held-out frame and arrays used when STATUS is 'production'.
        df_val, val_x, val_y: Held-out frame and arrays used when STATUS is 'training'.
        scaler_target: Fitted target scaler used to undo the target scaling.
        num_features: Number of features per time step.
        num_block_items: Number of rows (items) per day.

    Returns:
        Integer array of non-negative predictions in original units, shaped like
        the y array of the held-out set (one row per day, one column per item).
    """
    # Set the df_copy, x_copy and y_copy to the correct dataset
    # Copies are used so the caller's frame and arrays are left untouched
    if STATUS=='production':
        df_copy = df_test.copy()
        x_copy = test_x.copy()
        y_copy = test_y.copy()    
    
    elif STATUS=='training':
        df_copy = df_val.copy()
        x_copy = val_x.copy()
        y_copy = val_y.copy()

    # Predict the next 28 days
    for i in range(TEST_DUR):
        prediction_normalized = model.predict(x_copy[i].reshape(1, DAYS_PER_SEQUENCE, num_features), verbose=0).flatten()
    
        # Impractical to adjust the prepared array, so we will update the df_test copy and use it to create a new array with the updated prediction values
        # Rows of the day being predicted: the frame starts with DAYS_PER_SEQUENCE days of history
        start_idx = DAYS_PER_SEQUENCE*num_block_items+(i*num_block_items)
        # .loc slices by label and includes the end label, hence the -1
        # (assumes the frame carries a 0..n-1 index, as produced by train_test_split)
        end_idx = start_idx + num_block_items - 1
        df_copy.loc[start_idx:end_idx, TARGET_COL] = prediction_normalized

        # Create new df for x and y
        # All windows are rebuilt so that later windows contain the prediction just written
        x_copy, _ = create_x_y(df_copy, num_block_items)

        # Update the y array with the new prediction
        y_copy[i] = prediction_normalized
    
    # Inverse transform the predictions
    predictions_original = scaler_target.inverse_transform(y_copy).round(0).astype(int)

    # Make sure no negative values are returned
    predictions_original[predictions_original < 0] = 0
        
    return predictions_original
#########################################################################################################

In [23]:
# Append the forecasts of one batch to the collected forecast frame
def prepare_fc_to_file(forecast_df, forecast_array, ids):
    """Convert one batch of forecasts to submission layout and append it.

    Args:
        forecast_df: Forecasts collected so far (columns 'id', 'F1', 'F2', ...).
        forecast_array: Array of shape (days, items) with the predictions.
        ids: Item ids of the batch, in the same order as the item axis.

    Returns:
        New DataFrame holding the rows of ``forecast_df`` followed by one row per
        id with the columns 'id' and 'F1' .. 'F<days>'.
    """
    # Transpose predictions to match the sample submission format: one row per item
    per_item = np.asarray(forecast_array).T

    # Name one column per forecast day instead of assuming a fixed horizon
    day_columns = [f'F{day + 1}' for day in range(per_item.shape[1])]
    forecast_tmp_df = pd.DataFrame(per_item, columns=day_columns)

    # Add the ids as a separate column so the numeric forecasts keep their dtype
    # (stacking ids and numbers into one array would coerce the numbers)
    forecast_tmp_df.insert(0, 'id', np.asarray(ids))

    # concatenate forecast to forecast_df
    return pd.concat([forecast_df, forecast_tmp_df], axis=0, ignore_index=True)

In [24]:
def write_to_csv(forecast_df, dir):
    """Write the submission file: the stored validation rows followed by the forecast.

    Args:
        forecast_df: Forecast rows in submission layout.
        dir: Path of the CSV file to write (the name shadows the built-in ``dir``).
    """
    # The validation part is read from its pickle and placed in front of the forecast
    validation_part = pd.read_pickle(VALIDATION_DATA)
    submission = pd.concat([validation_part, forecast_df], axis=0, ignore_index=True)

    # Save without the index column
    submission.to_csv(dir, index=False)

In [34]:
# Model parameters
epochs = 6 # Number of epochs every batch model is trained for
batch_size = 1 # NOTE(review): not passed to model.fit in model_training - confirm whether it should be
lr = 0.003 #lr = 0.0001
clipvalue = 0.5 # Gradient clipping value handed to Adam below

# Model compile parameters (read by create_lstm_model when compiling)
loss = rmse
optimizer = Adam(learning_rate=lr, clipvalue=clipvalue)
metrics = tf.keras.metrics.MeanAbsoluteError()

In [35]:
# New: Architecture to setup when predicting single day steps ahead and not using the repeat vector
def create_lstm_model(input_shape, num_block_items):
    """Build and compile the two-layer LSTM that predicts one day for all items of a batch.

    Args:
        input_shape: Shape of one input sample, ``(time steps, features)``.
        num_block_items: Number of items in the batch; the model emits one value per item.

    Returns:
        Compiled Keras model whose output has shape ``(batch, num_block_items, 1)``.
    """
    model = Sequential([
        LSTM(units=70, activation='tanh', return_sequences=True, recurrent_dropout=0.1, input_shape=input_shape),
        Dropout(0.1),
        # LSTM(units=10, activation='tanh', return_sequences=True, recurrent_dropout=0.1),
        # Dropout(0.1),
        LSTM(units=50, activation='tanh', return_sequences=False, recurrent_dropout=0.1),
        Dropout(0.1),
        Dense(units=num_block_items, activation='tanh'), # activation='relu', 'softmax; Final Dense layer for output
        Reshape((num_block_items, 1))]) # Reshape the output to be (number of items)

    # An optimizer keeps state for the variables of the model it trains, and this
    # function is called once per batch. Give every model a fresh optimizer with
    # the configured hyper-parameters instead of sharing the notebook-level instance.
    model_optimizer = type(optimizer).from_config(optimizer.get_config())

    model.compile(optimizer=model_optimizer, loss=loss, metrics=metrics)

    # For tracking purposes: check the models parameters
    # model.summary()

    return model

In [36]:
# for each store_id and dept_id call get whole data, filter for store_id and dept_id
def lstm_pipeline(verbose):
    """Train one model per store/department id batch and collect the forecasts.

    Args:
        verbose: Run mode. ``1`` processes only the first store/department pair
            and runs ``eval`` on each of its batches; ``0`` processes every pair,
            creates a rolling forecast per batch and writes the combined result to
            ``sub_dir + 'sample_submission.csv'``. Any other value trains every
            batch but neither evaluates nor forecasts.

    Returns:
        DataFrame with the collected forecasts (empty unless ``verbose == 0``).
    """
    df_all_data = get_whole_data()

    # Get all store_id and dept_id combinations
    df_combinations_store_dep = get_combinations(df_all_data)

    # Create empty dataframe to store the forecast
    forecast_df = pd.DataFrame(columns=['id'] + [f'F{i+1}' for i in range(28)])

    # define the number of loops
    num_loop = 1 if verbose == 1 else len(df_combinations_store_dep)

    # Loop over all store_id and dept_id combinations, create a model, train it, create the prediction and save it to a file
    for i in range(0, num_loop):
        print(f'Processing {i+1} of {len(df_combinations_store_dep)}: store_id {df_combinations_store_dep.loc[i, "store_id"]} and dept_id {df_combinations_store_dep.loc[i, "dept_id"]}')
        # Filter df down to only the current store_id and dept_id combination
        filtered_df, ids, num_batches = filter_df(df_combinations_store_dep, df_all_data, i)

        # Loop over all batches
        for counter in range(num_batches):
            print(f'Processing batch {counter+1} of {num_batches}')

            # Create batches for the current store_id and dept_id combination to avoid memory issues and curse of dimensionality
            filtered_df_batch, num_block_items, num_features, input_shape, ids_batch = filtered_df_batches(filtered_df, ids, num_batches, counter)

            # Prepare the data for training
            # (encoding and scaling are fitted per batch; the target scaler is kept for the inverse transform)
            filtered_df_batch, scaler_target = prepare_df(filtered_df_batch)

            # Split the data into train, validation and test set
            df_train, df_val, df_test = train_test_split(filtered_df_batch)

            # Create training, validation and test data arrays from the dataframes
            train_x, train_y, val_x, val_y, test_x, test_y = get_x_and_y(df_train, df_val, df_test, num_block_items)

            # Create the model
            # (a new model is built and trained from scratch for every batch)
            model = create_lstm_model(input_shape, num_block_items)

            # Train the model
            # NOTE(review): `history` stays local to this function and is not returned
            model_trained, history = model_training(model, train_x, train_y, val_x, val_y, epochs)

            # Testing the model
            if verbose == 1:
                # Call eval function to get the evaluation dataframe and some feeling for the results
                # NOTE(review): df_eval is neither returned nor displayed
                df_eval = eval(val_x, val_y, model_trained, num_features, scaler_target)

                # Test output for generator
                # test_data = test_eval(val_generator, model_trained, scaler_target)

            if verbose == 0:
                # Create the forecast
                predictions_original = rolling_forecast(model_trained, df_test, df_val, test_x, test_y, val_x, val_y, scaler_target, num_features, num_block_items)

                forecast_df = prepare_fc_to_file(forecast_df, predictions_original, ids_batch)
            print("####################################################\n")

    # The submission file is written once, after all batches have been processed
    if verbose == 0:
        write_to_csv(forecast_df, sub_dir + 'sample_submission.csv')

    return forecast_df

In [37]:
# Run the full pipeline: verbose=0 trains every store/department batch and writes the submission CSV
forecast_df = lstm_pipeline(verbose=0)

Processing 1 of 70: store_id CA_1 and dept_id HOBBIES_1
Processing batch 1 of 3
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
####################################################

Processing batch 2 of 3
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
####################################################

Processing batch 3 of 3
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
####################################################

Processing 2 of 70: store_id CA_1 and dept_id HOBBIES_2
Processing batch 1 of 1
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
####################################################

Processing 3 of 70: store_id CA_1 and dept_id HOUSEHOLD_1
Processing batch 1 of 3
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
####################################################

Processing batch 2 of 3
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
####################################################

Pr

In [38]:
# Test output: show the first 30 forecast rows
forecast_df.head(30)
# every 416th row, starting at row 4
#forecast_df.iloc[4::416,]
# how many rows with day = 0

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_evaluation,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,HOBBIES_1_002_CA_1_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_evaluation,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
4,HOBBIES_1_005_CA_1_evaluation,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
5,HOBBIES_1_006_CA_1_evaluation,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
6,HOBBIES_1_007_CA_1_evaluation,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,HOBBIES_1_008_CA_1_evaluation,9,9,9,9,9,9,9,9,9,...,9,9,9,9,9,9,9,9,9,9
8,HOBBIES_1_009_CA_1_evaluation,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
9,HOBBIES_1_010_CA_1_evaluation,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [None]:
# # For testing purposes: check how large on batch is
# # next train_generator
# x, y = next(train_generator)

# # size of memory in mb of x and y
# # print(train_x.nbytes / 1e6)
# # print(train_y.nbytes / 1e6)

# print(train_x.shape)
# print(train_y.shape)
# print(x.shape)
# print(y.shape)

In [None]:
# # Save the model to a specified directory
# if CODE_ENV=='local':
#     ###local###
#     model.save(src_dir + 'models/' + MODEL_NAME + '.h5')
    
# if CODE_ENV=='kaggle':
#     ###On Kaggle###
#     model.save('/kaggle/working/' + MODEL_NAME + '.h5')

# if CODE_ENV=='aws':
#     ###aws###
#     model.save(src_dir + 'models/' + MODEL_NAME + '.h5')

In [None]:
# Start from here if you want to load the model
# from keras.models import load_model

# # Load the model from a specified directory
# if CODE_ENV=='local':
#     ###local###
#     model = load_model(src_dir + 'models/' + MODEL_NAME + '.h5', custom_objects={'rmse': rmse})

# if CODE_ENV=='kaggle':
#     ###On Kaggle###
#     model = load_model('/kaggle/input/v1-model/' + MODEL_NAME + '.h5', custom_objects={'rmse': rmse})

# if CODE_ENV=='aws':
#     ###aws###
#     model = load_model(src_dir + 'models/' + MODEL_NAME + '.h5', custom_objects={'rmse': rmse})

In [None]:
import matplotlib.pyplot as plt

# `history` only exists if a training run has bound it at notebook level.
# NOTE(review): lstm_pipeline keeps its history local, so confirm where this name should come from.
try:
    loss_history = history.history
except NameError:
    print('No history to plot')
else:
    # Plot training & validation loss values
    plt.plot(loss_history['loss'], label='Train')
    # There is no validation curve when the model was fitted without validation data
    if 'val_loss' in loss_history:
        plt.plot(loss_history['val_loss'], label='Validation')
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(loc='upper left')
    plt.show()

In [None]:
# def prepare_forecast_input(df, DAYS_PER_SEQUENCE, num_items):
#     #df_test starts at 1942-7 which we need take into account
#     # Prepare input data for forecasting
#     forecast_input = []
#     for target_day in range(28):
#         start_idx = target_day * num_items
#         end_idx = start_idx + DAYS_PER_SEQUENCE * num_items
#         sequence = df.iloc[start_idx:end_idx].drop('sales_amount', axis=1).to_numpy()
#         forecast_input.append(sequence)
#     return np.array(forecast_input)


# Custom function for input to prepare forecasts input for model
# def prepare_forecast_input(df, target, model, DAYS_PER_SEQUENCE, num_items):
#     forecast_output = []
#     for target_day in range(28):
#         start_idx = target_day * num_items
#         end_idx = start_idx + DAYS_PER_SEQUENCE * num_items
#         sequence = df.iloc[start_idx:end_idx, : ].drop(target, axis=1).to_numpy()
#         # forecast_output.append(model.predict(sequence))
#         forecast_output.append(model.predict(sequence.reshape(1, sequence.shape[0], sequence.shape[1])))
#     return np.array(forecast_output)#.reshape(-1, 1)
# forecast_output = prepare_forecast_input(df_test, TARGET_COL, model, DAYS_PER_SEQUENCE, NUM_ITEMS)
#forecasts_original = scaler.inverse_transform(forecast_output)



In [None]:
# Assuming df_all_data contains all data up to day 1941
# forecast_input = prepare_forecast_input(df_test, DAYS_PER_SEQUENCE, NUM_ITEMS)

# Generate forecasts
# forecasts = model.predict(forecast_input)
# forecasts_original = scaler.inverse_transform(forecasts)

# forecasts_original now contains the predicted sales amounts for days 1942 to 1969


In [None]:
# Prepare input for forecasts
# I cannot use the custom lstm_data_generator
# Prepare 7 day slices each shifted by one day
def prepare_forecast_input(df, DAYS_PER_SEQUENCE, target_col):
    """Cut a day-ordered frame into overlapping input windows without the target column.

    The frame is expected to hold NUM_ITEMS rows per day. Window ``k`` covers the
    rows of ``DAYS_PER_SEQUENCE`` consecutive days starting at day ``k``.

    Args:
        df: Day-ordered DataFrame.
        DAYS_PER_SEQUENCE: Number of days per window (shadows the notebook constant).
        target_col: Column removed from every window.

    Returns:
        Array stacking all windows, each of shape
        ``(NUM_ITEMS * DAYS_PER_SEQUENCE, number of remaining columns)``.
    """
    total_days = len(df) // NUM_ITEMS
    day_limit = (len(df) - 1) // NUM_ITEMS
    rows_per_window = NUM_ITEMS * DAYS_PER_SEQUENCE

    windows = [
        df.iloc[day * NUM_ITEMS: day * NUM_ITEMS + rows_per_window, :].drop(target_col, axis=1).to_numpy()
        for day in range(total_days)
        # Only start days whose window ends before the day limit are kept
        if day + DAYS_PER_SEQUENCE < day_limit
    ]
    return np.array(windows)

# predict_array = prepare_forecast_input(df=df_test, DAYS_PER_SEQUENCE=DAYS_PER_SEQUENCE, target_col=TARGET_COL)

In [None]:
# Now, let's define a function to calculate WRMSSE by calculating the RMSSE for each series and then multiplying by the weights and summing them up. 
def calculate_weights(sales_data, last_n_days=28):
    """Compute each item's share of the total sales over the most recent days.

    Args:
        sales_data: DataFrame with columns ['item_id', 'day', 'sales'].
        last_n_days: Size of the window, counted back from the largest day.

    Returns:
        Series indexed by item_id holding item sales divided by total sales.
    """
    # Keep only rows later than (latest day - last_n_days)
    cutoff_day = sales_data['day'].max() - last_n_days
    recent_sales = sales_data.loc[sales_data['day'] > cutoff_day]

    # Sales per item within the window, then normalised by their sum
    sales_per_item = recent_sales.groupby('item_id')['sales'].sum()
    return sales_per_item / sales_per_item.sum()

def rmsse(y_true, y_pred, h, y_train):
    """Root mean squared scaled error of one series.

    Args:
        y_true: Actual values of the forecast horizon (array).
        y_pred: Predicted values of the forecast horizon (array).
        h: Forecast horizon used to average the squared forecast errors.
        y_train: Training values of the series; their day-to-day changes scale the error.

    Returns:
        Square root of (mean squared forecast error / mean squared one-step change of y_train).
    """
    forecast_mse = np.sum((y_true - y_pred) ** 2) / h
    # np.diff gives the change between consecutive training values
    step_changes = np.diff(y_train)
    scale = np.sum(step_changes ** 2) / (len(y_train) - 1)
    return np.sqrt(forecast_mse / scale)

def wrmsse(y_trues, y_preds, weights, h, y_trains):
    """Weighted sum of the per-series RMSSE values.

    Args:
        y_trues: Actual values, one array per series.
        y_preds: Predicted values, one array per series.
        weights: One weight per series, in the same order as the series.
        h: Forecast horizon passed on to rmsse.
        y_trains: Training values, one array per series.

    Returns:
        Sum over all series of weight * RMSSE.
    """
    per_series = []
    for series_true, series_pred, series_train in zip(y_trues, y_preds, y_trains):
        per_series.append(rmsse(series_true, series_pred, h, series_train))
    return np.sum(np.array(weights) * np.array(per_series))

In [None]:
# Evaluate the model on the test set
def evaluate_model_wrmsse(model, df_test, df_train, df_val, batch_size, DAYS_PER_SEQUENCE, n):
    """Score the model on the test frame with WRMSSE and print the result.

    NOTE(review): this function cannot run as written; its only call in the
    notebook is commented out. Problems visible in this file:
      * ``target_col``, ``scaler`` and ``sales_data`` are neither parameters nor
        defined in the visible part of the notebook.
      * ``lstm_data_generator`` is called with four arguments, but is defined
        above as ``lstm_data_generator(df, num_block_items)``.
      * the three check ``print`` calls concatenate a str with an int.
      * the second ``wrmsse`` call passes ``y_train=``, while ``wrmsse`` names
        that parameter ``y_trains``.

    Args:
        model: Trained model offering ``predict``.
        df_test: Test frame.
        df_train: Training frame.
        df_val: Validation frame.
        batch_size: Used together with ``n`` to derive the number of prediction steps.
        DAYS_PER_SEQUENCE: Sequence length (shadows the notebook constant).
        n: Divisor used when deriving the number of prediction steps.

    Returns:
        None; the score is printed.
    """
    test_gen = lstm_data_generator(df_test, target_col, DAYS_PER_SEQUENCE, batch_size)
    steps = max(1, len(df_test) // (batch_size * n))  # Ensure at least 1 step
    y_pred_normalized = model.predict(test_gen, steps=steps)
    y_pred_original = scaler.inverse_transform(y_pred_normalized)
    y_true_normalized = df_test[target_col].values
    y_true_original = scaler.inverse_transform(y_true_normalized)
    
    #First concatenate all elements used for training (df_train and df_val)
    y_train_all_normalized = pd.concat([df_train[target_col], df_val[target_col]], axis=0).values
    y_train_all_original = scaler.inverse_transform(y_train_all_normalized)
    
    # Reshape the predictions and actuals to separate each item's time series
    # (every NUM_ITEMS-th element, starting at offset i, is taken as series i)
    y_pred_series = [y_pred_original[i::NUM_ITEMS] for i in range(NUM_ITEMS)]
    y_true_series = [y_true_original[i::NUM_ITEMS] for i in range(NUM_ITEMS)]

    # Similarly reshape the training data for RMSSE calculation
    y_train_all_series = [y_train_all_original[i::NUM_ITEMS] for i in range(NUM_ITEMS)]

    # Check - can be deleted later on
    print('len y_pred_series: ' + len(y_pred_series))
    print('len y_true_series: ' + len(y_true_series))
    print('len y_train_all_series: ' + len(y_train_all_series))
    
    # Calculate WRMSSE
    weights = calculate_weights(sales_data)
    wrmsse_score = wrmsse(y_trues=y_true_series, y_preds=y_pred_series, weights=weights, h=28, y_trains=y_train_all_series)

    print("Test WRMSSE: ", wrmsse_score)
    
    
    
    
    
    
    
    
    
    # Calculate wrmsse score
    # NOTE(review): second scoring attempt on the un-split arrays; it repeats the
    # calculation above and uses a keyword that wrmsse does not accept.
    wrmsse_score = wrmsse(
        y_trues=y_true_original,
        y_preds=y_pred_original,
        weights=calculate_weights(sales_data),
        h=28, # forecast horizon
        y_train=y_train_all_original
    )
    print("Test WRMSSE: ", wrmsse_score)

In [None]:
# Call the evaluate function
# evaluate_model_wrmsse(model, df_test, df_train, df_val, batch_size, DAYS_PER_SEQUENCE, VAL_END)