In [None]:
# Settings to adjust before each run:
MODEL_NAME = 'V2'  # Base file name used when the model is saved in the 'local' environment
CODE_ENV = 'local' # One of 'kaggle', 'aws', 'local'; selects the directory layout in the directories cell
TEST_END  = 1941 # Exclusive upper bound on the encoded day column 'd' for the test set (alternative: 1969)

In [None]:
#Import data handling libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Input, LSTM, Dense, Masking, RepeatVector
from keras.optimizers import Adam
from keras.metrics import RootMeanSquaredError
from keras import backend as K
from keras.callbacks import Callback
import tensorflow as tf

In [None]:
# Report how many GPUs TensorFlow can see and whether this build was compiled with CUDA
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
print(tf.test.is_built_with_cuda())

In [None]:
# Specify directories for the selected CODE_ENV.
#   res_dir: raw resources
#   prc_dir: processed data directory with pickled dataframes
#   src_dir: project source folder (only defined for 'local' and 'aws', where models are stored)
if CODE_ENV == 'local':
    ###local###
    # Parent folder of the project on the local machine
    parent_dir = '/Users/mf/Desktop/CS/Studies/7_Final_Project/Kaggle_M5PointPrediction'
    res_dir = parent_dir + '/res/'
    src_dir = parent_dir + '/src/'
    prc_dir = src_dir + 'processed_data/'
elif CODE_ENV == 'kaggle':
    ###On Kaggle###
    res_dir = '/kaggle/input/m5-forecasting-accuracy/'
    prc_dir = '/kaggle/input/processed-data-v3/'
elif CODE_ENV == 'aws':
    ###aws###
    parent_dir = '/home/ubuntu/projects/Kaggle_M5PointPrediction'
    res_dir = parent_dir + '/res/'
    src_dir = parent_dir + '/src/'
    prc_dir = src_dir + 'processed_data/'
else:
    # Fail here instead of with a NameError on prc_dir in a later cell
    raise ValueError("Unknown CODE_ENV %r; expected 'local', 'kaggle' or 'aws'" % (CODE_ENV,))

In [None]:
# File locations of the two pickled dataframes that are concatenated into the full dataset
BASE      = prc_dir +'df_1.pkl'
CALENDAR  = prc_dir +'df_2.pkl'
NUM_ITEMS = 30490 # Number of rows (items) per day; the generator uses it to locate each day's rows
# time_steps is also used to prepend history days to the validation and test sets
time_steps = 7  # Number of days per sequence

In [None]:
# Load both pickled frames and join them column-wise into a single dataframe
df_all_data = pd.concat(
    [pd.read_pickle(pickle_path) for pickle_path in (BASE, CALENDAR)],
    axis=1,
)

In [None]:
# Define categorical and numerical columns
categorical_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'is_available',
                    'd', 'wday', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2', 
                    'snap_CA', 'snap_TX', 'snap_WI', 'mday', 'week', 'month', 'year']
numerical_cols = ['sell_price']

target_col = 'sales_amount'

# Convert categorical columns to category dtype and encode with cat.codes (0-based integers)
# NOTE(review): the later day-based split assumes the codes of 'd' are in chronological order,
# which presumably requires 'd' to be numeric before encoding - confirm.
for col in categorical_cols:
    df_all_data[col] = df_all_data[col].astype('category').cat.codes

# Normalize the numerical feature columns with their own scaler, so that this fit is not
# overwritten when the target is scaled below.
feature_scaler = MinMaxScaler()
df_all_data[numerical_cols] = feature_scaler.fit_transform(df_all_data[numerical_cols].astype(np.float32))

# `scaler` is fitted on the target only; later cells use it to map predictions back to sales units.
scaler = MinMaxScaler()
df_all_data[target_col] = scaler.fit_transform(df_all_data[[target_col]].astype(np.float32))

In [None]:
# Split the data into train, validation and test sets using the encoded day column 'd'.
# The day codes are 0-based, so every upper bound below is exclusive.
# Length in days of the validation and test periods
VAL_DUR, TEST_DUR = 28, 28

# Exclusive end day of each set, counted back from TEST_END
VAL_END = TEST_END - TEST_DUR
TRAIN_END = VAL_END - VAL_DUR  # e.g. 1885: training stops before the validation period begins

# The training set starts at day code 0, so its length in days equals its end day
TRAIN_DUR = TRAIN_END

# Validation and test sets start time_steps days early (35 days each), so that their
# first target day already has a full history window.
day_codes = df_all_data['d']
train_mask = day_codes < TRAIN_END
val_mask = (day_codes >= TRAIN_END - time_steps) & (day_codes < VAL_END)
test_mask = (day_codes >= VAL_END - time_steps) & (day_codes < TEST_END)

df_train = df_all_data.loc[train_mask].reset_index(drop=True)
df_val = df_all_data.loc[val_mask].reset_index(drop=True)
df_test = df_all_data.loc[test_mask].reset_index(drop=True)

# Free the full frame and the helper objects; the data now lives in df_train, df_val and df_test
del df_all_data, day_codes, train_mask, val_mask, test_mask

In [None]:
# Version 1: 
# x input: 7 days without sales_amount
# y labels: only 8th day sales_amount

# Custom Generator Function
# def lstm_data_generator(df, target, days_per_sequence=7, batch_size=32):
#     total_sequences = (len(df) - NUM_ITEMS * days_per_sequence) // NUM_ITEMS # 1878 for train, 21 for val and test; (1941*30490-7*30490)
#     while True: 
#         for i in range(0, total_sequences, batch_size): # 0, 32, 64, ...1878
#             batch_sequences = []
#             batch_targets = []
#             for b in range(batch_size): # 0, 1, 2,... 31
#                 if i + b < total_sequences: # 0, 0; 0, 1; 0, 2; ...; 0, 32; 32, 0; 32, 1; ...
#                     start_idx = (i + b) * NUM_ITEMS
#                     end_idx = start_idx + NUM_ITEMS * days_per_sequence
#                     batch_sequences.append(df.iloc[start_idx:end_idx, :].drop(target, axis=1).to_numpy()) #drop target column, Only the values in the DataFrame will be returned, the axes labels will be removed.
#                     batch_targets.append(df.iloc[end_idx:end_idx + NUM_ITEMS][target].to_numpy())
#             yield np.array(batch_sequences), np.array(batch_targets)



# To-Do
# -For the current generator: create a prediction and submit it to Kaggle. It does not matter whether the shape fits, as long as the shape is identical to the one used during training
#    - 7 days of x values and the 8th day's y labels
#    - 8th day's x values and the 8th day's y labels
# -Test whether I can build a generator that outputs 30,490 as the sequence, i.e. yields (30490,7,20) as the input shape; I think total_sequences can even stay as it is


# Version 2:
# This generator creates:
# Number of batches: 1878 (for df_train); df_val and df_test have 28 batches
# X: (30490, 7, 20) --> 7 days without sales_amount
# Y: (30490, 1)     --> only 8th day sales_amount

def lstm_data_generator(df, target, days_per_sequence=7, batch_size=30490, num_items=None):
    """Endlessly yield one (X, y) batch per day for one-step-ahead training.

    The frame must be ordered day by day with exactly ``num_items`` rows per day and the
    items in the same order on every day; row ``d * num_items + k`` is item ``k`` on day ``d``.

    Args:
        df: dataframe holding the feature columns and the ``target`` column.
        target: name of the column to predict; it is excluded from X.
        days_per_sequence: number of consecutive history days in each input sequence.
        batch_size: number of items per batch (the first ``batch_size`` items of each day).
        num_items: rows per day; defaults to the notebook constant NUM_ITEMS.

    Yields:
        batch_sequences: float array (batch_size, days_per_sequence, n_features).
        batch_targets: float array (batch_size,) with each item's target value on the day
            directly after its sequence.

    Raises:
        ValueError: if ``batch_size`` is not in 1..num_items, or the frame holds too few
            days to form a single sequence plus target day.
    """
    if num_items is None:
        num_items = NUM_ITEMS
    if not 0 < batch_size <= num_items:
        raise ValueError("batch_size must be between 1 and num_items (%d), got %d"
                         % (num_items, batch_size))

    length_days = len(df) // num_items
    # Every window needs days_per_sequence history days plus one following target day
    n_windows = length_days - days_per_sequence
    if n_windows <= 0:
        raise ValueError("need more than %d days of data, got %d"
                         % (days_per_sequence, length_days))

    n_features = df.shape[1] - 1  # minus 1 for the target column
    target_values = df[target]

    while True:
        for i in range(n_windows):
            first_row = i * num_items
            # First row of the day that follows the window, i.e. the day to predict
            target_row = first_row + days_per_sequence * num_items

            # Rows of the window are day-major: reshape to (days, items, features) and
            # swap the first two axes to get one sequence per item.
            window = df.iloc[first_row:target_row].drop(target, axis=1)
            by_day = window.to_numpy(dtype=np.float64).reshape(days_per_sequence, num_items, n_features)
            batch_sequences = np.ascontiguousarray(by_day[:, :batch_size, :].transpose(1, 0, 2))

            batch_targets = target_values.iloc[target_row:target_row + batch_size].to_numpy(dtype=np.float64)

            yield batch_sequences, batch_targets








# # Function to create sequences
# def create_sequences(dataframe, window_size=7):
#     X, Y = [], []
#     for i in range(len(dataframe) - window_size):
#         # Extract 7 days of data with all 20 features
#         x_sequence = dataframe.iloc[i:i+window_size, :].values

#         # Extract the 19 known features for the 8th day
#         x_8th_day_known = dataframe.iloc[i + window_size, :-1].values.reshape(1, -1)  # Excluding sales amount

#         # Add the new binary feature indicating the prediction day
#         prediction_day_indicator = np.zeros((window_size, 1))
#         prediction_day_indicator_8th_day = np.array([[1]])  # 1 for the 8th day
#         x_sequence = np.hstack((x_sequence, prediction_day_indicator))
#         x_8th_day_with_indicator = np.hstack((x_8th_day_known, prediction_day_indicator_8th_day))

#         # Concatenate 7 days of data with the known features of the 8th day
#         x_sequence_with_8th_day = np.concatenate([x_sequence, x_8th_day_with_indicator], axis=0)

#         # The target is the sales amount for the 8th day
#         y_value = dataframe.iloc[i + window_size, -1]  # Sales amount for the 8th day

#         X.append(x_sequence_with_8th_day)
#         Y.append(y_value)
#     return np.array(X), np.array(Y)

# # Create sequences
# X, Y = create_sequences(dataframe)

# # Convert to tf.data.Dataset for training
# dataset = tf.data.Dataset.from_tensor_slices((X, Y))
# dataset = dataset.batch(batch_size)  # Define your batch_size















# def lstm_data_generator(df, target, days_per_sequence=7, batch_size=32):
#     total_sequences = (len(df) - NUM_ITEMS * (days_per_sequence + 1)) // NUM_ITEMS
#     while True:
#         for i in range(0, total_sequences, batch_size):
#             batch_sequences = []
#             batch_targets = []

#             for b in range(batch_size):
#                 if i + b < total_sequences:
#                     start_idx = (i + b) * NUM_ITEMS
#                     end_idx = start_idx + NUM_ITEMS * days_per_sequence
#                     next_day_idx = end_idx + NUM_ITEMS

#                     # Sequence data with target for past time_step days
#                     sequence_data = df.iloc[start_idx:end_idx, :].copy()
                    
#                     # Adding is_current_day feature to let the model distinguish between the past data and current to predicting day
#                     # create a new column 'is_current_day' and fill it with 0s and set datatype to int8
#                     sequence_data['is_current_day'] = 0
#                     sequence_data['is_current_day'] = sequence_data['is_current_day'].astype(np.int8)

#                     # Data for the current to predicting day without target, because in real life we don't have it
#                     sixth_day_data = df.iloc[end_idx:next_day_idx, :].copy()
#                     #fill column 'sales_amount' with NaNs
#                     sixth_day_data[target] = np.nan
#                     # Give model info that this is the current day
#                     sixth_day_data['is_current_day'] = 1
#                     sixth_day_data['is_current_day'] = sixth_day_data['is_current_day'].astype(np.int8)

#                     # Combine data
#                     sequence_with_sixth_day = pd.concat([sequence_data, sixth_day_data], axis=0)

#                     # Append to batch
#                     batch_sequences.append(sequence_with_sixth_day.to_numpy())

#                     # Target for the 6th day
#                     batch_targets.append(df.iloc[end_idx:next_day_idx, :][target].to_numpy())

#             yield np.array(batch_sequences), np.array(batch_targets)

In [None]:
# Model parameters
batch_size = 30490  # Size of each batch; same value as NUM_ITEMS, i.e. one sample per item
epochs = 2
num_cols = df_train.shape[1]  # Number of columns in df_train, target column included

In [None]:
# Train and validation generators; both loop over their frame endlessly,
# so the number of batches per epoch is set in the fit call
train_generator = lstm_data_generator(df_train, target_col, time_steps, batch_size)
val_generator = lstm_data_generator(df_val, target_col, time_steps, batch_size)

In [None]:
# For testing purposes: check how large one batch is
# next train_generator
# x, y = next(train_generator)
# size of memory in mb of x and y
# print(x.nbytes / 1e6)
# print(y.nbytes / 1e6)

In [None]:
# Custom RMSE loss function
def rmse(y_true, y_pred):
    """Root mean squared error of ``y_pred`` against ``y_true``, built from Keras backend ops."""
    squared_error = K.square(y_pred - y_true)
    mean_squared_error = K.mean(squared_error)
    return K.sqrt(mean_squared_error)

In [None]:
# This is a sequence-to-sequence model: errors can propagate through the sequence
# model = Sequential()

# model.add(LSTM(units=30,
#                activation='tanh', #relu
#                return_sequences=False,
#                stateful=True))

# model.add(RepeatVector(28))

# model.add(LSTM(units=30, 
#                activation='tanh', 
#                return_sequences=True, 
#                stateful=True))

# model.add(Dense(units=1))

# model.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
# Architecture for predicting a single day ahead (no RepeatVector): every sample is one
# item's sequence of `time_steps` days and the label is that item's next-day target value,
# matching what lstm_data_generator yields.
model = Sequential()

# A stateful LSTM needs a fixed batch size, so the full batch shape is declared up front.
# num_cols counts the target column as well, which is not part of the input features.
model.add(Input(batch_shape=(batch_size, time_steps, num_cols - 1)))

# LSTM layer
model.add(LSTM(units=50, 
               activation='tanh',
               return_sequences=False,
               stateful=True))

# One output per sample: the (scaled) target value for the day after the sequence.
# The previous Dense(NUM_ITEMS * TEST_DUR) + Reshape head could not match the generator's
# single target per sample, and Reshape was never imported.
model.add(Dense(units=1))

model.compile(optimizer='adam', 
              loss='mse', # rmse
              metrics=[RootMeanSquaredError()])

In [None]:
# For tracking purposes: check the model's parameters
#model.summary()

# Print the input shape of every layer in the model
# NOTE(review): input_shape presumably requires the model to be built already - confirm for the installed Keras version
for layer in model.layers:
    print(layer.input_shape)

In [None]:
class ResetStatesCallback(Callback):
    """Keras callback that resets the model's states at the end of every epoch."""

    def on_epoch_end(self, epoch, logs=None):
        # The model attribute is read from the callback instance itself
        trained_model = self.model
        trained_model.reset_states()

In [None]:
# Training the model.
# One generator pass yields one batch per day that has `time_steps` days of history in
# front of it, so the steps are derived from the number of days in each frame. With more
# steps an epoch would run into the generator's next pass.
train_steps = len(df_train) // NUM_ITEMS - time_steps
val_steps = len(df_val) // NUM_ITEMS - time_steps

history = model.fit(x=train_generator,
          steps_per_epoch=train_steps,  # total number of sequences in the training set
          validation_data=val_generator,
          validation_steps=val_steps,  # total number of sequences in the validation set
          epochs=epochs,
          callbacks=[ResetStatesCallback()])

In [None]:
# The training and validation frames are no longer needed; drop both names
del df_train, df_val

In [None]:
# Save the model under MODEL_NAME in every environment, so that the file name
# follows the run settings instead of a name hard-coded per branch
if CODE_ENV in ('local', 'aws'):
    ###local / aws### both keep models inside the project source folder
    model.save(src_dir + 'models/' + MODEL_NAME + '.h5')
elif CODE_ENV == 'kaggle':
    ###On Kaggle###
    model.save('/kaggle/working/' + MODEL_NAME + '.h5')

In [None]:
# Start from here if you want to load the model
from keras.models import load_model

# Load the model from a specified directory.
# 'rmse' is passed as a custom object so that a model compiled with the custom loss can be restored.
if CODE_ENV in ('local', 'aws'):
    ###local / aws### (the aws branch used to call model.save here instead of loading)
    model = load_model(src_dir + 'models/' + MODEL_NAME + '.h5', custom_objects={'rmse': rmse})
elif CODE_ENV == 'kaggle':
    ###On Kaggle###
    model = load_model('/kaggle/input/v1-model/' + MODEL_NAME + '.h5', custom_objects={'rmse': rmse})

In [None]:
import matplotlib.pyplot as plt
try:
    # `history` only exists when the training cell was run in this session
    # (not when the model was loaded from disk)
    loss_history = history.history
except NameError:
    print('No history to plot')
else:
    # Plot training & validation loss values
    plt.plot(loss_history['loss'])
    plt.plot(loss_history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()

In [None]:
# def prepare_forecast_input(df, time_steps, num_items):
#     #df_test starts at 1942-7 which we need take into account
#     # Prepare input data for forecasting
#     forecast_input = []
#     for target_day in range(28):
#         start_idx = target_day * num_items
#         end_idx = start_idx + time_steps * num_items
#         sequence = df.iloc[start_idx:end_idx].drop('sales_amount', axis=1).to_numpy()
#         forecast_input.append(sequence)
#     return np.array(forecast_input)


# Custom function for input to prepare forecasts input for model
# def prepare_forecast_input(df, target, model, time_steps, num_items):
#     forecast_output = []
#     for target_day in range(28):
#         start_idx = target_day * num_items
#         end_idx = start_idx + time_steps * num_items
#         sequence = df.iloc[start_idx:end_idx, : ].drop(target, axis=1).to_numpy()
#         # forecast_output.append(model.predict(sequence))
#         forecast_output.append(model.predict(sequence.reshape(1, sequence.shape[0], sequence.shape[1])))
#     return np.array(forecast_output)#.reshape(-1, 1)
# forecast_output = prepare_forecast_input(df_test, target_col, model, time_steps, NUM_ITEMS)
#forecasts_original = scaler.inverse_transform(forecast_output)



In [None]:
# Assuming df_all_data contains all data up to day 1941
# forecast_input = prepare_forecast_input(df_test, time_steps, NUM_ITEMS)

# Generate forecasts
# forecasts = model.predict(forecast_input)
# forecasts_original = scaler.inverse_transform(forecasts)

# forecasts_original now contains the predicted sales amounts for days 1942 to 1969


In [None]:
# Prepare input for forecasts
# I cannot use the custom lstm_data_generator
# Prepare 7 day slices each shifted by one day
def prepare_forecast_input(df, time_steps, target_col, num_items=None):
    """Build one input window per forecast day, each shifted by one day.

    The frame must be ordered day by day with ``num_items`` rows per day. Window ``i``
    holds all rows of days ``i .. i + time_steps - 1`` without the target column. A window
    is created for every day that is followed by at least one more day in ``df``, so a
    frame of ``time_steps + 28`` days yields 28 windows.

    Args:
        df: dataframe with the feature columns and the target column.
        time_steps: number of consecutive days per window.
        target_col: name of the target column, which is dropped from the windows.
        num_items: rows per day; defaults to the notebook constant NUM_ITEMS.

    Returns:
        Array of shape (n_windows, time_steps * num_items, n_features); an empty array
        when ``df`` holds no more than ``time_steps`` days.
        NOTE(review): the rows of a window are stacked day after day here, whereas
        lstm_data_generator yields (items, days, features) - confirm which layout the
        model expects at prediction time.
    """
    if num_items is None:
        num_items = NUM_ITEMS

    n_days = len(df) // num_items
    forecast_input = []
    # range() is empty when there are too few days, which keeps the empty-array result
    for i in range(n_days - time_steps):
        start_idx = i * num_items
        end_idx = start_idx + num_items * time_steps
        window = df.iloc[start_idx:end_idx, :].drop(target_col, axis=1)
        forecast_input.append(window.to_numpy())
    return np.array(forecast_input)

# Build the forecast windows from df_test, which starts with time_steps days of history
predict_array = prepare_forecast_input(df=df_test, time_steps=time_steps, target_col=target_col)

In [None]:
# Inspect the forecast input: (windows, time_steps * NUM_ITEMS rows, feature columns)
predict_array.shape

In [None]:
# Run the model on the forecast windows; the target was MinMax-scaled before training,
# so the outputs are presumably on that scaled range as well
forecast_normalized = model.predict(predict_array)

In [None]:
# Undo the scaling; `scaler` was last fitted on the target column
# NOTE(review): inverse_transform presumably needs a 2-D array with one column - confirm the prediction shape
forecasts_original = scaler.inverse_transform(forecast_normalized)

In [None]:
# Inspect the shape of the rescaled forecasts
forecasts_original.shape

In [None]:
# Now, let's define a function to calculate WRMSSE by calculating the RMSSE for each series and then multiplying by the weights and summing them up. 
def calculate_weights(sales_data, last_n_days=28):
    """Return each item's share of the total sales over the most recent days.

    Args:
        sales_data: DataFrame with columns ['item_id', 'day', 'sales'].
        last_n_days: only rows whose day is greater than ``max(day) - last_n_days`` count.

    Returns:
        Series indexed by item_id holding item sales divided by the sales of all items.
    """
    cutoff_day = sales_data['day'].max() - last_n_days
    recent_sales = sales_data.loc[sales_data['day'] > cutoff_day]
    # Sales per item within the recent period
    sales_per_item = recent_sales.groupby('item_id')['sales'].sum()
    return sales_per_item / sales_per_item.sum()

def rmsse(y_true, y_pred, h, y_train):
    """Root mean squared scaled error of a single series.

    The squared forecast error summed and divided by the horizon ``h`` is scaled by the
    mean squared change between consecutive values of the training series ``y_train``.
    """
    forecast_error = y_true - y_pred
    mean_squared_error = np.sum(forecast_error ** 2) / h
    # np.diff gives the differences between consecutive training values
    step_changes = np.diff(y_train)
    naive_scale = np.sum(step_changes ** 2) / (len(y_train) - 1)
    return np.sqrt(mean_squared_error / naive_scale)

def wrmsse(y_trues, y_preds, weights, h, y_trains):
    """Weighted sum of the per-series RMSSE values.

    ``y_trues``, ``y_preds`` and ``y_trains`` are walked in parallel, one entry per series;
    ``weights`` supplies one weight per series in the same order.
    """
    rmsse_values = []
    for actual, forecast, train_history in zip(y_trues, y_preds, y_trains):
        rmsse_values.append(rmsse(actual, forecast, h, train_history))
    weighted_scores = np.array(weights) * np.array(rmsse_values)
    return np.sum(weighted_scores)

In [None]:
# Evaluate the model on the test set
def evaluate_model_wrmsse(model, df_test, df_train, df_val, batch_size, time_steps, n):
    """Compute, print and return the WRMSSE of ``model`` on the test period.

    All frames are expected in the layout built by the split cell: ordered day by day with
    NUM_ITEMS rows per day, and df_val / df_test each starting with ``time_steps`` days of
    history that overlap the end of the preceding set. The notebook deletes df_train and
    df_val after training, so they must still exist (or be rebuilt) when this is called.

    Args:
        model: fitted model; ``predict`` must return one scaled value per item and day.
        df_test: test frame (history days followed by the days to score).
        df_train: training frame; supplies the RMSSE scale and the series weights.
        df_val: validation frame; its non-overlapping days are appended to the training series.
        batch_size: number of items per generator batch (first ``batch_size`` items of a day).
        time_steps: number of history days per input sequence.
        n: unused; kept so that existing calls keep working.

    Returns:
        The WRMSSE score.

    Raises:
        ValueError: if df_test holds no day beyond the ``time_steps`` history days.
    """
    weight_days = 28  # series weights are based on the last 28 training days
    history_rows = time_steps * NUM_ITEMS

    # Number of scored days: every test day that has a full history window in front of it
    horizon = len(df_test) // NUM_ITEMS - time_steps
    if horizon <= 0:
        raise ValueError("df_test must contain more than time_steps days")

    # The generator yields one batch per scored day, in day order
    test_gen = lstm_data_generator(df_test, target_col, time_steps, batch_size)
    y_pred_normalized = model.predict(test_gen, steps=horizon)
    # The scaler works on 2-D single-column input; afterwards rows are days, columns are items
    y_pred_original = scaler.inverse_transform(
        np.asarray(y_pred_normalized).reshape(-1, 1)).reshape(horizon, batch_size)

    # True values of the scored days only (the leading history days are skipped)
    y_true_normalized = df_test[target_col].to_numpy()[history_rows:history_rows + horizon * NUM_ITEMS]
    y_true_original = scaler.inverse_transform(
        y_true_normalized.reshape(-1, 1)).reshape(horizon, NUM_ITEMS)[:, :batch_size]

    # Training history: df_train plus the validation days that do not overlap it
    y_train_all_normalized = np.concatenate([
        df_train[target_col].to_numpy(),
        df_val[target_col].to_numpy()[history_rows:],
    ])
    y_train_all_original = scaler.inverse_transform(
        y_train_all_normalized.reshape(-1, 1)).reshape(-1, NUM_ITEMS)[:, :batch_size]

    # One time series per item (transpose: items become the first axis)
    y_pred_series = list(y_pred_original.T)
    y_true_series = list(y_true_original.T)
    y_train_all_series = list(y_train_all_original.T)

    # Check - can be deleted later on
    print('len y_pred_series: ' + str(len(y_pred_series)))
    print('len y_true_series: ' + str(len(y_true_series)))
    print('len y_train_all_series: ' + str(len(y_train_all_series)))

    # Weights: each series' share of the unit sales in the last training days. The series
    # position within a day serves as the id, so the weights come back in series order.
    recent_sales = y_train_all_original[-weight_days:]
    n_recent_days, n_series = recent_sales.shape
    sales_data = pd.DataFrame({
        'item_id': np.tile(np.arange(n_series), n_recent_days),
        'day': np.repeat(np.arange(n_recent_days), n_series),
        'sales': recent_sales.ravel(),
    })
    weights = calculate_weights(sales_data, last_n_days=weight_days)

    # Calculate WRMSSE
    wrmsse_score = wrmsse(y_trues=y_true_series, y_preds=y_pred_series, weights=weights,
                          h=horizon, y_trains=y_train_all_series)

    print("Test WRMSSE: ", wrmsse_score)
    return wrmsse_score

In [None]:
# Call the evaluate function
# evaluate_model_wrmsse(model, df_test, df_train, df_val, batch_size, time_steps, VAL_END)