# Goals
1. predict for every store individually
- make the target stationary by differencing, yes/no?
- z score normalization on train data
- predict next 16 values directly vs recursively?
2. predict store individually but with every pair/family as parameter
- needs zscore normalization
- stationary target yes/no?
3. predict all store/family pairs simultaneously
- zscore? maybe not needed
- stationary?

features:
1. time features:
- linear timestamp
- sin/cos of year, check for (week/month) if pattern present
- encoding of weekday, maybe also month
2. oil/holidays/location should be ok


In [None]:
import tensorflow as tf
tf.random.set_seed(42)
import numpy as np
np.random.seed(42)

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime
import plotly.offline as pyo
from plotly import subplots
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.tsa.api as smt

from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Model
from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import LSTM, Dense,Input,concatenate
from tensorflow.keras.layers import *
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam


from baseFunctions import *
from data_helpers import processData6

In [None]:
# Load the prepared data set via the project helper `processData6`.
# Later cells read the columns `date`, `sales`, `dataT`, `store_nbr` and
# `family` from `data` and index `flippedPropDicts` by 'family'.
# NOTE(review): `propDicts` / `flippedPropDicts` look like category <-> id
# lookup tables -- confirm against data_helpers.
data, propDicts, flippedPropDicts = processData6()

# feature engineering

aggregated data
- there is some linear trend 

In [None]:
# Sales summed over all rows that share a date.
dailyData = data.groupby('date')['sales'].sum()

# Additive seasonal decomposition with a 12-step season.
# `dec` keeps the figure returned by .plot(), not the decomposition result.
decomposition = sm.tsa.seasonal_decompose(dailyData, model='additive', period=12)
dec = decomposition.plot()
plt.show()

In [None]:
# Periodogram of the aggregated daily sales.
# NOTE(review): presumably 365 is the reference time span in days and
# n_domFreq the number of dominant frequencies to report -- confirm against
# the helper's definition.
plot_periodogram(dailyData, 365, n_domFreq=30)

# Strong frequencies (cycles per 365 days) => period length
#  52 (weekly)          365/52  ~   7 days
#  24 (twice a month)   365/24  ~  15 days (half-month)
# 104 (twice a week)    365/104 ~ 3.5 days
#  12 (monthly)         365/12  ~  30 days
#   6 (every 2 months)  365/6   ~  60 days
#   4 (quarterly)       365/4   ~  90 days
#   3 (thirds)          365/3   ~ 120 days
#   2 (half-year)       365/2   ~ 182 days
#   1 (yearly)          365/1   = 365 days

In [None]:
# Work on a copy so the loaded frame stays untouched.
data1 = data.copy()

# Linear time: days elapsed since the date of the first row, starting at 1.
first_date = data1['date'].iloc[0]
data1['linear_time'] = (data1['date'] - first_date).dt.days + 1
data1['day_of_year'] = data1['date'].dt.day_of_year

# One addFourierFeature call per selected yearly frequency, in the same order
# as before; `periodicfeat` ends up holding the value returned by the last call.
for fourier_freq in (1, 2, 3, 4, 6, 12, 104, 24, 52):
    data1, periodicfeat = addFourierFeature(
        data1,
        n_splits=6,
        frequency=fourier_freq,
        feature='day_of_year',
        referenceTimespan=365,
    )

# Plain calendar features.
data1['weekday'] = data1['date'].dt.weekday
data1['month'] = data1['date'].dt.month


# individual prediction

some approaches work for large values but not for small and vice versa
predicting large & small values at the same time is hard

In [None]:
# Display the 'family' entry of flippedPropDicts.
# NOTE(review): presumably the lookup used to choose the numeric `family`
# ids in the cells below -- confirm.
flippedPropDicts['family']

In [None]:
# Training rows of store 1, family 3.
is_selected = (data1.dataT == 'train') & (data1.store_nbr == 1) & (data1.family == 3)
train = data1.loc[is_selected]

# Two stacked panels sharing the x-axis: sales on top, weekday and holiday
# type underneath.
fig = subplots.make_subplots(rows=2, cols=1, shared_xaxes='all')
sales_trace = go.Scattergl(x=train.date, y=train.sales)
weekday_trace = go.Scattergl(x=train.date, y=train.weekday)
holiday_trace = go.Scattergl(x=train.date, y=train.holidayType)
fig.add_trace(sales_trace, row=1, col=1)
fig.add_trace(weekday_trace, row=2, col=1)
fig.add_trace(holiday_trace, row=2, col=1)

In [None]:
# 5-fold cross-validation of `model` on (X, y).
# NOTE(review): none of the cells above defines `model`, `X` or `y`; this cell
# only runs on state left behind by a later cell. Confirm which estimator it
# was written for -- the `model` built further down is a Keras Model, and
# `X` there is a list of two arrays.
cv_scores = cross_val_score(model, X, y, cv=5)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

# Summary statistics over the folds.
print("Mean CV score:", cv_scores.mean())
print("Standard deviation of CV scores:", cv_scores.std())

### LGBM

In [None]:
import lightgbm as lgb

In [None]:
# Feature columns known for every time step: promotion, oil price, holiday
# descriptors and raw calendar features ...
trainF = [
    'onpromotion',
    'dcoilwtico',
    'holidayType',
    'description',
    'transferred',
    'linear_time',
    'day_of_year',
    'weekday',
    'month',
]
# ... plus the 0-phase Fourier column of the yearly frequencies 12, 104, 24, 52.
# Columns follow the pattern day_of_year_f<frequency>_<phase>; other
# frequencies (1, 2, 3, 4, 6) and phases (60 ... 300) were tried in earlier
# experiments and are deliberately left out here.
trainF += ['day_of_year_f{}_0'.format(k) for k in (12, 104, 24, 52)]

# Columns of the look-back window: the features plus the target itself.
train0 = trainF + ['sales']

# Training rows of one (store, family) series.  The explicit copy makes
# `train` an independent frame, so rescaling `sales` below cannot touch
# `data1` and does not raise pandas' SettingWithCopyWarning.
train = data1.loc[(data1.dataT == 'train') & (data1.store_nbr == 1) & (data1.family == 18)].copy() # family 18

n_predictedValues = 16  # forecast horizon (time steps per sample)
look_back = 100         # past time steps fed to the model
zScoreNorm = True

# Target scaling.  `mean` is pinned to 0, so the sales are only divided by
# their standard deviation and not centred.
# NOTE(review): `std` is taken over the whole series, i.e. it also sees the
# rows that end up in the validation split later on.
if zScoreNorm:
    mean = 0
    std = train.sales.std()
    train['sales'] = (train.sales - mean) / std

# Each sample needs look_back + n_predictedValues consecutive rows; the "+ 1"
# keeps the window that ends on the very last row (previously dropped).
n_windows = train.shape[0] - look_back - n_predictedValues + 1
if n_windows < 1:
    raise ValueError(
        'series too short: need at least {} rows, got {}'.format(
            look_back + n_predictedValues, train.shape[0]))

# Convert once instead of slicing the frame inside the loop.
past_values = train[train0].to_numpy()     # features + target (look-back part)
future_values = train[trainF].to_numpy()   # features only (forecast part)
target_values = train['sales'].to_numpy()

# sequence0: flattened look-back window, sequence1: flattened features of the
# forecast horizon, labels: the sales values to predict.
sequence0 = np.stack(
    [past_values[i:i + look_back].flatten() for i in range(n_windows)], axis=0)
sequence1 = np.stack(
    [future_values[i + look_back:i + look_back + n_predictedValues].flatten()
     for i in range(n_windows)], axis=0)
labels = np.stack(
    [target_values[i + look_back:i + look_back + n_predictedValues]
     for i in range(n_windows)], axis=0)



In [None]:
# Feature matrix: flattened look-back window followed by the flattened
# features of the forecast horizon.
X = np.concatenate((sequence0, sequence1), axis=1)
y = labels

# Chronological hold-out: with shuffle=False the last 20 % of the windows
# form the validation set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

# Set parameters for LGBM model
params = {
    'objective': 'regression',
    # Mean squared error.  The former value 'msle' is not one of LightGBM's
    # built-in metric names.
    'metric': 'l2',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

num_round = 100

# A LightGBM Dataset takes one scalar target per row, whereas `y` holds one
# column per forecast step.  Train one booster per step (direct multi-step
# forecast) and collect the per-step predictions column by column.
# After the loop `train_data`, `test_data` and `bst` refer to the last step;
# all boosters are kept in `boosters`.
boosters = []
y_pred = np.empty(y_test.shape, dtype=float)
for step in range(y_train.shape[1]):
    train_data = lgb.Dataset(X_train, label=y_train[:, step])
    # reference= makes the validation set reuse the training set's binning.
    test_data = lgb.Dataset(X_test, label=y_test[:, step], reference=train_data)

    # Early stopping via callback: the `early_stopping_rounds` keyword of
    # lgb.train is no longer available in LightGBM 4.
    bst = lgb.train(
        params,
        train_data,
        num_round,
        valid_sets=[test_data],
        callbacks=[lgb.early_stopping(stopping_rounds=10)],
    )
    boosters.append(bst)

    # Predict on test data
    y_pred[:, step] = bst.predict(X_test, num_iteration=bst.best_iteration)



#forecast = model.predict(X_train, verbose=False)
#if zScoreNorm:
#    forecast = forecast *std  + mean
#    y_train = y_train *std + mean
#rmsleTrain = np.sqrt(mean_squared_log_error(forecast, y_train))
#forecast = model.predict(X_test, verbose=False)
#if zScoreNorm:
#    forecast = forecast *std  + mean
#    y_test = y_test*std + mean
#rmsleTest = np.sqrt(mean_squared_log_error(forecast, y_test))
#print('errors:  ', round(rmsleTrain,3), round(rmsleTest,3), y_train.shape[0], y_test.shape[0])

### LSTM approach
only works either for small OR for large values

In [None]:
# Feature columns known for every time step: promotion, oil price, holiday
# descriptors and raw calendar features ...
trainF = [
    'onpromotion',
    'dcoilwtico',
    'holidayType',
    'description',
    'transferred',
    'linear_time',
    'day_of_year',
    'weekday',
    'month',
]
# ... plus the 0-phase Fourier column of the yearly frequencies 12, 104, 24, 52.
# Columns follow the pattern day_of_year_f<frequency>_<phase>; other
# frequencies (1, 2, 3, 4, 6) and phases (60 ... 300) were tried in earlier
# experiments and are deliberately left out here.
trainF += ['day_of_year_f{}_0'.format(k) for k in (12, 104, 24, 52)]

# Columns of the look-back window: the features plus the target itself.
train0 = trainF + ['sales']

# Training rows of one (store, family) series.  The explicit copy makes
# `train` an independent frame, so rescaling `sales` below cannot touch
# `data1` and does not raise pandas' SettingWithCopyWarning.
train = data1.loc[(data1.dataT == 'train') & (data1.store_nbr == 1) & (data1.family == 18)].copy() # family 18

n_predictedValues = 16  # forecast horizon (time steps per sample)
look_back = 100         # past time steps fed to the model
zScoreNorm = True

# Target scaling.  `mean` is pinned to 0, so the sales are only divided by
# their standard deviation and not centred.
# NOTE(review): `std` is taken over the whole series, i.e. it also sees the
# rows that end up in the validation folds later on.
if zScoreNorm:
    mean = 0
    std = train.sales.std()
    train['sales'] = (train.sales - mean) / std

# Each sample needs look_back + n_predictedValues consecutive rows; the "+ 1"
# keeps the window that ends on the very last row (previously dropped).
n_windows = train.shape[0] - look_back - n_predictedValues + 1
if n_windows < 1:
    raise ValueError(
        'series too short: need at least {} rows, got {}'.format(
            look_back + n_predictedValues, train.shape[0]))

# Convert once instead of slicing the frame inside the loop.
past_values = train[train0].to_numpy()     # features + target (look-back part)
future_values = train[trainF].to_numpy()   # features only (forecast part)
target_values = train['sales'].to_numpy()

# sequence0: (samples, look_back, len(train0)) look-back windows
# sequence1: (samples, n_predictedValues, len(trainF)) horizon features
# labels:    (samples, n_predictedValues) sales values to predict
sequence0 = np.stack(
    [past_values[i:i + look_back] for i in range(n_windows)], axis=0)
sequence1 = np.stack(
    [future_values[i + look_back:i + look_back + n_predictedValues]
     for i in range(n_windows)], axis=0)
labels = np.stack(
    [target_values[i + look_back:i + look_back + n_predictedValues]
     for i in range(n_windows)], axis=0)


# Seeding.  Only a missing helper is tolerated here; any other failure should
# surface instead of being swallowed by a bare `except`.
try:
    tf.keras.utils.set_random_seed(42)
except AttributeError:
    # NOTE(review): this branch runs when tf.keras.utils has no
    # set_random_seed, so the message wording may be misleading -- confirm.
    print('using new tf')

# Applied last, so TensorFlow's global seed is 0 for everything below.
tf.random.set_seed(0)

n_features = len(train0)

# Two inputs: the look-back window (features + target) and the features of
# the forecast horizon (no target column, hence n_features - 1).
input1 = Input(shape=(look_back, n_features))
input2 = Input(shape=(n_predictedValues, n_features-1))

# One LSTM encoder per input; each returns only its final state.
lstm1 = LSTM(64, activation='relu', return_sequences=False)(input1)
lstm2 = LSTM(64, activation='relu', return_sequences=False)(input2)

# Merge both encodings and map them to the forecast horizon.
x = tf.keras.layers.concatenate([lstm1, lstm2])
x = Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)

# The output layer was identical in both branches of the former
# `if zScoreNorm` switch.  relu keeps every forecast non-negative.
output = Dense(n_predictedValues, activation='relu')(x)

# Define the model
model = Model(inputs=[input1, input2], outputs=output)

# NOTE(review): `optimizer` is created but never handed to compile(); the
# string 'adam' below is what the model trains with.  Pass `optimizer`
# instead if the 1e-5 learning rate is intended.
optimizer = Adam(learning_rate=0.00001)
model.compile(optimizer='adam', loss='mae', metrics=[tf.keras.losses.MSLE])

n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

X = [sequence0,sequence1]
y = labels


def _to_sales_scale(values):
    """Undo the target scaling; return *values* unchanged when zScoreNorm is off."""
    if zScoreNorm:
        return values * std + mean
    return values


# Expanding-window cross-validation.  The same `model` instance is fitted in
# every split, so its weights carry over from one split to the next.
# NOTE(review): consecutive windows share rows, so the last training labels of
# a split overlap the first test labels; TimeSeriesSplit's `gap` argument
# could separate them.
for train_index, test_index in tscv.split(sequence0):
    X_train = [sequence0[train_index], sequence1[train_index]]
    X_test = [sequence0[test_index], sequence1[test_index]]
    y_train, y_test = labels[train_index], labels[test_index]

    model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

    # RMSLE on the original sales scale.  Arguments follow the documented
    # (y_true, y_pred) order of mean_squared_log_error.
    y_train = _to_sales_scale(y_train)
    forecast = _to_sales_scale(model.predict(X_train, verbose=False))
    rmsleTrain = np.sqrt(mean_squared_log_error(y_train, forecast))

    # `y_test` and `forecast` keep the rescaled values of the last split for
    # inspection after the loop.
    y_test = _to_sales_scale(y_test)
    forecast = _to_sales_scale(model.predict(X_test, verbose=False))
    rmsleTest = np.sqrt(mean_squared_log_error(y_test, forecast))

    print('errors:  ', round(rmsleTrain,3), round(rmsleTest,3), y_train.shape[0], y_test.shape[0])

In [None]:
# Inspect the rescaled test labels and forecasts left over from the last CV split.
y_test, forecast
# --------------seed = 0 -------------------------------------------------------------------------------------------------------------
# --------------familyId = 3 (Beverages), store id = 1 -----------(train 1307 test 261)---------- 7.219 7.652 == all 0 ----------------
# --------------5 splits, 10 epochs per split, 32 batch size---------------------------------------------------------------------------
# errors:   0.856 0.740     without all the time features: 'dcoilwtico', 'holidayType','description', 'transferred', 'linear_time', 'day_of_year','weekday', 'month',
# errors:   7.219 7.652     with all the time features
# errors:   2.885 4.493     without oil a lot worse :o     'holidayType','description', 'transferred', 'linear_time', 'day_of_year','weekday', 'month',
# errors:   1.533 2.071     without descrip.transf         'dcoilwtico', 'holidayType', 'linear_time', 'day_of_year','weekday', 'month',
# errors:   1.262 1.156     without time features          'dcoilwtico', 'holidayType','description', 'transferred',
# errors:   7.219 7.653     without day of year & month    'dcoilwtico', 'holidayType','description', 'transferred', 'linear_time','weekday',
# errors:   7.219 7.653     without month                  'dcoilwtico', 'holidayType','description', 'transferred', 'linear_time', 'day_of_year','weekday',
# errors:   1.247 1.128     without lin time & day of y    'dcoilwtico', 'holidayType','description', 'transferred','weekday', 'month',
# errors:   7.219 7.653     without day of year            'dcoilwtico', 'holidayType','description', 'transferred', 'linear_time','weekday', 'month',

# testing fourier features, lock those features (always use) 'dcoilwtico', 'holidayType','description', 'transferred', 'linear_time', 'day_of_year','weekday', 'month',
# errors:   0.624 0.694     frequency: 12, 24, 52, 104 only 0 phase diff:       day_of_year_f12_0, day_of_year_f104_0, day_of_year_f24_0, day_of_year_f52_0
# errors:   3.162 3.661     frequency: 12, 24, 52, 104  6x 60° phase diff:
# errors ->nans  frequency: 12, 24, 52, 104 only 0 & 180°:
# errors:   0.705 0.731     frequency: 12, 24, 52, 104 only 0, 120, 240° phase diff:
# errors:   6.92  7.155     frequency: 1,2,3,4,6,12, 24, 52, 104 only 0 phase diff:
# errors:   7.211 7.595     frequency: 1,2,6,12, 24, 52, 104 only 0 phase diff:
# errors:   4.929 5.107     frequency: 6,12, 24, 52, 104 only 0 phase diff:
# errors ->nans  frequency: 4, 12, 24, 52, 104 only 0:
# errors:   2.155 1.544     frequency: 3,12, 24, 52, 104 only 0 phase diff:
# errors ->nans  frequency: 1, 12, 24, 52, 104 only 0:
# errors ->nans  frequency: 2, 12, 24, 52, 104 only 0:

# --------------familyId = 3 (Beverages), store id = 18 ----------(train 1307 test 261)---------- 3.873 5.123 == all 0 ----------------
# --------------5 splits, 10 epochs per split, 32 batch size---------------------------------------------------------------------------
#errors:   3.873 5.123      'dcoilwtico', 'holidayType','description', 'transferred', 'linear_time', 'day_of_year','weekday', 'month', frequency: 12, 24, 52, 104 only 0 phase diff
# -> zscaling doesn't help, nothing really works for this one
# 

In [None]:
# True if at least one forecast value is non-zero (the experiment log above
# marks some runs as "== all 0").
(forecast != 0).any()

# predict in one big dataframe

In [None]:
# Wide layout of the training rows: one row per date, one column per
# (value column, store_nbr, family) combination.
train_rows = data1.loc[data1.dataT == 'train']
grouped = train_rows.pivot(index='date', columns=['store_nbr', 'family'])