In [None]:
import tensorflow as tf
tf.random.set_seed(42)
import numpy as np
np.random.seed(42)
import time
import pickle

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime
import plotly.offline as pyo
from plotly import subplots
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.tsa.api as smt

from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Model
from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import LSTM, Dense,Input,concatenate
from tensorflow.keras.layers import *
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
import lightgbm as lgb

from datetime import datetime, timedelta


from baseFunctions import *
from data_helpers import processData6, featureEngineering, getSequencesFast, removeOutliers, create_sequences


In [None]:
# Load the prepared dataset via the project helper from data_helpers.
# NOTE(review): judging by their names, propDicts / flippedPropDicts presumably
# map categorical labels to their encodings and back -- confirm in data_helpers.
data, propDicts, flippedPropDicts = processData6()
# Add the engineered features; the helper also returns `timeFeatures`.
# NOTE(review): the meaning of splits=[2,2,2,2] is defined in data_helpers, not here.
data, timeFeatures = featureEngineering(data,splits=[2,2,2,2])

In [None]:
""" create training data based on lagged features not 2 sequences """
# Exogenous covariates that are passed to the models unchanged (no lagging).
# Candidates currently switched off: store_nbr, family, sales, dataT, city,
# state, type, cluster, transactions.
trainF = [
    'onpromotion',
    'dcoilwtico',
    'holidayType',
    'description',
    'transferred',
    'store_closed',
]
# Time-derived columns expected to be present in `data` already.
timeF = [
    'linear_time', 'day_of_year', 'day_of_year_f12_0', 'day_of_year_f104_0', 'day_of_year_f24_0', 'day_of_year_f52_0',
    'day_of_year_f12_180', 'day_of_year_f104_180', 'day_of_year_f24_180', 'day_of_year_f52_180',
    'weekday', 'month',
]

initial_lag = 1    # smallest shift (in rows) between a target and what it may see; 16 = independent from previous predictions
lags = 21          # number of consecutive lagged copies created per feature
rolling = [7, 14]  # window sizes for rolling mean / std over the lagged targets

# Split boundaries (inclusive lower bounds)
date_string_val = "2017-07-01"   # alternative used before: "2017-05-01"
date_string_test = "2016-09-01"


for familyId in [0]:  # data.family.unique() for the full run
    # start with only some families
    if familyId > 8:
        continue

    print(familyId)
    familyDf = data.loc[data.family == familyId]

    for storeId in [1]:  # data.store_nbr.unique() for the full run
        print('store', storeId)
        # .copy() detaches the selection from `data`, so the column assignments
        # below do not raise SettingWithCopyWarning or rely on view/copy semantics.
        storeDf = familyDf.loc[(familyDf.store_nbr == storeId) & (familyDf.dataT == 'train')].copy()

        # the most recent labelled rows are relabelled as the validation split
        storeDf.loc[storeDf.date >= date_string_val, 'dataT'] = 'val'

        # ln transformation: log(sales + 1)
        storeDf['logSales'] = np.log(storeDf.sales + 1)

        # Seasonal components are estimated on the rows still labelled 'train',
        # so the validation period is not part of the decomposition input.
        relevantSales = storeDf.loc[storeDf.dataT == 'train']
        dfLen = storeDf.shape[0]
        seasonF = []
        for period in [7, 14, 21, 28, 52, 365]:
            dec = sm.tsa.seasonal_decompose(relevantSales.logSales, period=period, model='additive')
            f = 'logSalesSeasonality_' + str(period)
            # addSeasonality comes from baseFunctions; presumably it returns the
            # seasonal pattern extended to dfLen rows -- TODO confirm.
            storeDf.loc[:, [f]] = addSeasonality(period, dec, dfLen)
            seasonF.append(f)

        # The models predict the change relative to the value `initial_lag` rows earlier.
        storeDf['ref'] = storeDf['logSales'].shift(initial_lag)
        storeDf['target'] = storeDf['logSales'] - storeDf['ref']

        seasonF2 = []
        for f in seasonF:
            newF = f + '_diff' + str(initial_lag)
            storeDf[newF] = storeDf[f].diff(initial_lag)
            seasonF2.append(newF)

        # lag features: how many past datapoints are we taking into account.
        # Each lag is built as its own frame and attached in a single concat;
        # inserting hundreds of columns one by one fragments the DataFrame.
        featuresForLag = ['target'] + seasonF + seasonF2
        lagF = []
        lagFrames = []
        for lag in range(initial_lag, initial_lag + lags):
            shifted = storeDf[featuresForLag].shift(lag).add_suffix('_lag' + str(lag))
            lagF.extend(shifted.columns)
            lagFrames.append(shifted)
        lagDf = pd.concat(lagFrames, axis=1) if lagFrames else pd.DataFrame(index=storeDf.index)

        # rolling mean / std, only over the lagged target columns
        rollingCols = {}
        for rol in rolling:
            for name in lagF:
                if 'target' in name:
                    window = lagDf[name].rolling(rol)
                    rollingCols[name + '_rollingM' + str(rol)] = window.mean()
                    rollingCols[name + '_rollingS' + str(rol)] = window.std()
        rollingF = list(rollingCols)
        rollingDf = pd.DataFrame(rollingCols, index=storeDf.index)

        storeDf = pd.concat([storeDf, lagDf, rollingDf], axis=1)

        allF = lagF + rollingF + timeF + trainF + seasonF

        # Drop the warm-up rows that still contain NaNs from shifting / rolling,
        # plus the last row (kept from the original slicing).
        # `target` itself starts with `initial_lag` NaNs, the deepest lag shifts it
        # by another initial_lag + lags - 1 rows, and a rolling window of size w
        # needs w - 1 further rows. The historical offset is kept as a lower
        # bound so the current configuration selects exactly the same rows.
        maxRoll = max(rolling) if rolling else 0
        legacyWarmup = lags + initial_lag + maxRoll + 1
        requiredWarmup = lags + 2 * initial_lag - 1 + max(maxRoll - 1, 0)
        storeDf = storeDf.iloc[max(legacyWarmup, requiredWarmup):-1]

        train_subDf = storeDf.loc[storeDf.date < date_string_test]
        # The test split stops where validation starts; without the upper bound
        # every 'val' row would also be a test row and leak into early stopping.
        test_subDf = storeDf.loc[(storeDf.date >= date_string_test) & (storeDf.date < date_string_val)]
        val_subDf = storeDf.loc[storeDf.dataT == 'val']


# Matrices for the models below, taken from the last (family, store) processed above.
targetF = 'target'
baseTrain = train_subDf['ref'].to_numpy()
baseTest = test_subDf['ref'].to_numpy()
baseVal = val_subDf['ref'].to_numpy()

X_train = train_subDf[allF].to_numpy()
y_train = train_subDf[[targetF]].to_numpy()
X_test = test_subDf[allF].to_numpy()
y_test = test_subDf[[targetF]].to_numpy()
X_val = val_subDf[allF].to_numpy()
y_val = val_subDf[[targetF]].to_numpy()

# sanity check (shown as the cell output): does any matrix still contain NaNs?
(np.isnan(X_train).any(), np.isnan(X_test).any(), np.isnan(X_val).any(),
 np.isnan(y_train).any(), np.isnan(y_test).any(), np.isnan(y_val).any())

# lgbm

In [None]:
 # compare against base lgbm that just predicts always t+16

def calcLossLGBM(pred, y, logTransform, predictDiff, base):
    """Return the root mean squared logarithmic error between `pred` and `y`.

    pred, y      : predictions and ground truth with the same number of rows.
    logTransform : if truthy, both are mapped back with exp(x) - 1 before scoring.
    predictDiff  : if truthy, both are treated as differences relative to `base`;
                   all three are flattened to 1-D and `base` is added back.
    base         : reference values; only read when `predictDiff` is truthy.

    If any back-transformed prediction is negative, the predictions are
    clipped into [0, 1e20] before the error is computed.
    """
    if predictDiff:
        n_rows = y.shape[0]
        offset = np.reshape(base, n_rows)
        pred = np.reshape(pred, n_rows) + offset
        y = np.reshape(y, n_rows) + offset

    estimate = np.exp(pred) - 1 if logTransform else pred
    actual = np.exp(y) - 1 if logTransform else y

    if (estimate < 0).any():
        estimate = np.clip(estimate, 0, 1e20)

    return np.sqrt(mean_squared_log_error(estimate, actual))
def plotLGBM(i, logTransform, pred, y, predictDiff, base, horizon=16):
    """Plot one window of predictions against the ground truth.

    i            : index of the first row of the window.
    logTransform : if truthy, values are mapped back with exp(x) - 1, the
                   inverse of the log(sales + 1) transform and the same mapping
                   calcLossLGBM uses.
    pred, y      : predictions and ground truth with the same number of rows.
    predictDiff  : if truthy, `pred` and `y` are differences relative to
                   `base`; all three are flattened to 1-D and `base` is added back.
    base         : reference values; only read when `predictDiff` is truthy.
    horizon      : number of rows shown, starting at `i` (default 16).
    """
    if predictDiff:
        pred = np.reshape(pred, (y.shape[0])) + np.reshape(base, (y.shape[0]))
        y = np.reshape(y, (y.shape[0])) + np.reshape(base, (y.shape[0]))

    window = slice(i, i + horizon)
    if logTransform:
        # exp(x) - 1, not exp(x): otherwise every plotted value is off by one
        a = np.exp(pred[window]) - 1
        y = np.exp(y[window]) - 1
    else:
        a = pred[window]
        y = y[window]
    x = range(len(a))

    fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))
    axs.plot(x, y, color='blue', label='Original')
    axs.plot(x, a, color='red', label='pred')
    axs.set_title('index: ' + str(i))
    axs.set_xlabel('rows after index')
    # the labels above are only rendered once a legend is requested
    axs.legend()

# LightGBM configuration. Options tried but currently disabled: boosting 'rf' /
# 'dart', lambda_l1=0.1, lambda_l2=0.2, max_depth=10, learning_rate=0.1.
params = dict(
    boosting='gbdt',
    objective='regression',   # plain regression on the differenced log target
    metric='mse',             # mean squared error
    num_leaves=15,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    force_col_wise=True,
    num_iterations=100,
)

# Fit on the train split and monitor the test split (the val split is not
# passed to the booster). stopping_rounds equals num_iterations here, so
# training cannot be cut short before the last round.
train_set = lgb.Dataset(X_train, label=y_train)
monitor_set = lgb.Dataset(X_test, label=y_test)
gbm = lgb.train(
    params,
    train_set,
    valid_sets=[monitor_set],
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
)

predtrainLGBM, predtestLGBM, predvalLGBM = (
    gbm.predict(features) for features in (X_train, X_test, X_val)
)

# How the targets were built: log-transformed and differenced against `ref`.
logTransform = True
predictDiff = True

print(
    'errors: ',
    calcLossLGBM(predtrainLGBM, y_train, logTransform, predictDiff, base=baseTrain),
    calcLossLGBM(predtestLGBM, y_test, logTransform, predictDiff, baseTest),
    calcLossLGBM(predvalLGBM, y_val, logTransform, predictDiff, baseVal),
)

# Plotting is switched off (zero windows); raise the range to inspect windows.
for i in range(0):
    start = i*16
    plotLGBM(start, logTransform, predtrainLGBM, y_train, predictDiff, baseTrain)
    plotLGBM(start, logTransform, predtestLGBM, y_test, predictDiff, baseTest)
    plotLGBM(start, logTransform, predvalLGBM, y_val, predictDiff, baseVal)


In [None]:
# One importance score per input column of the trained booster. Pairing them
# with allF relies on the model having been trained on columns in that order
# (X_train is built from train_subDf[allF]).
importances = gbm.feature_importance()
for name, importance in zip(allF, importances):
    print(f'{name}: {importance}')

# residual analysis

In [None]:
# Pairwise correlation, on the train split, between the differenced target and
# the seasonal feature built with period 365.
corr = train_subDf[['target','logSalesSeasonality_365']].corr()

In [None]:
# Display the 2x2 correlation matrix held in `corr`.
corr

# NN / log regression

In [None]:
""" use a fully connected NN  for sequence with 16"""
from keras import backend as K

def root_mean_squared_error(y_true, y_pred):
    """Root mean squared error over the whole batch, returned as a scalar tensor.

    The mean is taken over every element before the square root. Averaging
    only over the last axis first would, for single-output targets, reduce to
    the absolute error of each sample instead of an RMSE.

    y_true : ground-truth values; cast to the dtype of `y_pred`.
    y_pred : predicted values, same shape as `y_true`.
    """
    y_pred = tf.convert_to_tensor(y_pred)
    y_true = tf.cast(y_true, y_pred.dtype)
    return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))

# Fix every RNG Keras relies on so the run is repeatable.
tf.keras.utils.set_random_seed(42)

n_features = X_train.shape[1]

# Three stacked Dense layers without activations: n_features -> 64 -> 1.
model = Sequential()
for layer in (
    Dense(n_features, input_shape=(n_features,)),
    Dense(64),
    Dense(1),
):
    model.add(layer)

model.compile(loss='mse', optimizer='adam')
model.fit(
    X_train,
    y_train,
    epochs=4000,
    batch_size=3200,
    validation_data=(X_test, y_test),
)

predtrain, predtest = (
    model.predict(features, verbose=False) for features in (X_train, X_test)
)

# Same target construction as for the LightGBM run: log-transformed, differenced.
logTransform = True
predictDiff = True

print(
    'errors: ',
    calcLossLGBM(predtrain, y_train, logTransform, predictDiff, baseTrain),
    calcLossLGBM(predtest, y_test, logTransform, predictDiff, baseTest),
)


In [None]:
# Re-score the trained network on the train and test splits.
predtrain = model.predict(X_train, verbose=False)
predtest  = model.predict(X_test, verbose=False)
print('errors: ', calcLossLGBM(predtrain, y_train, logTransform, predictDiff, baseTrain), calcLossLGBM(predtest, y_test, logTransform, predictDiff, baseTest))

# Plot the network's own predictions (the ones scored above), not the
# LightGBM predictions left over from the earlier section.
for i in range(1):
    plotLGBM(i*16, logTransform, predtrain, y_train, predictDiff, baseTrain)
    plotLGBM(i*16, logTransform, predtest, y_test, predictDiff, baseTest)