In [None]:
import tensorflow as tf
tf.random.set_seed(42)
import numpy as np
np.random.seed(42)
import time
import pickle

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime
import plotly.offline as pyo
from plotly import subplots
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.tsa.api as smt

from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Model
from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import LSTM, Dense,Input,concatenate
from tensorflow.keras.layers import *
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
import lightgbm as lgb

from datetime import datetime, timedelta


from baseFunctions import *
from data_helpers import processData6, featureEngineering, getSequencesFast, removeOutliers, create_sequences


In [None]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [None]:
# Load the raw data set and derive the engineered feature columns.
# NOTE(review): processData6 / featureEngineering come from data_helpers; what
# propDicts, flippedPropDicts, timeFeatures and `splits` contain is not visible in
# this notebook - presumably category encoders/decoders and settings for the
# periodic time features. Confirm against data_helpers.
data, propDicts, flippedPropDicts = processData6()
data, timeFeatures = featureEngineering(data,splits=[2,2,2,2])

In [None]:
def addSeasonality(dt, dec, outputLength):
    """Repeat the first seasonal cycle of a decomposition to a requested length.

    The first ``dt`` values of ``dec.seasonal`` are divided by their maximum and
    then repeated back-to-back; a trailing partial cycle is added when
    ``outputLength`` is not a multiple of ``dt``.

    NOTE(review): this function is defined a second time further down the
    notebook; after a top-to-bottom run the later definition is the one in use.

    Parameters
    ----------
    dt : int
        Number of samples in one seasonal cycle (must be > 0).
    dec : object
        Any object exposing a ``seasonal`` sequence (NumPy array or pandas
        Series) holding at least ``dt`` values.
    outputLength : int
        Number of samples to return; may be smaller than ``dt`` or zero.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``outputLength``.
    """
    cycle = np.asarray(dec.seasonal[0:dt], dtype=float)
    cycle = cycle / max(cycle)

    # divmod gives the number of whole cycles and the leftover samples in one
    # step; building the result by tiling also covers outputLength < dt, which
    # previously failed because the loop body never ran.
    folds, rest = divmod(outputLength, dt)
    return np.concatenate((np.tile(cycle, folds), cycle[:rest]))

# data processing

In [None]:
""" create training data based on lagged features not 2 sequences """
# Exogenous columns fed to the model unchanged (they are appended to allF below).
# Commented-out names are candidate features that are currently switched off.
trainF = [
       #'store_nbr', 'family', 
       #'sales', 
       'onpromotion',# 'dataT',
       #'city', 'state', 'type', 'cluster', 
       'dcoilwtico', 
       'holidayType',
       'description', 
       'transferred', 
       #'transactions', 
       'store_closed']
# Calendar / periodic time columns, also appended to allF below.
# NOTE(review): the day_of_year_f* columns are produced by featureEngineering;
# presumably periodic encodings of the day of year - confirm in data_helpers.
timeF = [
       'linear_time', 'day_of_year', 'day_of_year_f12_0', 'day_of_year_f104_0','day_of_year_f24_0',  'day_of_year_f52_0',
       'day_of_year_f12_180', 'day_of_year_f104_180','day_of_year_f24_180','day_of_year_f52_180', 
       'weekday', 'month'
       ]

# Residual lags start at initial_lag + 1, so with 16 no feature needs a value
# from inside a 16-step forecast window.
initial_lag = 16 # 16 = independent from previous predictions
# Number of consecutive lag columns created per lagged feature.
lags = 21
# Window lengths for the rolling mean / std computed on the residual lag columns.
rolling = [7,14]

# Split dates, compared against the 'date' column in the loop below:
#   date >= date_string_val  -> rows relabelled 'val' (SARIMAX never sees them)
#   date <  date_string_test -> LightGBM training rows
# Date string
date_string_val = "2017-07-01"#"2017-05-01"
date_string_test = "2016-09-01"


# Build the supervised feature table for one (family, store) series:
#   1. log(sales + 1) transform,
#   2. SARIMAX fitted on the pre-validation history -> reference level 'ref',
#   3. residual 'target' = logSales - ref, described by lag / rolling features.
# train_subDf / test_subDf / val_subDf / allF are read by the cells below, so after
# the loops they describe the LAST (family, store) pair that was visited.
for familyId in [0]:#[6]: #data.family.unique():
    # start with only some families
    if familyId > 8:
        continue

    print(familyId)
    familyDf = data.loc[data.family==familyId]

    for storeId in [1]:#[41]: #data.store_nbr.unique():
        print('store',storeId)
        storeDf = familyDf.loc[(familyDf.store_nbr == storeId)]
        # Explicit copy: many columns are added below, and writing into a filtered
        # slice of `data` triggers pandas' SettingWithCopyWarning.
        storeDf = storeDf.loc[(storeDf.dataT == 'train')].copy()# & (storeDf.date > "2015-07-01")]

        # Everything from date_string_val onwards is held out as validation.
        mask = (storeDf.date >= date_string_val)
        storeDf.loc[mask, 'dataT'] = 'val'

        # ln transformation (the +1 keeps zero-sales days finite)
        storeDf['logSales'] = np.log(storeDf.sales + 1)

        f = 'logSales'#'transactions'# 'salesOrig' #target
        y_trainArima = storeDf.loc[storeDf.dataT == 'train', [f, 'date']].set_index('date')
        y_trainArima.index = pd.DatetimeIndex(y_trainArima.index).to_period('D')

        model = SARIMAX(y_trainArima, order=(3, 1, 3), seasonal_order=(1,1,1,7))
        model_fit = model.fit()
        storeDf = storeDf.reset_index()

        # Reference level: in-sample fitted values for the rows SARIMAX was fitted on,
        # out-of-sample forecast for the remaining (validation) rows.
        # Assumes rows are in chronological order with the 'train' rows first.
        arimaSalesLogT = np.array(model_fit.fittedvalues)
        nForecast = storeDf.shape[0] - len(arimaSalesLogT)
        if nForecast > 0:
            arimaSalesLogT = np.concatenate((arimaSalesLogT, model_fit.forecast(steps=nForecast)), axis=0)

        storeDf['salesArima'] = arimaSalesLogT
        storeDf['ref'] = storeDf['salesArima']
        storeDf['target'] = storeDf['logSales'] - storeDf['ref']

        # Lag features of the residual: initial_lag+1 ... initial_lag+lags steps back.
        featuresForLag = ['target']
        lagF = []#trainF
        for i in range(lags):
            lag = i+1+initial_lag
            newF = [feature + '_lag' + str(lag) for feature in featuresForLag]
            lagF = lagF + newF
            storeDf.loc[:,newF] = storeDf[featuresForLag].shift(lag).to_numpy()

        # Lag features of the SARIMAX level itself: 1 ... lags steps back.
        featuresForLag2 = ['salesArima']
        lagF2 = []#trainF
        for i in range(lags):
            lag = i+1
            newF = [feature + '_lag' + str(lag) for feature in featuresForLag2]
            lagF2 = lagF2 + newF
            storeDf.loc[:,newF] = storeDf[featuresForLag2].shift(lag).to_numpy()

        lagF = lagF + lagF2

        # Rolling mean / std, computed only on the residual ('target') lag columns.
        rollingF = []
        for rol in rolling:
            for lagName in lagF:
                if 'target' in lagName:
                    fm = lagName+'_rollingM' + str(rol)
                    fs = lagName+'_rollingS' + str(rol)
                    rollingF.append(fm)
                    rollingF.append(fs)
                    storeDf.loc[:,[fm]] = storeDf[lagName].rolling(rol).mean()
                    storeDf.loc[:,[fs]] = storeDf[lagName].rolling(rol).std()

        allF = lagF + rollingF + timeF + trainF

        # Drop the warm-up rows whose lag / rolling windows are incomplete.
        # NOTE(review): the slice also drops the final row (":-1"); the reason is not
        # visible here, so it is kept as-is - confirm it is intentional.
        storeDf = storeDf.iloc[lags+initial_lag+max(rolling)+1:-1]

        # Chronological split. The test window stops where validation starts: it was
        # previously `date >= date_string_test` only, which also contained every 'val'
        # row, so the early-stopping set overlapped the held-out set. Errors recorded
        # before this change were computed on the overlapping split.
        # .copy() lets later cells add columns without SettingWithCopyWarning.
        train_subDf = storeDf.loc[storeDf.date < date_string_test].copy()
        test_subDf  = storeDf.loc[(storeDf.date >= date_string_test) & (storeDf.dataT == 'train')].copy()
        val_subDf   = storeDf.loc[storeDf.dataT =='val'].copy()


# Turn the three splits into NumPy arrays for LightGBM.
targetF = 'target'

# SARIMAX reference level per row; the double brackets keep a column shape so it
# can be added back to the reshaped model output later.
baseTrain, baseTest, baseVal = (
    frame[['ref']].to_numpy() for frame in (train_subDf, test_subDf, val_subDf)
)

# Feature matrices, columns in allF order.
X_train, X_test, X_val = (
    frame[allF].to_numpy() for frame in (train_subDf, test_subDf, val_subDf)
)

# Residual targets as single-column arrays.
y_train, y_test, y_val = (
    frame[[targetF]].to_numpy() for frame in (train_subDf, test_subDf, val_subDf)
)

# Sanity check shown as the cell output: does any array still contain NaN?
# Order: X_train, X_test, X_val, y_train, y_test, y_val.
tuple(np.isnan(arr).any() for arr in (X_train, X_test, X_val, y_train, y_test, y_val))

# analyze residual

In [None]:
# Stationarity check of the raw SARIMAX residual on the training window.
# NOTE(review): test_stationarity comes from baseFunctions; the meaning of the
# positional arguments (7, True) is not visible here - presumably a rolling
# window length and a plot flag. Confirm before relying on p_value.
p_value = test_stationarity(train_subDf.target, 7, True)

In [None]:
# Same check on the 21-step differenced residual; dropna() removes the leading
# rows that differencing leaves undefined.
# NOTE(review): arguments (14, True) are passed to an opaque baseFunctions helper.
p_value = test_stationarity(train_subDf.target.diff(21).dropna(), 14, True)

In [None]:
# Periodogram of the 21-step differenced residual; the undefined leading rows are
# filled with 0 instead of being dropped, so the series keeps its length.
# NOTE(review): the second argument (365) is interpreted by baseFunctions - confirm.
plot_periodogram(train_subDf.target.diff(21).fillna(0), 365)

In [None]:
# Time-series diagnostic plot of the 16-step differenced residual, up to 100 lags.
# NOTE(review): tsplot is an opaque baseFunctions helper - presumably ACF/PACF panels.
tsplot(train_subDf.target.diff(16).fillna(0), lags=100)

In [None]:
# Additive decomposition of the training log-sales with a 365-sample period.
# `dec` is reused by the cells below (seasonal columns, addSeasonality).
dec = sm.tsa.seasonal_decompose(train_subDf.logSales,period = 365, model = 'additive')
# Plot the decomposition components.
dec.plot()

In [None]:
# Attach the seasonal component (scaled by its maximum) and the de-seasonalised
# log-sales to the training frame.
# .assign builds a new frame instead of writing into train_subDf in place, which
# avoids SettingWithCopyWarning (train_subDf is a slice of storeDf) and makes the
# cell safe to re-run.
train_subDf = train_subDf.assign(
    seasonal=dec.seasonal / dec.seasonal.max(),
    res_seasonalAnalysis=train_subDf.logSales - dec.seasonal,
)

In [None]:
def addSeasonality(dt, dec, outputLength):
    """Repeat the first seasonal cycle of a decomposition to a requested length.

    The first ``dt`` values of ``dec.seasonal`` are divided by their maximum and
    then repeated back-to-back; a trailing partial cycle is added when
    ``outputLength`` is not a multiple of ``dt``.

    NOTE(review): a function with the same name is already defined near the top
    of the notebook; this later definition shadows it after a top-to-bottom run.

    Parameters
    ----------
    dt : int
        Number of samples in one seasonal cycle (must be > 0).
    dec : object
        Any object exposing a ``seasonal`` sequence (NumPy array or pandas
        Series) holding at least ``dt`` values.
    outputLength : int
        Number of samples to return; may be smaller than ``dt`` or zero.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``outputLength``.
    """
    cycle = np.asarray(dec.seasonal[0:dt], dtype=float)
    cycle = cycle / max(cycle)

    # divmod gives the number of whole cycles and the leftover samples in one
    # step; building the result by tiling also covers outputLength < dt, which
    # previously failed because the loop body never ran.
    folds, rest = divmod(outputLength, dt)
    return np.concatenate((np.tile(cycle, folds), cycle[:rest]))

# Seasonal profile rebuilt by repeating the first 365 values of the decomposition
# over the whole training window. .assign returns a new frame, so nothing is
# written into a slice of storeDf (no SettingWithCopyWarning).
train_subDf = train_subDf.assign(
    seasonal2=addSeasonality(365, dec, train_subDf.shape[0])
)

In [None]:
# Visual check of the scaled seasonal component added to train_subDf above.
train_subDf.seasonal.plot()

In [None]:
# Periodogram of log-sales after subtracting the seasonal component.
# NOTE(review): the second argument (365) is interpreted by baseFunctions - confirm.
plot_periodogram(train_subDf.res_seasonalAnalysis, 365)

In [None]:
# Stationarity check of the de-seasonalised log-sales.
# NOTE(review): arguments (14, True) are passed to an opaque baseFunctions helper.
p_value = test_stationarity(train_subDf.res_seasonalAnalysis, 14, True)

# LGBM fit

In [None]:
 # compare against base lgbm that just predicts always t+16

def calcLossLGBM(pred, y, logTransform, predictDiff, base):
    """Return the RMSLE between predictions and targets on the sales scale.

    Parameters
    ----------
    pred : numpy.ndarray
        Model output with one value per row (1-D, as returned by ``gbm.predict``).
    y : numpy.ndarray
        Ground truth aligned with ``pred``.
    logTransform : bool
        If True, ``pred`` and ``y`` are on the ``log(sales + 1)`` scale and are
        mapped back with ``exp(x) - 1`` before scoring.
    predictDiff : bool
        If True, ``pred`` and ``y`` are residuals relative to ``base``, which is
        added back first.
    base : numpy.ndarray
        Reference level as a single-column array; only read when
        ``predictDiff`` is True.

    Returns
    -------
    float
        Square root of scikit-learn's ``mean_squared_log_error``.

    Raises
    ------
    ValueError
        Raised by scikit-learn when the ground truth contains negative values.
    """
    if predictDiff:
        # gbm.predict returns a flat vector; make it a column so it lines up
        # with the single-column base / y arrays.
        pred = np.reshape(pred, (pred.shape[0], 1)) + base
        y = y + base

    if logTransform:
        # expm1 is the exact inverse of log(x + 1) and stays accurate near 0.
        pred = np.expm1(pred)
        y = np.expm1(y)

    # Negative sales are impossible and MSLE is undefined for them, so floor the
    # predictions at 0. The ground truth is left untouched on purpose.
    pred = np.clip(pred, 0, None)

    # Keyword arguments make the (y_true, y_pred) roles explicit.
    return np.sqrt(mean_squared_log_error(y_true=y, y_pred=pred))
def plotLGBM(i, logTransform, pred, y, predictDiff, base):
    """Plot a 16-step window of predictions against the ground truth.

    Parameters
    ----------
    i : int
        Index of the first row of the window; rows ``i`` to ``i + 15`` are drawn.
    logTransform : bool
        If True, values are on the ``log(sales + 1)`` scale and are mapped back
        with ``exp(x) - 1`` before plotting.
    pred : numpy.ndarray
        Model output with one value per row (1-D, as returned by ``gbm.predict``).
    y : numpy.ndarray
        Ground truth aligned with ``pred``.
    predictDiff : bool
        If True, ``pred`` and ``y`` are residuals relative to ``base``, which is
        added back first.
    base : numpy.ndarray
        Reference level as a single-column array; only read when
        ``predictDiff`` is True.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    if predictDiff:
        pred = np.reshape(pred, (pred.shape[0], 1)) + base
        y = y + base

    window = slice(i, i + 16)
    if logTransform:
        # Exact inverse of log(sales + 1); a plain exp() would draw sales + 1 and
        # disagree with the scale used by calcLossLGBM.
        a = np.expm1(pred[window])
        y = np.expm1(y[window])
    else:
        a = pred[window]
        y = y[window]
    x = range(len(a))

    _, axs = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))
    axs.plot(x, y, color='blue', label='Original')
    axs.plot(x, a, color='red', label='pred')
    axs.set_title('index: ' + str(i))
    # The line labels are only visible once a legend is drawn.
    axs.legend()

# LightGBM hyper-parameters for the single-store experiment.
params = dict(
    boosting='gbdt',            # other options noted by the author: 'rf', 'dart'
    objective='regression',     # plain regression on the residual target
    metric='mse',               # mean squared error on the validation set
    num_leaves=15,
    # lambda_l1=0.1,
    # lambda_l2=0.2,
    # max_depth=10,
    # learning_rate=0.1,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=-1,
    force_col_wise=True,
    num_iterations=100,
)

# Fit on the training window; the test window is the only monitored set
# (the validation window is deliberately not passed to valid_sets).
# NOTE(review): stopping_rounds equals num_iterations, so early stopping cannot
# end training before the last round - confirm this is intended.
trainSet = lgb.Dataset(X_train, label=y_train)
testSet = lgb.Dataset(X_test, label=y_test)
gbm = lgb.train(
    params,
    trainSet,
    valid_sets=[testSet],
    callbacks=[lgb.early_stopping(stopping_rounds=100)],
)

# Targets are residuals of log(sales + 1), so both flags are on when scoring.
logTransform = True
predictDiff = True

predtrainLGBM = gbm.predict(X_train)
predtestLGBM = gbm.predict(X_test)
predvalLGBM = gbm.predict(X_val)

# (prediction, truth, SARIMAX reference) for train, test and validation, in that order.
evalSplits = (
    (predtrainLGBM, y_train, baseTrain),
    (predtestLGBM, y_test, baseTest),
    (predvalLGBM, y_val, baseVal),
)

splitErrors = [
    calcLossLGBM(splitPred, splitTruth, logTransform, predictDiff, splitBase)
    for splitPred, splitTruth, splitBase in evalSplits
]
print('errors: ', *splitErrors)

# One 16-step window per split, starting at row i*16.
for i in range(1):
    for splitPred, splitTruth, splitBase in evalSplits:
        plotLGBM(i*16, logTransform, splitPred, splitTruth, predictDiff, splitBase)


In [None]:
# Recorded RMSLE from the cell above, in the order train / test / val:
# initial_lag = 0:  0.39217121993164555 0.4369127610627825 0.46647451624311603
# initial_lag = 16: 0.39779196439513415 0.44181336636642143 0.44775333790913957

In [None]:
# Importance per feature as reported by the booster, paired with the column
# names in the order used to build X_train.
importances = gbm.feature_importance()
for column, score in zip(allF, importances):
    print(f'{column}: {score}')

# create predictions

In [None]:
""" create training data based on lagged features not 2 sequences """
# Exogenous columns fed to the model unchanged (appended to allF in the loop below).
# Commented-out names are candidate features that are currently switched off.
trainF = [
       #'store_nbr', 'family', 
       #'sales', 
       'onpromotion',# 'dataT',
       #'city', 'state', 'type', 'cluster', 
       'dcoilwtico', 
       'holidayType',
       'description', 
       'transferred', 
       #'transactions', 
       'store_closed']
# Calendar / periodic time columns, also appended to allF in the loop below.
timeF = [
       'linear_time', 'day_of_year', 'day_of_year_f12_0', 'day_of_year_f104_0','day_of_year_f24_0',  'day_of_year_f52_0',
       'day_of_year_f12_180', 'day_of_year_f104_180','day_of_year_f24_180','day_of_year_f52_180', 
       'weekday', 'month'
       ]

# Residual lags start at initial_lag + 1, so with 16 no feature needs a value
# from inside the forecast window.
initial_lag = 16 # 16 = independent from previous predictions
# Number of consecutive residual lag columns (fewer than in the experiment cell).
lags = 10 #21
# Window lengths for the rolling mean / std on the residual lag columns.
rolling = [7]

# Rows dated before this go to LightGBM training; later rows still labelled
# 'train' form the early-stopping / test window.
# Date string
date_string_test = "2017-07-01"

# Working frame for the loop; the commented filter would restrict the history.
# Note this is the same object as `data`, not a copy.
data0 = data#.loc[(data.date > "2015-07-01")]

# One frame of ('id', 'sales') predictions is appended per store.
predictions = []
# Losses per family and store: log[familyId][storeId] -> {'testL', 'trainL'}.
log={}


# Per-store pipeline that produces the submission rows:
#   log(sales + 1) -> ARIMA reference level -> LightGBM on the ARIMA residual
#   -> prediction for the rows labelled 'test'.
# Requires calcLossLGBM from the "LGBM fit" section to be defined.

# LightGBM settings are identical for every store, so they are built once.
params = {
    'boosting':'gbdt',#'gbdt', #'rf' #'dart'
    'objective': 'regression',  # plain regression on the residual target
    'metric': 'mse',  # Mean squared error
    'num_leaves': 15,
    'verbose': -1,
    'force_col_wise':True,
    'num_iterations':200
}
# Targets are residuals of log(sales + 1): both flags are on when scoring.
logTransform=True
predictDiff=True
targetF = 'target'

for familyId in [0]:#[6]: #data.family.unique():
    # start with only some families

    print(familyId)
    familyDf = data0.loc[data0.family==familyId]
    log1={}

    # Iterate over the stores that actually occur in this family's rows (the loop
    # used to read them from the global `data`, independent of data0 / the family).
    for storeId in familyDf.store_nbr.unique():
        #print('store',storeId)
        # Explicit copy: columns are added below, and writing into a filtered slice
        # of `data` triggers pandas' SettingWithCopyWarning.
        storeDf = familyDf.loc[(familyDf.store_nbr == storeId)].copy()

        # ln transformation (the +1 keeps zero-sales days finite)
        storeDf['logSales'] = np.log(storeDf.sales + 1)

        f = 'logSales'#'transactions'# 'salesOrig' #target
        y_trainArima = storeDf.loc[storeDf.dataT == 'train', [f, 'date']].set_index('date')
        y_trainArima.index = pd.DatetimeIndex(y_trainArima.index).to_period('D')
        model = ARIMA(y_trainArima, order=(5, 1, 5))
        model_fit = model.fit()

        # Reference level: fitted values for the 'train' rows, forecast for the rest.
        # The horizon is derived from the frame instead of being fixed at 16, so a
        # different number of future rows no longer breaks the column assignment.
        # Assumes rows are in chronological order with the 'train' rows first.
        arimaSalesLogT = np.array(model_fit.fittedvalues)
        nForecast = storeDf.shape[0] - len(arimaSalesLogT)
        if nForecast > 0:
            arimaSalesLogT = np.concatenate((arimaSalesLogT, model_fit.forecast(steps=nForecast)), axis=0)
        storeDf['salesArima'] = arimaSalesLogT
        storeDf['ref'] = storeDf['salesArima']
        storeDf['target'] = storeDf['logSales'] - storeDf['ref']

        # Lag features of the residual: initial_lag+1 ... initial_lag+lags steps back.
        featuresForLag = ['target']
        lagF = []#trainF
        for i in range(lags):
            lag = i+1+initial_lag
            newF = [feature + '_lag' + str(lag) for feature in featuresForLag]
            lagF = lagF + newF
            storeDf.loc[:,newF] = storeDf[featuresForLag].shift(lag).to_numpy()

        # Rolling mean / std on the residual lag columns.
        rollingF = []
        for rol in rolling:
            for lagName in lagF:
                if 'target' in lagName:
                    fm = lagName+'_rollingM' + str(rol)
                    fs = lagName+'_rollingS' + str(rol)
                    rollingF.append(fm)
                    rollingF.append(fs)
                    storeDf.loc[:,[fm]] = storeDf[lagName].rolling(rol).mean()
                    storeDf.loc[:,[fs]] = storeDf[lagName].rolling(rol).std()

        allF = lagF + rollingF + timeF + trainF

        # Drop the warm-up rows whose lag / rolling windows are incomplete.
        storeDf = storeDf.iloc[lags+initial_lag+max(rolling)+1:]
        train_subDf = storeDf.loc[storeDf.date < date_string_test]
        test_subDf  = storeDf.loc[(storeDf.date >= date_string_test) & (storeDf.dataT =='train')]
        # Copy: the 'sales' column of these rows is overwritten with predictions below.
        pred_subDf  = storeDf.loc[storeDf.dataT =='test'].copy()

        baseTrain = train_subDf[['ref']].to_numpy()
        baseTest  = test_subDf[['ref']].to_numpy()

        X_train = train_subDf[allF].to_numpy()
        y_train = train_subDf[[targetF]].to_numpy()
        X_test  = test_subDf[allF].to_numpy()
        y_test  = test_subDf[[targetF]].to_numpy()

        baseSub = pred_subDf[['ref']].to_numpy()
        X_sub   = pred_subDf[allF].to_numpy()

        # Train the model; the test window is the only monitored set.
        # dict(params) hands LightGBM its own copy of the shared settings.
        # NOTE(review): the logged test loss is measured on the same rows that drive
        # early stopping, so it is an optimistic estimate.
        gbm = lgb.train(dict(params), lgb.Dataset(X_train, label=y_train), valid_sets=[
            lgb.Dataset(X_test, label=y_test)
            #,lgb.Dataset(X_val, label=y_val)
            ]
        ,callbacks=[lgb.early_stopping(stopping_rounds=30)]
        )
        predtrainLGBM = gbm.predict(X_train)
        predtestLGBM = gbm.predict(X_test)

        lossTrain = calcLossLGBM(predtrainLGBM, y_train, logTransform, predictDiff, base=baseTrain)
        lossTest = calcLossLGBM(predtestLGBM, y_test, logTransform, predictDiff, baseTest)
        print('errors: ', lossTrain, lossTest)

        log2 = {}
        log2['testL'] = lossTest
        log2['trainL'] = lossTrain
        log1[storeId] = log2

        # Submission rows: add the ARIMA level back and undo the log transform.
        predvalLGBM = gbm.predict(X_sub)
        pred = np.reshape(predvalLGBM, baseSub.shape) + baseSub
        # Floor at 0: exp(pred) - 1 is negative whenever pred < 0, and negative sales
        # are invalid (calcLossLGBM applies the same floor when scoring).
        a = np.clip(np.exp(pred) - 1, 0, None)

        pred_subDf['sales'] = a.ravel()
        predictions.append(pred_subDf[['id','sales']])

    print(log1)
    log[familyId] = log1

            



In [None]:
# Show the per-store train / test losses of the last family processed above.
log1

In [None]:
# Stack the per-store ('id', 'sales') prediction frames into one frame.
predDf2 = pd.concat(predictions)

In [None]:
# Load a previously saved prediction file used as the fallback below.
# NOTE(review): judging by the file name this is an ARIMA-only baseline on the log
# scale; the cells below expect it to contain 'id' and 'sales' columns.
# NOTE(review): `a` is also used as a scratch array inside the prediction loop.
a = pd.read_csv('simpleArima_logT.csv')

In [None]:
# Display the loaded baseline frame.
a

In [None]:
# Index both frames by 'id' so they can be combined row by row.
predDf = predDf2.set_index('id')
b = a.set_index('id')

In [None]:
# Outer join on 'id' keeps ids that appear in only one of the two frames.
# The clashing 'sales' columns become 'sales_x' (new predictions) and
# 'sales_y' (baseline file).
c = pd.merge(predDf, b, on='id', how='outer')

In [None]:
# Prefer the new prediction; fall back to the baseline value where no new
# prediction exists for an id.
c['sales'] = c['sales_x'].fillna(c['sales_y'])

In [None]:
# Keep only the combined 'sales' column for the output file.
d = c.drop(['sales_x','sales_y'], axis =1)

In [None]:
# Write the combined predictions; the index is written as well (pandas default).
d.to_csv('arima_prediction_and_updatedFam0WithLGBM_useFullData.csv')