In [None]:
import tensorflow as tf
tf.random.set_seed(42)
import numpy as np
np.random.seed(42)
import time
import pickle

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime
import plotly.offline as pyo
from plotly import subplots
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.tsa.api as smt

from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from statsmodels.tsa.statespace.sarimax import SARIMAX

from datetime import datetime, timedelta


from baseFunctions import *
from data_helpers import processData6, featureEngineering, getSequencesFast, removeOutliers, create_sequences


In [None]:
# Load the prepared frame plus its two companion objects, then run feature
# engineering, which returns the updated frame and `timeFeatures`.
# NOTE(review): the meaning of splits=[2,2,2,2] is defined in data_helpers — confirm there.
data, propDicts, flippedPropDicts = processData6()
data, timeFeatures = featureEngineering(data,splits=[2,2,2,2])

In [None]:
def calcLossArima(fitted, y_train, logTransform):
    """Return the root-mean-squared log error between predictions and targets.

    Both inputs are flattened to 1-D float arrays and compared position by
    position, so pandas objects carrying different indexes (for example a
    forecast Series and a one-column DataFrame slice) are never index-aligned
    by accident.

    Parameters
    ----------
    fitted : array-like
        Model output (in-sample fitted values or a forecast).
    y_train : array-like
        Reference values holding as many elements as ``fitted``.
    logTransform : bool
        True when both inputs are already on the log(x + 1) scale; the plain
        RMSE of the inputs is returned. False when the inputs are on the raw
        scale; log1p is applied to both before the RMSE is taken.

    Returns
    -------
    numpy.float64
        The error value.

    Raises
    ------
    ValueError
        If the inputs differ in size, or if ``logTransform`` is False and
        either input contains a negative value (the logarithm would be
        undefined).
    """
    pred = np.asarray(fitted, dtype=float).ravel()
    target = np.asarray(y_train, dtype=float).ravel()
    if pred.size != target.size:
        raise ValueError(
            "fitted and y_train must hold the same number of values, got "
            + str(pred.size) + " and " + str(target.size)
        )

    if not logTransform:
        if (pred < 0).any() or (target < 0).any():
            raise ValueError(
                "Mean Squared Logarithmic Error cannot be used when "
                "targets contain negative values."
            )
        # Move raw-scale inputs onto the log(x + 1) scale used by the other branch.
        pred, target = np.log1p(pred), np.log1p(target)

    return np.sqrt(np.mean((pred - target) ** 2))

def plotArima(i, pred, y, len):
    """Plot a window of predictions against the observed series.

    Both inputs are mapped back from the log(x + 1) scale with expm1 before
    they are drawn.

    Parameters
    ----------
    i : int
        Position of the first point of the window.
    pred : array-like
        Predicted values on the log(x + 1) scale.
    y : array-like
        Observed values on the log(x + 1) scale; must hold as many elements
        as ``pred``.
    len : int
        Requested window length. The name shadows the builtin; it is kept so
        existing callers are not broken.
    """
    predArr = np.asarray(pred, dtype=float).ravel()
    obsArr = np.asarray(y, dtype=float).reshape(predArr.shape)

    window = slice(i, i + len)
    predWin = np.expm1(predArr[window])
    obsWin = np.expm1(obsArr[window])
    # The window may run past the end of the data, so size the x axis from the
    # points actually available rather than from the requested length.
    x = np.arange(predWin.size)

    _, axs = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))
    axs.plot(x, obsWin, color='blue', label='Original')
    axs.plot(x, predWin, color='red', label='pred')
    axs.set_title('index: ' + str(i))
    # The line labels are only shown once a legend is drawn.
    axs.legend()

In [None]:
# Add the log(sales + 1) target column to the full frame.
data['logSales'] = np.log(data.sales + 1)

# Hold-out split for a single (family, store) series: rows flagged 'train'
# dated up to and including `date` are used for fitting, later ones for
# evaluation.
date = "2017-07-01"
familyId = 0
storeId = 1
seriesMask = (data.dataT == 'train') & (data.family == familyId) & (data.store_nbr == storeId)
train_subDf = data.loc[seriesMask & (data.date <= date)]
test_subDf = data.loc[seriesMask & (data.date > date)]

# ARIMA experiments

In [None]:
from statsmodels.tsa.arima.model import ARIMA

In [None]:
# Column to model; 'transactions' and 'salesOrig' are the alternatives noted
# by the author.
f = 'logSales'
logTransform = (f == 'logSales')
y_trainArima, y_testArima = train_subDf[[f]], test_subDf[[f]]

# Fit an ARIMA(4, 1, 4) on the training part of the selected series.
model = ARIMA(y_trainArima, order=(4, 1, 4))
model_fit = model.fit()

In [None]:
# In-sample fit and a forecast covering the whole hold-out period.
fitted_values = model_fit.fittedvalues
predicted_values = model_fit.forecast(steps=len(y_testArima))

# Printed as: train error, then hold-out error.
print(
    'errors: ',
    calcLossArima(fitted_values, y_trainArima, logTransform),
    calcLossArima(predicted_values, y_testArima, logTransform),
)
# Optional visual check of the first 10 hold-out points:
#plotArima(0, predicted_values, y_testArima, 10)

In [None]:
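# Recorded output of the evaluation cell above (train error, hold-out error) for each ARIMA order tried:
#errors:  0.633163370502466  0.5900711105335015  for 5,1,5 Arima  -> 5,1,5 seems best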
#errors:  0.6334058506575241 0.5930226716168197  for 4,1,4
#errors:  0.6260654735614449 0.6001774588517368  for 10,1,10
#errors:  0.6358046366759819 0.5964152567744612  for 3,1,3
#errors:  0.6474186219187706 0.6044976113096909  for 1,1,1
#errors:  0.6414325407019292 0.6160611543945549  for 6,1,6

# ARIMA prediction

In [None]:
# Fit one ARIMA(5, 1, 5) per (family, store) series on log(sales + 1) and
# forecast the rows flagged as 'test'.
log = {}          # familyId -> {storeId: in-sample error}
predictions = []  # one (id, sales) frame per series
for familyId in data.family.unique():
    print(familyId)
    familyDf = data.loc[data.family==familyId]
    log1 ={}

    for storeId in data.store_nbr.unique():
        print('store',storeId)
        # Build the date filter from familyDf itself. The previous version read
        # `storeDf.date` here, before storeDf was assigned in this iteration.
        storeDf = familyDf.loc[(familyDf.store_nbr == storeId) & (familyDf.date > "2015-07-01")]
        train = storeDf.loc[storeDf.dataT =='train']
        # Copy so the forecast is written into an independent frame instead of
        # a slice of `data`.
        test = storeDf.loc[storeDf.dataT =='test'].copy()

        y_trainArima = np.log1p(train.sales)

        model = ARIMA(y_trainArima, order=(5, 1, 5))
        model_fit = model.fit()
        fitted_values = model_fit.fittedvalues
        log1[storeId] = calcLossArima(fitted_values, y_trainArima, True)
        # Forecast exactly as many steps as there are rows to fill.
        predicted_values = model_fit.forecast(steps=len(test))

        # Assign by position: the forecast index need not match the index of
        # `test`. expm1 undoes the log1p applied to the training target.
        test['sales'] = np.expm1(np.asarray(predicted_values, dtype=float))
        predictions.append(test[['id','sales']])

    print(log1)
    log[familyId] = log1


# SARIMA

In [None]:
# Hold-out split for a single (family, store) series: rows flagged 'train'
# dated up to and including `date` are used for fitting, later ones for
# evaluation.
date = "2017-07-01"
familyId = 6
storeId = 41
#data.index = pd.DatetimeIndex(data.index).to_period('D')
seriesMask = (data.dataT == 'train') & (data.family == familyId) & (data.store_nbr == storeId)
train_subDf = data.loc[seriesMask & (data.date <= date)]
test_subDf = data.loc[seriesMask & (data.date > date)]

In [None]:
# Column to model; 'transactions' and 'salesOrig' are the alternatives noted
# by the author.
f = 'logSales'
logTransform = (f == 'logSales')
y_trainSarima, y_testSarima = train_subDf[[f]], test_subDf[[f]]
#y_trainSarima.index = pd.DatetimeIndex(y_trainSarima.index).to_period('D')

# Seasonal ARIMA with a seasonal period of 7; change the order parameters as needed.
model = SARIMAX(y_trainSarima, order=(3, 1, 3), seasonal_order=(1, 1, 1, 7))
model_fit = model.fit(disp=0)


In [None]:
# Diagnostic plots for the fitted model. The trailing semicolon stops the
# notebook from echoing the returned Figure, which would otherwise render the
# same plot a second time.
model_fit.plot_diagnostics();

In [None]:
# In-sample fit and a forecast covering the whole hold-out period.
fitted_values = model_fit.fittedvalues
predicted_values = model_fit.forecast(steps=len(y_testSarima))

# Printed as: train error, then hold-out error.
print(
    'errors: ',
    calcLossArima(fitted_values, y_trainSarima, logTransform),
    calcLossArima(predicted_values, y_testSarima, logTransform),
)

In [None]:
# errors:  0.6247310111892052 0.5817735893367159  (5,1,5)(3,0,3,7)
# errors:  0.6357538444884834 0.5629046800895323  (5,1,5)(1,1,1,7)
# errors:  0.6314627377125411 0.5598850200083434  (5,1,5)(3,1,3,7)
# errors:  0.6318720245837718 0.5509787578371821  (5,1,5)(5,1,5,7)
# errors:  0.6442797678835153 0.5766299011076879  (5,1,5)(5,1,5,14)


# (5,1,5)(5,1,5,7) ~80s
#errors:  0.4879433387886212 0.38981586396161283  (5,1,5)(3,1,3,7) ~25s
#errors:  0.4881281152176729 0.39805714173553747  (3,1,3)(3,1,3,7) ~19s
#errors:  0.48888721375855865 0.38608270415221047 (3,1,3)(1,1,1,7) ~5s -> 3-4h
# vs. ARIMA
#errors:  0.4955536970606146 0.4396591487588635


# predict all values with sarima

In [None]:
""" predict with all data for now """
# Fit one SARIMAX(3, 1, 3)(1, 1, 1, 7) per (family, store) series on
# log(sales + 1) and forecast the rows flagged as 'test'.
log = {}          # familyId -> {storeId: in-sample error}
predictions = []  # one (id, sales) frame per series
for familyId in data.family.unique():
    print('----family ----',familyId)
    familyDf = data.loc[data.family==familyId]
    log1 ={}

    for storeId in data.store_nbr.unique():
        print('----store ----',storeId)
        storeDf = familyDf.loc[(familyDf.store_nbr == storeId)]# & (familyDf.date > "2015-07-01")]
        train = storeDf.loc[storeDf.dataT =='train']
        # Copy so the forecast is written into an independent frame instead of
        # a slice of `data`.
        test = storeDf.loc[storeDf.dataT =='test'].copy()

        y_trainSarima = np.log1p(train.sales)

        model = SARIMAX(y_trainSarima, order=(3, 1, 3), seasonal_order=(1,1,1,7))
        model_fit = model.fit(disp=0)
        fitted_values = model_fit.fittedvalues
        log1[storeId] = calcLossArima(fitted_values, y_trainSarima, True)
        # Forecast exactly as many steps as there are rows to fill.
        predicted_values = model_fit.forecast(steps=len(test))

        # Assign by position: the forecast index need not match the index of
        # `test`. expm1 undoes the log1p applied to the training target.
        test['sales'] = np.expm1(np.asarray(predicted_values, dtype=float))
        predictions.append(test[['id','sales']])

    print(log1)
    log[familyId] = log1


In [None]:
# Stack the per-series forecast frames into one table, index it by `id`,
# and write it out as a CSV file.
predDf2 = pd.concat(predictions)
a = predDf2.set_index('id')
a.to_csv('simpleSarima_logT_313_1117.csv')

In [None]:
# Display the nested error dictionary (family id -> store id -> in-sample error).
log

# Garch

In [None]:
!pip install statsmodels arch pandas

In [None]:
from arch import arch_model

In [None]:
# Column to model; 'transactions' and 'salesOrig' are the alternatives noted
# by the author.
f = 'logSales'
logTransform = (f == 'logSales')
y_trainGarch, y_testGarch = train_subDf[[f]], test_subDf[[f]]

# GARCH(5, 5) volatility specification fitted on the selected training column
# (here the log-sales series); change the order parameters as needed.
model = arch_model(y_trainGarch, vol='Garch', p=5, q=5)
model_fit = model.fit()
print(model_fit.summary())

In [None]:
# arch result objects expose no `fittedvalues`; the in-sample fitted mean is
# the observed series minus the model residuals (computed by position).
fitted_values = y_trainGarch[f].to_numpy() - np.asarray(model_fit.resid)

# arch's forecast() takes `horizon` (not `steps`) and returns an object whose
# `.mean` frame holds one column per step ahead; its last row is the forecast
# made from the final training observation.
garchForecast = model_fit.forecast(horizon=len(y_testGarch))
predicted_values = garchForecast.mean.iloc[-1].to_numpy()

# NOTE(review): the GARCH terms model the variance; the values scored here come
# from the mean part of the model (arch_model's default unless one was given)
# — confirm this is the quantity that should be compared with ARIMA/SARIMA.
print(
    'errors: ',
    calcLossArima(fitted_values, y_trainGarch, logTransform),
    calcLossArima(predicted_values, y_testGarch, logTransform),
)