In [None]:
import tensorflow as tf
tf.random.set_seed(42)
import numpy as np
np.random.seed(42)
import time
import pickle

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime
import plotly.offline as pyo
from plotly import subplots
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.tsa.api as smt

from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Model
from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import LSTM, Dense,Input,concatenate
from tensorflow.keras.layers import *
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
import lightgbm as lgb

from datetime import datetime, timedelta
from sklearn.model_selection import KFold


from baseFunctions import *
from data_helpers import processData6, featureEngineering, getSequencesFast, removeOutliers, create_sequences


In [None]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

In [None]:
# Load the base frame and run feature engineering (both helpers are imported from data_helpers).
data, propDicts, flippedPropDicts = processData6()
data, timeFeatures = featureEngineering(data,splits=[2,2,2,2])

In [None]:
def addSeasonality(dt, dec, outputLength):
    """
    Repeat the first seasonal cycle of a decomposition up to a given length.

    Parameters
    ----------
    dt : int
        Length of one seasonal cycle in samples (must be positive).
    dec : object
        Any object exposing a sliceable ``seasonal`` attribute; only its first
        ``dt`` values are used.
    outputLength : int
        Number of samples to return.

    Returns
    -------
    numpy.ndarray
        Float array of length ``outputLength`` holding the first cycle, scaled
        so that its maximum is 1 and repeated cyclically (the last repetition
        is truncated when ``outputLength`` is not a multiple of the cycle).

    Raises
    ------
    ValueError
        If ``dt`` is not positive or the seasonal slice is empty.
    """
    if dt <= 0:
        raise ValueError("dt must be a positive number of samples")

    cycle = np.asarray(dec.seasonal[0:dt], dtype=float)
    cycle = cycle / max(cycle)

    # np.resize repeats the cycle as often as needed and cuts the tail. This
    # also covers outputLength < dt, where the former manual loop never ran
    # and left its end index undefined (NameError).
    return np.resize(cycle, outputLength)

In [None]:
def calcLossArima(pred, y, logTransform):
    """
    Root mean squared log error between a prediction and the observed values.

    With ``logTransform`` set, both inputs are mapped through ``exp(x) - 1``
    before scoring; otherwise they are scored as given. Negative predictions
    are clipped to zero before the metric is computed.
    """
    if logTransform:
        predicted, observed = np.exp(pred) - 1, np.exp(y) - 1
    else:
        predicted, observed = pred, y

    hasNegative = (predicted < 0).any()
    if hasNegative:
        predicted = np.clip(predicted, 0, 1e20)

    return np.sqrt(mean_squared_log_error(predicted, observed))

# data processing

### all data

In [None]:
def fit_sarimax(group):
    """
    Fit a SARIMAX model to one group (time series) and make predictions.

    Rows whose ``dataT`` is not ``'test'`` are used to fit a
    SARIMAX(3,1,3)x(1,1,1,7) model on ``logSales``. The in-sample fitted values
    are written to ``salesArima`` for those rows and the out-of-sample forecast
    to the ``'test'`` rows, in row order.

    Parameters
    ----------
    group : pandas.DataFrame
        One series; must contain the columns ``dataT`` and ``logSales``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` with the ``salesArima`` column filled in. The input
        frame is left untouched, because functions passed to ``groupby.apply``
        should not mutate the object they receive.
    """
    # Positional boolean masks keep the assignment independent of the index
    # (duplicate or non-monotonic labels would break label-based alignment).
    isTest = (group['dataT'] == 'test').to_numpy()
    train = group[~isTest]
    nTest = int(isTest.sum())

    model = SARIMAX(train['logSales'], order=(3, 1, 3), seasonal_order=(1, 1, 1, 7))
    # disp=False silences the optimizer log, which is printed once per series otherwise.
    model_fit = model.fit(disp=False)

    result = group.copy()
    result['salesArima'] = np.nan
    # 1-D arrays: one value per row, assigned by position.
    result.loc[~isTest, 'salesArima'] = np.asarray(model_fit.fittedvalues, dtype=float)
    if nTest:
        result.loc[isTest, 'salesArima'] = np.asarray(model_fit.forecast(nTest), dtype=float)

    return result

def calcArimaErrorFullDf(data2):
    """
    Calculate the error of the ARIMA prediction per series.

    Writes two columns into ``data2`` in place: ``target`` (``logSales`` minus
    ``salesArima``) and ``targetSquared``. Prints the square root of the mean
    of the per-series mean squared errors and returns those per-series means,
    indexed by (``store_nbr``, ``family``).
    """
    residual = data2['logSales'] - data2['salesArima']
    data2['target'] = residual
    data2['targetSquared'] = data2['target'] ** 2

    perSeries = data2.groupby(['store_nbr', 'family'])['targetSquared'].mean()
    overall = np.sqrt(perSeries.mean())
    print(overall)
    return perSeries

In [None]:
# takes 190min!! -> use multicore in the future
# NOTE(review): `data1` is not assigned anywhere above this cell in the visible
# notebook (the loaded frame is called `data`), so this fails on a fresh kernel
# unless `data1` is defined elsewhere — confirm the intended input.
# NOTE(review): the file written here has a different name from the one that
# processAllData reads ('sarima_313_117_and_id.csv') — confirm which is authoritative.
data1 = data1.groupby(['store_nbr','family']).apply(fit_sarimax)
data1.to_csv('data_enriched_with_sarima313_1117.csv')
data2 = data1.set_index('id').reset_index()

In [None]:
def processAllData(data1, targetLags, featureLags, rolling, initial_lag):
    """
    Build the model frame for all series at once.

    Steps, all evaluated per (``store_nbr``, ``family``) series:
    1. z-scale ``transactions``, ``linear_time``, ``day_of_year``, ``dcoilwtico``;
    2. add ``logSales = log(sales + 1)`` and merge the stored ARIMA prediction
       (``salesArima``) from 'sarima_313_117_and_id.csv' on (``id``, ``sales``);
    3. define ``ref = salesArima`` and ``target = logSales - ref``;
    4. add lagged copies of ``target`` (lag ``l + initial_lag``) and of
       ``salesArima`` / ``onpromotion`` / ``dcoilwtico`` (lag ``l``);
    5. add rolling mean / std of the target, oil and promotion lag columns;
    6. drop the first ``max(lags) + initial_lag + 1`` rows of every series.

    Parameters
    ----------
    data1 : pandas.DataFrame
        Input frame; it is copied, so the caller's frame is not modified and
        re-running the calling cell gives the same result.
    targetLags, featureLags : list of int
        Lags for the target and for the exogenous features (non-empty).
    rolling : list of int
        Rolling window sizes.
    initial_lag : int
        Extra offset added to every target lag.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The processed frame (rows ordered by series, ``id`` as first column)
        and the names of all lag and rolling feature columns.
    """
    groupKeys = ['store_nbr', 'family']
    data1 = data1.copy()

    def _bySeries(frame, column):
        # Fresh groupby each time so that newly added columns are always visible.
        return frame.groupby(groupKeys)[column]

    # 1. per-series z-scaling
    for col in ['transactions', 'linear_time', 'day_of_year', 'dcoilwtico']:
        seriesStats = _bySeries(data1, col)
        data1[col] = (data1[col] - seriesStats.transform('mean')) / seriesStats.transform('std')

    # 2. log target and stored ARIMA reference
    data1['logSales'] = np.log(data1['sales'] + 1)

    arimaPred = pd.read_csv('sarima_313_117_and_id.csv')
    data1 = pd.merge(data1, arimaPred[['id','sales','salesArima']], on=['id','sales'], how='left')
    print('loaded arima data')

    # 3. residual target
    data1['ref'] = data1['salesArima']
    data1['target'] = data1['logSales'] - data1['ref']

    # 4. lag features
    lagF_target = []
    for l in targetLags:
        lag = l + initial_lag
        name = 'target_lag' + str(lag)
        data1[name] = _bySeries(data1, 'target').shift(lag)
        lagF_target.append(name)

    lagF_features = []
    for lag in featureLags:
        for col in ['salesArima', 'onpromotion', 'dcoilwtico']:
            name = col + '_lag' + str(lag)
            data1[name] = _bySeries(data1, col).shift(lag)
            lagF_features.append(name)

    lagF = lagF_target + lagF_features

    # 5. rolling statistics of the lag features
    rollingF = []
    for rol in rolling:
        for lagName in lagF:
            if 'target' in lagName or 'dcoilwtico' in lagName or 'onpromotion' in lagName:
                fm = lagName + '_rollingM' + str(rol)
                fs = lagName + '_rollingS' + str(rol)
                rollingF.append(fm)
                rollingF.append(fs)
                rolled = _bySeries(data1, lagName).rolling(rol, min_periods=1)
                # groupby().rolling() returns the values in group order under a
                # (store_nbr, family, original index) MultiIndex. Drop only the
                # two group levels so the assignment aligns on the original row
                # labels; resetting the whole index would pair group-sorted
                # positions with unsorted rows.
                data1[fm] = rolled.mean().reset_index(level=[0, 1], drop=True)
                data1[fs] = rolled.std().reset_index(level=[0, 1], drop=True)

    # 6. drop the warm-up rows of every series; the stable sort reproduces the
    # series-by-series row order of the former groupby().apply() slicing.
    warmUp = max(max(targetLags), max(featureLags)) + initial_lag + 1
    keep = data1.groupby(groupKeys).cumcount() >= warmUp
    data3 = data1.loc[keep].sort_values(groupKeys, kind='mergesort')
    data3 = data3.set_index('id').reset_index()

    return data3, lagF + rollingF


In [None]:
# Lag / window configuration for the all-data pipeline, then build the model frame.
# NOTE(review): the same four settings are assigned again in the per-group cell further down.
initial_lag = 16 # 16 = independent from previous predictions
targetLags = [1,2,3,4,5,6,7,8,9,10]
featureLags = [1,2,3,4,5,6,7,8,9,10]
rolling = [7,21]

allData, addedF = processAllData(data, targetLags, featureLags, rolling, initial_lag)

In [None]:
# NOTE(review): 'zscaleTaget' is only created in the following cell, so on a
# fresh kernel this cell raises KeyError unless that cell has been run first.
allData[['target','zscaleTaget']].hist(bins = 100)

In [None]:
# z-scale the target with the mean / std of its own (store_nbr, family) series,
# then count the rows where the scaled target is NaN.
allData['meanTarget'] = allData.groupby(['store_nbr','family'])['target'].transform('mean')
allData['stdTarget'] = allData.groupby(['store_nbr','family'])['target'].transform('std')
allData['zscaleTaget'] = (allData['target']-allData['meanTarget']) / allData['stdTarget']  # hard to normalize over std! creating lots of nans
allData.zscaleTaget.isna().sum()

In [None]:
# Distribution of the per-series target standard deviation computed above.
allData.stdTarget.hist(bins =100)

In [None]:
# Per-series mean squared ARIMA residual; the helper also prints the overall
# value and (re)writes allData['target'] and allData['targetSquared'] in place.
a = calcArimaErrorFullDf(allData)

In [None]:
# Series ordered from smallest to largest ARIMA error.
a.sort_values()

In [None]:
# Date string
date_string_test = "2017-08-01"

# Naming used below:
#   train_bigDf - rows flagged 'train' before date_string_test
#   test_bigDf  - rows from date_string_test on that are not flagged 'test' (labelled hold-out)
#   val_bigDf   - rows flagged 'test'
# NOTE(review): all three are .loc selections of allData; later cells assign new
# columns to them, which can raise SettingWithCopyWarning.
train_bigDf = allData.loc[(allData.date < date_string_test) & (allData.dataT =='train')]
test_bigDf  = allData.loc[(allData.date >= date_string_test) & (allData.dataT !='test')]
val_bigDf   = allData.loc[ allData.dataT =='test']

# Static / exogenous model inputs.
trainF = [
       'store_nbr', 'family', 
       #'sales', 
       'onpromotion',# 'dataT',
       'city', 'state', 'type', 'cluster', 
       'dcoilwtico', 
       'holidayType',
       'description', 
       'transferred', 
       #'transactions', 
       'store_closed']
# Calendar inputs.
timeF = [
       'linear_time', 'day_of_year', 'day_of_year_f12_0', 'day_of_year_f104_0','day_of_year_f24_0',  'day_of_year_f52_0',
       'day_of_year_f12_180', 'day_of_year_f104_180','day_of_year_f24_180','day_of_year_f52_180', 
       'weekday', 'month'
       ]

# Lag / rolling features from processAllData plus the two lists above.
allF = addedF + trainF + timeF

# The model is trained on the z-scaled residual; 'ref' is the ARIMA baseline.
targetF, refF = 'zscaleTaget', 'ref'
logTransform = True
predictDiff = True
X_train, X_test, X_pred = train_bigDf[allF],test_bigDf[allF],val_bigDf[allF]
y_train, y_test = train_bigDf[targetF], test_bigDf[targetF]
baseTrain, baseTest, basePred = train_bigDf[refF], test_bigDf[refF],val_bigDf[refF]

### per group

In [None]:
def processDataArima(storeDf, date_string_val, date_string_test, targetLags, featureLags):
              """
              Build the ARIMA-residual training frame for a single series.

              Steps: z-scale four numeric columns with the frame's own mean / std,
              relabel rows from ``date_string_val`` on as 'val', fit a
              SARIMAX(3,1,3)x(1,1,1,7) on the ``logSales`` of the rows still labelled
              'train', use fitted values plus a forecast for the remaining rows as
              ``salesArima`` / ``ref``, define ``target = logSales - ref``, add lag and
              rolling features, and drop the warm-up rows.

              NOTE(review): besides its parameters this function reads the module-level
              names ``initial_lag`` and ``rolling``; they must be defined before calling.
              NOTE(review): ``storeDf`` is modified in place before ``reset_index``.

              Returns
              -------
              (train_subDf, test_subDf, val_subDf, allF, storeDf)
                  Rows before ``date_string_test``, rows from ``date_string_test`` on,
                  rows labelled 'val', the feature column names, and the full
                  processed frame.
              """
              trainF = [
                     #'store_nbr', 'family', 
                     #'sales', 
                     'onpromotion',# 'dataT',
                     #'city', 'state', 'type', 'cluster', 
                     'dcoilwtico', 
                     'holidayType',
                     'description', 
                     'transferred', 
                     #'transactions', 
                     'store_closed']
              timeF = [
                     'linear_time', 'day_of_year', 'day_of_year_f12_0', 'day_of_year_f104_0','day_of_year_f24_0',  'day_of_year_f52_0',
                     'day_of_year_f12_180', 'day_of_year_f104_180','day_of_year_f24_180','day_of_year_f52_180', 
                     'weekday', 'month'
                     ]

              # z-scale with the statistics of this frame only
              storeDf['linear_time'] = (storeDf['linear_time'] - storeDf.linear_time.mean()) /storeDf.linear_time.std()
              storeDf['day_of_year'] = (storeDf['day_of_year'] - storeDf.day_of_year.mean()) /storeDf.day_of_year.std()
              storeDf['dcoilwtico'] = (storeDf['dcoilwtico'] - storeDf.dcoilwtico.mean()) /storeDf.dcoilwtico.std()
              storeDf['transactions'] = (storeDf['transactions'] - storeDf.transactions.mean()) /storeDf.transactions.std()

              # rows from date_string_val on are excluded from the ARIMA fit below
              mask = (storeDf.date >= date_string_val)
              storeDf.loc[mask,['dataT']] = 'val'

              # ln transformation
              storeDf.loc[:,['logSales']] = np.log(storeDf.sales + 1)

              f = 'logSales'#'transactions'# 'salesOrig' #target
              y_trainArima = storeDf.loc[storeDf.dataT == 'train'][[f,'date']]
              y_trainArima = y_trainArima.set_index('date')
              y_trainArima.index = pd.DatetimeIndex(y_trainArima.index).to_period('D')

              model = SARIMAX(y_trainArima, order=(3, 1, 3), seasonal_order=(1,1,1,7))

              model_fit = model.fit()
              storeDf = storeDf.reset_index()
              
              # in-sample fitted values followed by a forecast for every remaining row
              arimaSalesLogT = np.array(model_fit.fittedvalues)
              print(storeDf.shape[0]- len(model_fit.fittedvalues), 'arima predicted')
              arimaSalesLogT = np.concatenate((arimaSalesLogT, model_fit.forecast(steps=(storeDf.shape[0]- len(model_fit.fittedvalues)))), axis=0)

              storeDf.loc[:,['salesArima']] = arimaSalesLogT

              storeDf.loc[:,['ref']] = storeDf['salesArima']
              storeDf.loc[:,['target']] = storeDf['logSales'] - storeDf['ref']

              # lag features: how many past data points are used as inputs
              featuresForLag = ['target']
              lagF_target = []#trainF
              for i in targetLags:
                     lag = i+initial_lag  # initial_lag is a module-level name, not a parameter
                     newF = [featuresForLag[j] + '_lag' + str(lag) for j in range(len(featuresForLag))]
                     lagF_target = lagF_target + newF
                     storeDf.loc[:,newF] = storeDf[featuresForLag].shift(lag).to_numpy()

              featuresForLag2 = ['salesArima','onpromotion','dcoilwtico']
              lagF_features = []#trainF
              for i in featureLags:
                     lag = i
                     newF = [featuresForLag2[j] + '_lag' + str(lag) for j in range(len(featuresForLag2))]
                     lagF_features = lagF_features + newF
                     storeDf.loc[:,newF] = storeDf[featuresForLag2].shift(lag).to_numpy()

              lagF = lagF_target + lagF_features

              # rolling features
              rollingF = []
              for rol in rolling:  # rolling is a module-level name, not a parameter
                     for i in range(len(lagF)):
                            #if 'sales_t-16'  in lagF[i]:
                            if 'target'  in lagF[i] or 'dcoilwtico'  in lagF[i] or 'onpromotion'  in lagF[i]:
                                   fm = lagF[i]+'_rollingM' + str(rol)
                                   fs = lagF[i]+'_rollingS' + str(rol)
                                   rollingF.append(fm)
                                   rollingF.append(fs)
                                   storeDf.loc[:,[fm]] = storeDf[lagF[i]].rolling(rol).mean()#.copy()
                                   storeDf.loc[:,[fs]] = storeDf[lagF[i]].rolling(rol).std()#.copy()


              allF = lagF + rollingF + timeF + trainF

              # drop the leading rows: largest lag + initial_lag + largest rolling window + 1
              storeDf = storeDf.iloc[max(max(targetLags),max(featureLags))+initial_lag+max(rolling)+1:storeDf.shape[0]]

              train_subDf = storeDf.loc[storeDf.date < date_string_test]
              test_subDf  = storeDf.loc[storeDf.date >= date_string_test]
              val_subDf   = storeDf.loc[storeDf.dataT =='val']

              return train_subDf, test_subDf, val_subDf, allF, storeDf
    

In [None]:
# Configuration of the per-group pipeline. processDataArima reads `initial_lag`
# and `rolling` from module scope, so they must be set before it is called.
initial_lag = 16 # 16 = independent from previous predictions
targetLags = [1,2,3,4,5,6,7,8,9,10]
featureLags = [1,2,3,4,5,6,7,8,9,10]
rolling = [7,21]

# Date string
date_string_val = "2017-07-31"#"2017-05-01" # no arima fit on this one
date_string_test = "2016-09-01"

logTransform = True
predictDiff = False # predict diff between arima & sales tho


for familyId in [0]:#[6]: #data.family.unique():
    # start with only some families
    if familyId > 8:
        continue

    print(familyId)
    familyDf = data.loc[data.family==familyId]

    for storeId in [1]:#[41]: #data.store_nbr.unique():
        print('store',storeId)
        storeDf = familyDf.loc[(familyDf.store_nbr == storeId)]
        storeDf = storeDf.loc[(storeDf.dataT == 'train')]# & (storeDf.date > "2015-07-01")]
        train_subDf, test_subDf, val_subDf, allF, modifiedDf = processDataArima(storeDf, date_string_val, date_string_test, targetLags, featureLags)


# ARIMA baseline and model inputs / targets of the last processed series.
targetF = 'target'
baseTrain = train_subDf[['ref']].to_numpy()
baseTest  = test_subDf[['ref']].to_numpy()
baseVal   = val_subDf[['ref']].to_numpy()

X_train = train_subDf[allF]
y_train = train_subDf[targetF].to_numpy()
X_test  = test_subDf[allF]
y_test  = test_subDf[targetF].to_numpy()
X_val   = val_subDf[allF]
y_val   = val_subDf[targetF].to_numpy()

# calcLossArima takes exactly (pred, y, logTransform); the former call passed an
# additional `base` argument and raised TypeError.
print('arima errors: ',
      calcLossArima(train_subDf.ref, train_subDf.logSales, logTransform),
      calcLossArima(test_subDf.ref, test_subDf.logSales, logTransform),
      calcLossArima(val_subDf.ref, val_subDf.logSales, logTransform))

In [None]:
# One date-indexed plot of sales, promotion and store-closed flag per subset.
train_subDf[['sales','onpromotion','store_closed','date']].set_index('date').plot(), test_subDf[['sales','onpromotion','store_closed','date']].set_index('date').plot(),val_subDf[['sales','onpromotion','store_closed','date']].set_index('date').plot()

# analyze residual

In [None]:
# Back-transform: sales = exp(LGBM residual prediction + ARIMA log prediction) - 1.
# NOTE(review): the 'LGBMpred' column is only created in a cell further down
# (after the LGBM fit), so this cell fails when the notebook is run top to bottom.
train_bigDf['salesPred'] = np.exp(train_bigDf['LGBMpred'] + train_bigDf['salesArima'])-1
test_bigDf['salesPred'] = np.exp(test_bigDf['LGBMpred'] + test_bigDf['salesArima'])-1
val_bigDf['salesPred'] = np.exp(val_bigDf['LGBMpred'] + val_bigDf['salesArima'])-1

In [None]:
# Observed vs. predicted sales over all training rows.
train_bigDf[['sales','salesPred']].plot()

In [None]:
# Squared log error per row, then the root of its mean per subset.
train_bigDf['err'] = (np.log(train_bigDf.sales+1) - np.log(train_bigDf.salesPred + 1))**2
test_bigDf['err'] = (np.log(test_bigDf.sales+1) - np.log(test_bigDf.salesPred + 1))**2
val_bigDf['err'] = (np.log(val_bigDf.sales+1) - np.log(val_bigDf.salesPred + 1))**2
np.sqrt(train_bigDf['err'].mean()),np.sqrt(test_bigDf['err'].mean()),np.sqrt(val_bigDf['err'].mean())

In [None]:
# Single series (store 32, family 11) in val_bigDf.
val_bigDf.loc[(val_bigDf.store_nbr == 32) & (val_bigDf.family==11)][['sales','salesPred']].plot()

In [None]:
# Same series in test_bigDf.
test_bigDf.loc[(test_bigDf.store_nbr == 32) & (test_bigDf.family==11)][['sales','salesPred']].plot()

In [None]:
# Same series in train_bigDf.
train_bigDf.loc[(train_bigDf.store_nbr == 32) & (train_bigDf.family==11)][['sales','salesPred']].plot()

In [None]:
# Use the prediction as 'sales' for the rows to submit and clip negatives to zero.
val_bigDf['sales'] = val_bigDf['salesPred']
val_bigDf['sales'] = np.clip(val_bigDf.sales, 0, 1e30)

In [None]:
# Write the id-indexed sales predictions to CSV.
val_bigDf[['sales','id']].set_index('id').to_csv('allData_arima_lgbm_pred_nLeaves20.csv')

In [None]:
# NOTE(review): test_stationarity is not defined in this notebook — presumably
# it comes from the `baseFunctions` star import; confirm the argument meaning there.
p_value = test_stationarity(train_subDf.target, 7, True)

In [None]:
# Same check on the 21-step differenced residual (leading NaNs dropped).
p_value = test_stationarity(train_subDf.target.diff(21).dropna(), 14, True)

In [None]:
# Periodogram of the 21-step differenced residual (leading NaNs set to 0).
# NOTE(review): plot_periodogram is presumably provided by baseFunctions — confirm.
plot_periodogram(train_subDf.target.diff(21).fillna(0), 365)

In [None]:
# Time-series diagnostics of the 16-step differenced residual, 100 lags.
# NOTE(review): tsplot is presumably provided by baseFunctions — confirm.
tsplot(train_subDf.target.diff(16).fillna(0), lags=100)

In [None]:
# Additive seasonal decomposition of the log sales with a period of 365 samples.
dec = sm.tsa.seasonal_decompose(train_subDf.logSales,period = 365, model = 'additive')
dec.plot()

In [None]:
# 'seasonal' holds the seasonal component scaled by its maximum;
# 'res_seasonalAnalysis' is the log sales minus the unscaled seasonal component.
train_subDf['seasonal'] = dec.seasonal
train_subDf.seasonal = train_subDf.seasonal / max(train_subDf.seasonal)
train_subDf['res_seasonalAnalysis'] = train_subDf.logSales - dec.seasonal

In [None]:
# Scaled seasonal component over the training rows.
train_subDf.seasonal.plot()

In [None]:
# Periodogram of the de-seasonalised log sales.
plot_periodogram(train_subDf.res_seasonalAnalysis, 365)

In [None]:
# Stationarity check of the de-seasonalised log sales.
p_value = test_stationarity(train_subDf.res_seasonalAnalysis, 14, True)

# LGBM fit
- doesn't really matter if t-1 or t-16

In [None]:
def calcLossLGBM(pred, y):
    """
    Root mean squared error after mapping both inputs through ``exp(x) - 1``.

    Both arguments are reshaped to column vectors with ``pred.shape[0]`` rows
    before the back-transform, so they must hold the same number of values.
    """
    nRows = pred.shape[0]
    columnShape = (nRows, 1)

    predicted = np.exp(np.reshape(pred, columnShape)) - 1
    observed = np.exp(np.reshape(y, columnShape)) - 1

    squaredError = (predicted - observed) ** 2
    return np.sqrt(np.mean(squaredError))
def mse(pred,y):
    """Root of the mean squared difference between ``pred`` and ``y`` (an RMSE, despite the name)."""
    squaredError = (pred - y) ** 2
    meanSquaredError = np.mean(squaredError)
    return np.sqrt(meanSquaredError)
def calcLossLGBMArima1(pred, sales, arima, predictDiff):
    """
    RMSLE of a log-space prediction against sales given in their original scale.

    With ``predictDiff`` set, ``pred`` is treated as a residual and ``arima`` is
    added to it (both as column vectors) before the ``exp(x) - 1`` back-transform.
    Negative back-transformed predictions are clipped to zero.
    """
    logPrediction = pred
    if predictDiff:
        columnShape = (pred.shape[0], 1)
        logPrediction = np.reshape(pred, columnShape) + np.reshape(arima, columnShape)

    predictedSales = np.exp(logPrediction) - 1
    if (predictedSales < 0).any():
        predictedSales = np.clip(predictedSales, 0, 1e20)

    return np.sqrt(mean_squared_log_error(predictedSales, sales))
def calcLossLGBMArima2(pred, y, arima, predictDiff=True):
    """
    RMSLE between a log-space prediction and a log-space target.

    Parameters
    ----------
    pred, y : array-like
        Prediction and target. With ``predictDiff`` they are residuals relative
        to ``arima``; otherwise they are complete log(sales + 1) values.
    arima : array-like
        ARIMA baseline in log space; only used when ``predictDiff`` is true.
    predictDiff : bool, default True
        Add ``arima`` to both ``pred`` and ``y`` before the back-transform. The
        default keeps the three-argument calls in this notebook working; they
        score residual predictions and previously raised TypeError.

    Returns
    -------
    float
        Root mean squared log error in the original sales scale.
    """
    if predictDiff:
        columnShape = (pred.shape[0], 1)
        baseline = np.reshape(arima, columnShape)
        pred = np.reshape(pred, columnShape) + baseline
        y = np.reshape(y, columnShape) + baseline

    # Clip at zero: the metric rejects negative values, and exp(x) - 1 can end
    # up marginally below zero through floating-point round-off.
    a = np.clip(np.exp(pred) - 1, 0, 1e20)
    y = np.clip(np.exp(y) - 1, 0, None)

    rmsleTrain = np.sqrt(mean_squared_log_error(a, y))
    return rmsleTrain
def plotLGBM(i, logTransform, salesDomain, pred, y, base, horizon=16):
    """
    Plot target, model prediction and ARIMA baseline for one forecast window.

    Parameters
    ----------
    i : int
        Position of the first sample of the window.
    logTransform : bool
        Map all three curves through ``exp(x) - 1`` before plotting.
    salesDomain : bool
        Add ``base`` to ``pred`` and ``y`` first (they are residuals).
    pred, y, base : array-like
        Prediction, target and baseline with the same number of rows.
    horizon : int, default 16
        Number of samples shown, starting at ``i``.
    """
    window = slice(i, i + horizon)

    if salesDomain:
        columnShape = (pred.shape[0], 1)
        pred = np.reshape(pred, columnShape) + np.reshape(base, columnShape)
        y = np.reshape(y, columnShape) + np.reshape(base, columnShape)

    if logTransform:
        a = np.exp(pred[window]) - 1
        y = np.exp(y[window]) - 1
        arima = np.exp(base[window]) - 1
    else:
        a = pred[window]
        y = y[window]
        arima = base[window]
    x = range(len(a))

    fig, axs = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))
    axs.plot(x, y, color='blue',label='Original')
    axs.plot(x, a, color='red',label='pred')
    axs.plot(x, arima, color='orange',label='arima')
    axs.set_title('index: '+str(i))
    # The labels above are only shown once a legend is drawn.
    axs.legend()
    fig.subplots_adjust(hspace=0.5)


In [None]:
 # compare against base lgbm that just predicts always t+16

# NOTE(review): this cell uses X_train / y_train / X_test / y_test / X_pred /
# baseTrain / baseTest / predictDiff from whichever split cell ran last. The
# all-data split and the per-group split both assign these names, and the
# de-z-scaling below only fits the all-data split (train_bigDf / test_bigDf).

# Set parameters for LGBM model
params = {
    'boosting':'gbdt',#'gbdt', #'rf' #'dart'
    'objective': 'regression',  # Assuming you're doing regression
    'metric': 'mse',  # Mean squared error
    'num_leaves': 20,
    #'lambda_l1': 0.1,
    #'lambda_l2': 0.2,
    #'max_depth':10,
    #'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'force_col_wise':True,
}   

# Train the model
gbm = lgb.train(params, lgb.Dataset(X_train, label=y_train), 2000,valid_sets=[
    lgb.Dataset(X_test, label=y_test)
    #,lgb.Dataset(X_val, label=y_val)
    ]#,num_boost_round=100
,callbacks=[lgb.early_stopping(stopping_rounds=500)]
)  
predtrainLGBM = gbm.predict(X_train)
predtestLGBM = gbm.predict(X_test)
predvalLGBM = gbm.predict(X_pred)

#de-zscale
predtrainLGBM = (predtrainLGBM * train_bigDf.stdTarget) + train_bigDf.meanTarget
predtestLGBM = (predtestLGBM * test_bigDf.stdTarget) + test_bigDf.meanTarget
predvalLGBM = (predvalLGBM * val_bigDf.stdTarget) + val_bigDf.meanTarget

# NOTE(review): y_train / y_test are rebound to their de-z-scaled values here, so
# re-running this cell (or any later training cell) fits on the de-scaled target.
y_train = (y_train * train_bigDf.stdTarget) + train_bigDf.meanTarget
y_test = (y_test * test_bigDf.stdTarget) + test_bigDf.meanTarget

#print('arima errors: ', calcLossArima(train_subDf.ref, train_subDf.logSales, logTransform), calcLossArima(test_subDf.ref, test_subDf.logSales, logTransform))#, calcLossArima(val_subDf.ref, val_subDf.logSales, logTransform, baseTest))
print('arima errors: ', calcLossArima(train_bigDf.ref, train_bigDf.logSales, logTransform), calcLossArima(test_bigDf.ref, test_bigDf.logSales, logTransform))
#print('lgbm errors: ', calcLossLGBM(predtrainLGBM, y_train), calcLossLGBM(predtestLGBM, y_test))#, calcLossLGBM(predvalLGBM, y_val))

print('sales errors: ', mse(predtrainLGBM, y_train), mse(predtestLGBM, y_test))
print('sales errors: ', calcLossLGBMArima2(predtrainLGBM, y_train, baseTrain, predictDiff), calcLossLGBMArima2(predtestLGBM, y_test, baseTest,predictDiff))#, calcLossLGBMArima2(predvalLGBM, y_val,baseVal))
#print('sales errors: ', calcLossLGBMArima1(predtrainLGBM, train_subDf.sales, baseTrain), calcLossLGBMArima1(predtestLGBM, test_subDf.sales, baseTest), calcLossLGBMArima1(predvalLGBM, val_subDf.sales,baseVal))


In [None]:
# Number of NaN targets in the train and test subsets.
y_train.isna().sum(),y_test.isna().sum()

In [None]:
# Error report only. The inverse z-scaling of predtrainLGBM / predtestLGBM /
# predvalLGBM and of y_train / y_test is already applied at the end of the
# training cell; this cell used to repeat it, which applied `x * std + mean` a
# second time and corrupted every value used afterwards.
print('arima errors: ', calcLossArima(train_bigDf.ref, train_bigDf.logSales, logTransform), calcLossArima(test_bigDf.ref, test_bigDf.logSales, logTransform))

print('sales errors: ', mse(predtrainLGBM, y_train), mse(predtestLGBM, y_test))
print('sales errors: ', calcLossLGBMArima2(predtrainLGBM, y_train, baseTrain, predictDiff), calcLossLGBMArima2(predtestLGBM, y_test, baseTest, predictDiff))


In [None]:
# Hand-kept log of earlier runs: each line is the printed (train, test) error,
# followed by the author's own run label.
#sales errors:  0.37130709804425716 0.3891904372498804    2000/10
#sales errors:  0.40056093868328896 0.3921714758330452    2000/5
#sales errors:  0.3770976302581994 0.3904403544628816
#sales errors:  0.35511432092326384 0.387162551419242     2000/20  (1993)
#sales errors:  0.3618022418342807 0.3882029763030535     2000/30 (783) -> 0.42

print('sales errors: ', mse(predtrainLGBM, y_train), mse(predtestLGBM, y_test))#, calcLossLGBMArima2(predvalLGBM, y_val,baseVal))
#sales errors:  0.43143281396561584 0.41506468712898    500   / 5 max leaves
#sales errors:  0.41479865940305194 0.4113797188055782  1000
#sales errors:  0.400785079081174 0.4055445445983433    2000(1979)
#sales errors:  0.38301045799101135 0.4009700741045326  2000/10 (1299) -> 0.458 in pred set
#sales errors:  0.38265366510147164 0.4025353319230588  2000/15 (795)

In [None]:
# Hand-kept log of earlier runs (train error, test error, run label).
#sales errors:  0.37130709804425716 0.3891904372498804    2000/10
#sales errors:  0.40056093868328896 0.3921714758330452    2000/5
# The next line was pasted output without the leading '#', which made the cell a SyntaxError.
#sales errors:  0.3770976302581994 0.3904403544628816

print('sales errors: ', mse(predtrainLGBM, y_train), mse(predtestLGBM, y_test))#, calcLossLGBMArima2(predvalLGBM, y_val,baseVal))
#sales errors:  0.43143281396561584 0.41506468712898    500   / 5 max leaves
#sales errors:  0.41479865940305194 0.4113797188055782  1000
#sales errors:  0.400785079081174 0.4055445445983433    2000(1979)
#sales errors:  0.38301045799101135 0.4009700741045326  2000/10 (1299) -> 0.458 in pred set
#sales errors:  0.38265366510147164 0.4025353319230588  2000/15 (795)

In [None]:
# Does the exp(x) - 1 back-transform give any negative value for predictions or targets?
((np.exp(predtrainLGBM)-1)<0).any(), ((np.exp(y_train)-1)<0).any()

In [None]:
# Back-transformed training target.
np.exp(y_train)-1

In [None]:
# Training predictions as currently held in memory.
predtrainLGBM

In [None]:
# Attach the LGBM predictions to the three subsets.
# NOTE(review): the "analyze residual" cells above read this column, although they
# appear earlier in the notebook.
train_bigDf['LGBMpred'] = predtrainLGBM
test_bigDf['LGBMpred'] = predtestLGBM
val_bigDf['LGBMpred'] = predvalLGBM

In [None]:
# Sales for the rows to submit: exp(LGBM residual + ARIMA log prediction) - 1.
val_bigDf['sales'] = np.exp(val_bigDf.LGBMpred+ val_bigDf.salesArima)-1

In [None]:
#sales errors:  0.3978436189821838 0.4313888853367467
# The predictions are residuals on top of the ARIMA baseline, so the baseline is
# added back (predictDiff=True). The argument was missing before, which raised
# TypeError because calcLossLGBMArima2 is defined with four required parameters.
print('sales errors: ', calcLossLGBMArima2(predtrainLGBM, y_train, baseTrain, True), calcLossLGBMArima2(predtestLGBM, y_test, baseTest, True))#, calcLossLGBMArima2(predvalLGBM, y_val,baseVal))

In [None]:
# Plot the first 16-step window of each subset.
# NOTE(review): y_val and baseVal are only assigned by the per-group pipeline
# cell, and `logTransformLoc` is set but never used (the calls pass the global
# `logTransform`) — confirm which was intended.
for i in range(1):
    salesDomaine = True
    logTransformLoc = True
    plotLGBM(i*16, logTransform, salesDomaine,predtrainLGBM, y_train, baseTrain)
    plotLGBM(i*16, logTransform, salesDomaine,predtestLGBM, y_test, baseTest)
    plotLGBM(i*16, logTransform, salesDomaine,predvalLGBM, y_val, baseVal)

In [None]:
# Feature importance of the fitted booster by gain and by split count, sorted by gain.
importance_df = (
    pd.DataFrame({
        'feature_name': gbm.feature_name(),
        'importance_gain': gbm.feature_importance(importance_type='gain'),
        'importance_split': gbm.feature_importance(importance_type='split'),
    })
    .sort_values('importance_gain', ascending=False)
    .reset_index(drop=True)
)
# Show every row of the table (this changes the display option for the whole session).
pd.set_option('Display.max_rows', None)
importance_df

### cross validation

In [None]:
# Row weights derived from the promotion column; the offset of 1 keeps every
# weight above zero. LightGBM expects a 1-D weight vector, so the values are
# kept flat instead of being reshaped to a column.
sample_weights = ((train_bigDf.onpromotion * train_bigDf.onpromotion.std()) + train_bigDf.onpromotion.mean() + 1).to_numpy()
sample_weights = sample_weights / sample_weights.max()

nsplits=5
num_iter=500

kf = KFold(n_splits=nsplits, shuffle=False)# , random_state=42) #random doesn't help
splits = kf.split(X_train,y_train)

cv_results = lgb.cv(
    params,
    lgb.Dataset(X_train, label=y_train
    , weight=sample_weights
    ),
    num_boost_round=num_iter,
    folds=splits,
    stratified=False,  # Set to True for stratified sampling in classification
    #early_stopping_rounds=50,  # Stop if score doesn't improve for 50 rounds
    metrics=['mse'],  # Evaluation metrics to track
    seed=42,  # Set a seed for reproducibility
    return_cvbooster=True
    ,callbacks=[lgb.early_stopping(stopping_rounds=400)]
)

lenIter = len(cv_results['valid l2-mean'])
print(cv_results['valid l2-mean'][lenIter-1], cv_results['valid l2-stdv'][lenIter-1])
for i in range(nsplits):
    predtrainLGBM = cv_results['cvbooster'].boosters[i].predict(X_train)
    predtestLGBM = cv_results['cvbooster'].boosters[i].predict(X_test)
    #predvalLGBM = cv_results['cvbooster'].boosters[i].predict(X_val)

    # Residual predictions: add the ARIMA baseline back (predictDiff=True). The
    # argument was missing before and the call raised TypeError.
    print('sales errors: ', calcLossLGBMArima2(predtrainLGBM, y_train, baseTrain, True), calcLossLGBMArima2(predtestLGBM, y_test, baseTest, True))#, calcLossLGBMArima2(predvalLGBM, y_val,baseVal))


In [None]:
# Train / test error of every cross-validation booster.
for i in range(nsplits):
    foldBooster = cv_results['cvbooster'].boosters[i]
    predtrainLGBM = foldBooster.predict(X_train)
    predtestLGBM = foldBooster.predict(X_test)

    # Residual predictions: add the ARIMA baseline back (predictDiff=True). The
    # argument was missing before and the call raised TypeError.
    print('sales errors: ', calcLossLGBMArima2(predtrainLGBM, y_train, baseTrain, True), calcLossLGBMArima2(predtestLGBM, y_test, baseTest, True))


# Multistep prediction

In [None]:
# Build overlapping windows of n_sequence consecutive rows from the per-group frame.
# NOTE(review): modifiedDf and allF come from the per-group pipeline cell; X_train
# must also be the per-group split for `trainEnd` to be a valid window position.
X = modifiedDf[allF]
y = modifiedDf['target']
ref = modifiedDf['ref']
n_sequence = 16


# The window spans all feature columns, so the second result axis has length 1 and is dropped.
X0 = np.lib.stride_tricks.sliding_window_view(X, (n_sequence, len(allF)))[:,0,:,:]
y0 = np.lib.stride_tricks.sliding_window_view(y, (n_sequence))#[:,0,:,:]
ref0 = np.lib.stride_tricks.sliding_window_view(ref, (n_sequence))#[:,0,:,:]

# Split by window position: the first X_train.shape[0] windows are train, the
# very last window is "val", everything in between is test.
trainEnd = X_train.shape[0]
testEnd = X0.shape[0]-1
predEnd = X0.shape[0]

X_trainM= X0[0:trainEnd,:,:]
X_testM = X0[trainEnd:testEnd,:,:]
X_valM =  X0[testEnd:predEnd,:,:]
y_trainM= y0[0:trainEnd,:]
y_testM = y0[trainEnd:testEnd,:]
y_valM =  y0[testEnd:predEnd,:]
base_trainM= ref0[0:trainEnd,:]
base_testM = ref0[trainEnd:testEnd,:]
base_valM =  ref0[testEnd:predEnd,:]

### lgbm

In [None]:
# arima                   arima errors:  0.6413825470701051  0.5897267038139773  0.5635773590686141
# t-16 single prediction: sales errors:  0.5693091354085361  0.5790932252858579  0.5789745678057521
# model for every timestamp:            (0.4886254287678627, 0.5553120246259722, 0.5473653335392659)

In [None]:
def calcMultiLossLGBMArima2(pred, y, arima):
    """Return the RMSLE between predictions and targets after undoing the baseline shift.

    `pred` and `y` are offsets relative to `arima`. Each is added to the
    baseline, mapped back through exp(x) - 1, and the two results are compared
    with the root of `mean_squared_log_error`.

    Parameters:
        pred:  array of predicted offsets; its shape is the reference shape.
        y:     true offsets, reshaped to `pred.shape`.
        arima: baseline values, reshaped to `pred.shape`.

    Returns:
        The root mean squared log error as a float.
    """
    shape = pred.shape
    baseline = np.reshape(arima, shape)

    predSales = np.exp(np.reshape(pred, shape) + baseline) - 1
    trueSales = np.exp(np.reshape(y, shape) + baseline) - 1

    # The log-error metric cannot take negative inputs, so negative
    # back-transformed predictions are floored at zero (capped at 1e20).
    if np.any(predSales < 0):
        predSales = np.clip(predSales, 0, 1e20)

    return np.sqrt(mean_squared_log_error(predSales, trueSales))

In [None]:
# LightGBM settings shared by all per-step models.
params = {
    'objective': 'regression',
    'metric': 'mse',  # mean squared error
    'num_leaves': 5,
    'learning_rate': 0.04,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}

# Flatten every (n_sequence, n_features) window into a single feature row.
X_train0 = np.reshape(X_trainM, (X_trainM.shape[0], -1))
X_test0 = np.reshape(X_testM, (X_testM.shape[0], -1))
X_val0 = np.reshape(X_valM, (X_valM.shape[0], -1))

# Upper bound on boosting rounds; early stopping may end training sooner.
num_round = 300

# Train one booster per forecast step i, early-stopped on the test split.
gbms = [
    lgb.train(
        params,
        lgb.Dataset(X_train0, label=y_trainM[:, i]),
        num_round,
        valid_sets=[lgb.Dataset(X_test0, label=y_testM[:, i])],
        callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )
    for i in range(n_sequence)
]

# Per-step predictions, each taken at the booster's best iteration.
predtrainLGBM = [gbm.predict(X_train0, num_iteration=gbm.best_iteration) for gbm in gbms]
predtestLGBM = [gbm.predict(X_test0, num_iteration=gbm.best_iteration) for gbm in gbms]
predvalLGBM = [gbm.predict(X_val0, num_iteration=gbm.best_iteration) for gbm in gbms]

# Report per-step errors. The loops run over n_sequence (the number of trained
# models) instead of a hard-coded 16, so they stay in sync with the models above.
for i in range(n_sequence):
    print(i,'lgbm errors: ', calcLossLGBM(predtrainLGBM[i], y_trainM[:,i]), calcLossLGBM(predtestLGBM[i], y_testM[:,i]), calcLossLGBM(predvalLGBM[i], y_valM[:,i]))
for i in range(n_sequence):
    print(i,'sales errors: ', calcLossLGBMArima2(predtrainLGBM[i], y_trainM[:,i], base_trainM[:,i]), calcLossLGBMArima2(predtestLGBM[i], y_testM[:,i], base_testM[:,i]), calcLossLGBMArima2(predvalLGBM[i], y_valM[:,i],base_valM[:,i]))


In [None]:
# Joint RMSLE over all forecast steps for the train, test and validation splits.
tuple(
    calcMultiLossLGBMArima2(np.column_stack(stepPreds), targets, baseline)
    for stepPreds, targets, baseline in (
        (predtrainLGBM, y_trainM, base_trainM),
        (predtestLGBM, y_testM, base_testM),
        (predvalLGBM, y_valM, base_valM),
    )
)

# create predictions

In [None]:
""" create training data based on lagged features not 2 sequences """
# Row-level input features passed to the model.
# Currently disabled candidates: 'store_nbr', 'family', 'sales', 'dataT',
# 'city', 'state', 'type', 'cluster', 'transactions'.
trainF = [
    'onpromotion',
    'dcoilwtico',
    'holidayType',
    'description',
    'transferred',
    'store_closed',
]

# Time-derived input features.
timeF = [
    'linear_time',
    'day_of_year',
    'day_of_year_f12_0',
    'day_of_year_f104_0',
    'day_of_year_f24_0',
    'day_of_year_f52_0',
    'day_of_year_f12_180',
    'day_of_year_f104_180',
    'day_of_year_f24_180',
    'day_of_year_f52_180',
    'weekday',
    'month',
]

# Lag configuration: lag k (k = 1..lags) looks back initial_lag + k rows.
initial_lag = 16  # 16 = independent from previous predictions
lags = 10  # alternative tried: 21
rolling = [7]  # window sizes for rolling mean / std of the lag features

# Training rows dated on or after this day are held out as the test split.
date_string_test = "2017-07-01"

# Use the full history; e.g. data.loc[data.date > "2015-07-01"] would restrict it.
data0 = data

predictions = []  # one frame with columns ['id', 'sales'] per store
log = {}          # familyId -> {storeId: {'testL': ..., 'trainL': ...}}


for familyId in [0]:  # start with only some families (data.family.unique() for all)
    print(familyId)
    familyDf = data0.loc[data0.family == familyId]
    log1 = {}

    # Iterate the stores of the same frame the rows are taken from.
    for storeId in data0.store_nbr.unique():
        # Explicit copy: columns are added below, and writing into a frame
        # derived by boolean indexing would raise SettingWithCopyWarning.
        storeDf = familyDf.loc[familyDf.store_nbr == storeId].copy()

        # ln transformation: log(1 + sales)
        storeDf.loc[:, ['logSales']] = np.log(storeDf.sales + 1)

        # Fit ARIMA on the log sales of the training rows.
        f = 'logSales'  # series modelled by ARIMA
        y_trainArima = storeDf.loc[storeDf.dataT == 'train'][[f, 'date']]
        y_trainArima = y_trainArima.set_index('date')
        y_trainArima.index = pd.DatetimeIndex(y_trainArima.index).to_period('D')
        model = ARIMA(y_trainArima, order=(5, 1, 5))
        model_fit = model.fit()

        # In-sample fitted values followed by a 16-step forecast, assigned by
        # position. NOTE(review): this assumes storeDf holds the training rows
        # in date order followed by exactly 16 rows to predict - confirm.
        arimaSalesLogT = np.array(model_fit.fittedvalues)
        arimaSalesLogT = np.concatenate((arimaSalesLogT, model_fit.forecast(steps=16)), axis=0)
        storeDf.loc[:, ['salesArima']] = arimaSalesLogT
        storeDf.loc[:, ['ref']] = storeDf['salesArima']
        # LightGBM learns the residual of the log sales w.r.t. the ARIMA baseline.
        storeDf.loc[:, ['target']] = storeDf['logSales'] - storeDf['ref']

        # Lag features: how many past data points we train on.
        featuresForLag = ['target']
        lagF = []
        for i in range(lags):
            lag = i + 1 + initial_lag
            newF = [feat + '_lag' + str(lag) for feat in featuresForLag]
            lagF = lagF + newF
            storeDf.loc[:, newF] = storeDf[featuresForLag].shift(lag).to_numpy()

        # Rolling mean / std over each lagged target column.
        rollingF = []
        for rol in rolling:
            for lagName in lagF:
                if 'target' in lagName:
                    fm = lagName + '_rollingM' + str(rol)
                    fs = lagName + '_rollingS' + str(rol)
                    rollingF.append(fm)
                    rollingF.append(fs)
                    storeDf.loc[:, [fm]] = storeDf[lagName].rolling(rol).mean()
                    storeDf.loc[:, [fs]] = storeDf[lagName].rolling(rol).std()

        allF = lagF + rollingF + timeF + trainF

        # Drop the leading rows, where the shifted / rolling features are still NaN.
        storeDf = storeDf.iloc[lags + initial_lag + max(rolling) + 1:]
        train_subDf = storeDf.loc[storeDf.date < date_string_test]
        test_subDf = storeDf.loc[(storeDf.date >= date_string_test) & (storeDf.dataT == 'train')]
        # Copy because the 'sales' column of this frame is overwritten below.
        pred_subDf = storeDf.loc[storeDf.dataT == 'test'].copy()

        targetF = 'target'
        baseTrain = train_subDf[['ref']].to_numpy()
        baseTest = test_subDf[['ref']].to_numpy()

        X_train = train_subDf[allF].to_numpy()
        y_train = train_subDf[[targetF]].to_numpy()
        X_test = test_subDf[allF].to_numpy()
        y_test = test_subDf[[targetF]].to_numpy()

        baseSub = pred_subDf[['ref']].to_numpy()
        X_sub = pred_subDf[allF].to_numpy()

        # Parameters for the LGBM model
        params = {
            'boosting': 'gbdt',  # alternatives: 'rf', 'dart'
            'objective': 'regression',
            'metric': 'mse',  # mean squared error
            'num_leaves': 15,
            'verbose': -1,
            'force_col_wise': True,
            'num_iterations': 200
        }

        # Train the model, early-stopped on the held-out test split.
        gbm = lgb.train(
            params,
            lgb.Dataset(X_train, label=y_train),
            valid_sets=[lgb.Dataset(X_test, label=y_test)],
            callbacks=[lgb.early_stopping(stopping_rounds=30)]
        )
        predtrainLGBM = gbm.predict(X_train)
        predtestLGBM = gbm.predict(X_test)
        logTransform = True
        predictDiff = True

        lossTrain = calcLossLGBM(predtrainLGBM, y_train, logTransform, predictDiff, base=baseTrain)
        lossTest = calcLossLGBM(predtestLGBM, y_test, logTransform, predictDiff, baseTest)
        print('errors: ', lossTrain, lossTest)

        log2 = {}
        log2['testL'] = lossTest
        log2['trainL'] = lossTrain
        log1[storeId] = log2

        # Residual prediction + ARIMA baseline gives the predicted log(1 + sales).
        predvalLGBM = gbm.predict(X_sub)
        pred = np.reshape(predvalLGBM, baseSub.shape) + baseSub
        # Invert the log transform. Sales cannot be negative, so floor the
        # result at zero before it is written to the submission.
        salesPred = np.clip(np.exp(pred) - 1, 0, None)

        pred_subDf.loc[:, ['sales']] = salesPred
        predictions.append(pred_subDf[['id', 'sales']])

    print(log1)
    log[familyId] = log1

            



In [None]:
# Inspect the per-store train/test losses of the most recently processed family.
log1

In [None]:
# Stack the per-store prediction frames (columns 'id' and 'sales') into one frame.
predDf2 = pd.concat(predictions)

In [None]:
# Load a previously saved prediction file.
# NOTE(review): presumably the ARIMA-only predictions with 'id' and 'sales' columns - confirm.
a = pd.read_csv('simpleArima_logT.csv')

In [None]:
# Display the loaded frame.
a

In [None]:
# Index both frames by 'id' so they can be joined on it.
predDf = predDf2.set_index('id')
b = a.set_index('id')

In [None]:
# Outer join on 'id' keeps ids present in either frame; the clashing 'sales'
# columns become 'sales_x' (from predDf) and 'sales_y' (from b).
c = pd.merge(predDf, b, on='id', how='outer')

In [None]:
# Prefer the new predictions (sales_x); fall back to the loaded file (sales_y) where they are missing.
c['sales'] = c['sales_x'].fillna(c['sales_y'])

In [None]:
# Keep only the combined 'sales' column.
d = c.drop(['sales_x','sales_y'], axis =1)

In [None]:
# Write the combined predictions to a CSV file.
d.to_csv('arima_prediction_and_updatedFam0WithLGBM_useFullData.csv')