# Goals
1. predict for every store individually
- make stationary target by diff, yes/no?
- z score normalization on train data
- predict next 16 values directly vs recursively?
2. predict store individually but with every pair/family as parameter
- needs zscore normalization
- stationary target yes/no?
3. predict all store/family pairs simultaneously
- zscore? maybe not needed
- stationary?

features:
1. time features:
- linear timestamp
- sin/cos of year, check for (week/month) if pattern present
- encoding of weekday, maybe also month
2. oil/holidays/location should be ok


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime
import plotly.offline as pyo
from plotly import subplots
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.tsa.api as smt

from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

import numpy as np
np.random.seed(42)

import tensorflow as tf
tf.random.set_seed(42)
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
#from tf.keras.layers import LSTM, Dense,Input,concatenate
from tensorflow.keras.layers import *
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam

from baseFunctions import *
from data_helpers import processData6

import lightgbm as lgb
import xgboost as xgb
import time

In [None]:
data, propDicts, flippedPropDicts = processData6()

### check transactions
-> where `transactions` is NaN, the store was closed on that day!
- for the test data we have to assume the stores are open all the time

### check why some products have a weird shape (produce)
- some correlation to on promotion, but nothing obvious
- let's ignore all data previous to july 2015

In [None]:
# Overview of one store/family pair (store 2, family 19) on three stacked,
# x-linked subplots: sales (row 1), holiday/closed flags (row 2),
# promotion/oil/transactions (row 3).
a = data.loc[(data.store_nbr == 2) & (data.family == 19)]
fig = subplots.make_subplots(rows=3, cols=1, shared_xaxes='all')
# (column, legend name, subplot row) in the order the traces are added
_traces = [
    ('sales', 'sales', 1),
    ('onpromotion', 'promotion', 3),
    ('holidayType', 'holiday', 2),
    ('store_closed', 'closed', 2),
    ('dcoilwtico', 'oil', 3),
    ('transactions', 'transactions', 3),
]
for _column, _label, _row in _traces:
    fig.add_trace(go.Scattergl(x=a['date'], y=a[_column], name=_label), row=_row, col=1)
fig.show()

# feature engineering

aggregated data
- there is some linear trend 

In [None]:
# Sum sales over all stores/families per date and plot an additive decomposition.
dailyData = data.groupby('date')['sales'].sum()
# NOTE(review): period=12 on a per-date series looks arbitrary — the periodogram
# notes in this notebook list the weekly (7 day) cycle as a strong frequency; confirm.
# `dec` holds the return value of .plot(), not the decomposition result itself.
dec = sm.tsa.seasonal_decompose(dailyData,period = 12, model = 'additive').plot()
plt.show()

In [None]:
plot_periodogram(dailyData, 365, n_domFreq=30)

# strong frequencies (cycles per year) => time period
# 52  (weekly)        365/52  = 7 days
# 24  (semi-monthly)  365/24  = 15 days (half-month)
# 104 (half-week)     365/104 = 3.5 days
# 12  (monthly)       365/12  = 30 days
# 6   (bimonthly)     365/6   = 60 days
# 4   (quarters)      365/4   = 90 days
# 3   (thirds)        365/3   = 120 days
# 2   (half-year)     365/2   = 182 days
# 1   (yearly)        365/1   = 365 days

In [None]:
data, timeFeatures = featureEngineering(data)

In [None]:
def plotidx(ind,pred, time_index):
    """Plot the input window, the prediction and the actual values of one test sample.

    Reads the notebook globals ``test_index``, ``look_back``, ``X_test``,
    ``n_predictedValues`` and ``labels``.

    ind        -- position of the sample inside the test split
    pred       -- predictions, indexed as ``pred[ind, :]``
    time_index -- dates, sliced by position and indexed with ``test_index``

    NOTE(review): if sample i was built from rows i .. i+look_back, the dates
    used on the x-axis are shifted by look_back rows — confirm.
    """
    plt.figure(figsize=(10, 6))

    window_end = test_index[ind]
    window_start = window_end - look_back
    history_dates = time_index[window_start:window_end]
    horizon_dates = time_index[test_index][ind:ind + n_predictedValues]

    # first feature channel of the first model input is drawn as the history
    plt.plot(history_dates, X_test[0][ind, :, 0], label='Train')
    plt.plot(horizon_dates, pred[ind, :], label='Predicted')
    plt.plot(horizon_dates, labels[test_index][ind, :], label='actual test')

    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.legend()
    plt.show()
for i in range(1,4):
    plotidx(i*10,forecast,train.date)

# data cleaning / outliers
- bunch of outliers
- product families are all very similar
- stores are all very similar as well (dropouts of stores are the same)
- sometimes stores are closed! (store 18) -> use transaction data?

In [None]:
# stores that opened later than 2013
# Prints every store whose first date with a positive cumulative sales total
# is not 2013-01-02.
_train_rows = data1.loc[data1.dataT == 'train']  # loop-invariant filter, done once
for storeId in data1.store_nbr.unique():
    store = _train_rows.loc[_train_rows.store_nbr == storeId]
    example = store.groupby('date')['sales'].sum()
    b = example.cumsum()
    _with_sales = b.loc[b > 0]
    if _with_sales.empty:
        # Without this guard `.index[0]` raises IndexError for a store that never sold anything.
        print(storeId, 'no sales at all')
        continue
    idx = _with_sales.index[0]
    if str(idx) != '2013-01-02 00:00:00':
        print(storeId, idx)

In [None]:
# find products per store that weren't offered in the beginning
# For each store/family pair: report pairs without any sales, otherwise report
# the first date with sales when it is not 2013-01-02.
_is_train = data1.dataT == 'train'
for storeId in data1.store_nbr.unique():
    for family in data1.family.unique():
        store = data1.loc[_is_train & (data1.store_nbr == storeId) & (data1.family == family)]
        example = store.groupby('date')['sales'].sum()
        b = example.cumsum()
        # last cumulative value == 0 -> nothing was ever sold for this pair
        if b.tail(1).iloc[0] == 0:
            print(storeId, family,flippedPropDicts['family'][family], 'NO product available')
            continue
        idx = b.loc[b>0].index[0]
        if str(idx) != '2013-01-02 00:00:00':
            print(idx,storeId, family, flippedPropDicts['family'][family],'date: ')

In [None]:
# visualize data that has dropouts -> we need to filter outliers!!
# `shapiro` is otherwise only imported by a later cell; importing it here keeps
# this cell runnable when the notebook is executed top to bottom.
from scipy.stats import shapiro

storeId = 1
look_back = 100
n_predictedValues = 16
for familyId in data1.family.unique():
    familyDf = data1.loc[(data1.dataT == 'train') & (data1.store_nbr == storeId) & (data1.family == familyId)].copy()
    nInvalidSequences = 0
    nValidS = 0

    # Count the (look_back + n_predictedValues) windows with and without a dropout.
    # "+ 1" so the window that ends on the last row is counted as well.
    sales = familyDf['sales']
    for i in range(familyDf.shape[0] - look_back - n_predictedValues + 1):
        startS0 = i
        endS0 = startS0 + look_back
        endS1 = endS0 + n_predictedValues
        window = sales.iloc[startS0:endS1]
        cumsum = window.cumsum()
        # A window is invalid when a zero-sales row coincides with a running
        # total that did not change over the preceding 20 rows.
        isSequenceValid = ((window == 0) & (cumsum.diff(20) == 0)).sum() < 1  # newyear is included!
        if isSequenceValid:
            nValidS += 1
        else:
            nInvalidSequences += 1

    stat, p = shapiro(familyDf.sales)
    print(familyId, stat, p, 'sample is gaussian:', p > 0.05)

    # familyId > 17 already excludes family 1, so the extra `!= 1` test was redundant.
    if nInvalidSequences > 0 and familyId > 17:
        print(familyId, nInvalidSequences, nValidS, flippedPropDicts['family'][familyId])

        familyDf['cumsum0'] = familyDf.sales.cumsum()
        # Column names say "7" but the rolling window is 14 rows.
        familyDf['rolling7'] = familyDf.sales.rolling(14).mean()
        familyDf['rolling7std'] = familyDf.sales.rolling(14).std()
        # shift(1): the threshold for a row is built from the rows before it
        familyDf['rollingThreshold'] = (familyDf['rolling7'] + 5 * familyDf['rolling7std']).shift(1)

        # global threshold computed only on rows after the first sale
        offered = familyDf.loc[familyDf.cumsum0 > 0, 'sales']
        familyDf['absMean'] = offered.mean() + 5 * offered.std()

        fig = subplots.make_subplots(rows=1, cols=1, shared_xaxes='all')
        for _column in ('sales', 'rollingThreshold', 'absMean'):
            fig.add_trace(go.Scattergl(x=familyDf.date, y=familyDf[_column]), col=1, row=1)
        fig.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

from scipy.stats import shapiro

In [None]:
# Plot sales of store 1 against a rolling and a global outlier threshold.
# The loop variable used to be `id`, which shadowed the builtin for the rest of
# the notebook session.
for family_id in [20]:  # author's note next to the filter: #4 #16 #15
    familyDf = data.loc[(data.dataT == 'train') & (data.store_nbr == 1) & (data.family == family_id)].copy()
    # Column names say "7" but the rolling window is 14 rows.
    familyDf['rolling7'] = familyDf.sales.rolling(14).mean()
    familyDf['rolling7std'] = familyDf.sales.rolling(14).std()
    # shift(1): the threshold for a row is built from the rows before it
    familyDf['rollingThreshold'] = (familyDf['rolling7'] + 5 * familyDf['rolling7std']).shift(1)
    familyDf['absMean'] = familyDf.sales.mean() + 5 * familyDf.sales.std()

    fig = subplots.make_subplots(rows=1, cols=1, shared_xaxes='all')
    for _column in ('sales', 'rollingThreshold', 'absMean'):
        fig.add_trace(go.Scattergl(x=familyDf.date, y=familyDf[_column]), col=1, row=1)
    fig.show()

### find out why some data series are so wild (produce)

### data cleaning / visualizing

In [None]:
def plotSales(df, storeId : int, family :str, familyId, p_value, save=False):
    """Plot raw vs. outlier-filtered sales plus two log-transformed views.

    df       -- frame with the columns ``date``, ``sales`` and ``sales_outRem``
    storeId  -- store number, used in the title and the file name
    family   -- family name; '/' is replaced by '-' because it ends up in the file name
    familyId -- family id, used in the title and the file name
    p_value  -- value shown in the title of the first subplot
    save     -- True: write ``graphs/plot_<store>_<familyId><family>.jpg`` and close
                the figure; False: show the figure without blocking
    """
    import os  # local import, only needed for the save branch

    family = family.replace('/', '-')
    log_sales = np.log(df.sales + 1)

    # The extra `plt.figure(figsize=(15,20))` that preceded this call created an
    # empty second figure on every invocation and has been dropped.
    fig, axs = plt.subplots(nrows=3, ncols=1, figsize=(8, 6))
    axs[0].plot(df.date, df.sales, color='blue', label='Original')
    axs[0].plot(df.date, df.sales_outRem, color='red', label='out_rem')
    axs[0].set_title(str(storeId)+'  '+str(familyId) + family + ' p_value:' + str(p_value))

    axs[1].plot(df.date, log_sales, color='blue')
    axs[1].set_title('log sales')
    axs[2].plot(df.date, log_sales.diff(21), color='blue')
    axs[2].set_title('log sales, diff 3 weeks')

    fig.subplots_adjust(hspace=0.5)

    if save:
        os.makedirs('graphs', exist_ok=True)  # savefig does not create the directory
        fig.savefig('graphs/plot_'+str(storeId)+'_'+str(familyId) + family+'.jpg')
        # Saved figures are never shown, so close them; otherwise one figure
        # per store/family pair stays open in memory.
        plt.close(fig)
    else:
        plt.show(block=False)

def filterDataForOutliers(data, familyId, storeId, flippedPropDicts, render = False, saveFig = False):
    """Return the training rows of one store/family pair with outliers interpolated.

    Steps:
      1. keep the 'train' rows of the pair and drop the leading rows before the
         first sale (cumulative sales still 0),
      2. run ``test_stationarity`` on the sales (any failure -> p_value = 1e6),
      3. flag the series as "fishy" when the lowest histogram bin dominates,
      4. mark sales above both a global and a rolling threshold as NaN in
         ``sales_outRem`` and fill them by interpolation.

    data             -- frame with ``store_nbr``, ``family``, ``dataT``, ``sales``
    familyId/storeId -- pair to select
    flippedPropDicts -- ``flippedPropDicts['family'][familyId]`` gives the family name
    render           -- force the diagnostic print/plot
    saveFig          -- forwarded to ``plotSales`` as ``save``

    Returns the filtered copy with the helper columns ``cumsum0``, ``rolling7``,
    ``rolling7std``, ``rollingThreshold``, ``absMean`` and ``sales_outRem``.
    """
    selected = (data.store_nbr == storeId) & (data.family == familyId) & (data.dataT == 'train')
    a = data.loc[selected].copy()

    # filter out the period in which the product was not offered yet;
    # copy so the column assignments below do not write into a slice
    a['cumsum0'] = a.sales.cumsum()
    a = a.loc[a.cumsum0 > 0].copy()

    # check if stationary; the test is best effort, but must not swallow
    # KeyboardInterrupt/SystemExit like a bare `except:` would
    try:
        p_value = test_stationarity(a.sales, 12, True)
    except Exception:
        p_value = 1e6
    isStationary = p_value < 0.05

    # check if lots of 0s: compare the lowest bin with the other bins
    counts, _ = np.histogram(a.sales, bins=50)
    binZero = counts[0]
    # count of the next non-empty bin, -1 when every other bin is empty
    binNextZero = next((c for c in counts[1:] if c != 0), -1)
    isZeroSinglePeak = binZero > 2*binNextZero
    # true only when the lowest bin is strictly larger than the second-largest count
    significantZeroPart = binZero > np.sort(counts)[::-1][1]
    fishy = significantZeroPart and isZeroSinglePeak

    # remove outliers (column names say "7" but the rolling window is 14 rows)
    a['rolling7'] = a.sales.rolling(14).mean()
    a['rolling7std'] = a.sales.rolling(14).std()
    # shift(1): the threshold for a row is built from the rows before it
    a['rollingThreshold'] = (a['rolling7'] + 5 * a['rolling7std']).shift(1)
    a['absMean'] = a.sales.mean() + 5 * a.sales.std()
    # float copy so NaN markers can be stored even when sales is an integer column
    a['sales_outRem'] = a.sales.astype(float)

    above_rolling = a.sales > a.rollingThreshold
    if fishy:
        # zero-heavy series: stricter global threshold and an absolute floor of 20
        outlier_mask = (a.sales > 2*a.absMean) & above_rolling & (a.sales > 20)
    else:
        outlier_mask = (a.sales > a.absMean) & above_rolling
    a.loc[outlier_mask, 'sales_outRem'] = np.nan

    hasOutliers = a.sales_outRem.isna().sum()
    a['sales_outRem'] = a.sales_outRem.interpolate(limit_direction='both')

    if fishy or render or hasOutliers:
        print(storeId, familyId, 'stationary ',isStationary, p_value, 'n_outliers: ',hasOutliers,flippedPropDicts['family'][familyId])
        plotSales(a, storeId, flippedPropDicts['family'][familyId], familyId, p_value, save=saveFig)
    return a

In [None]:
# Run the outlier filter for every store/family pair. render=True forces the
# diagnostic print/plot for each pair; saveFig=True is passed on to plotSales.
# Only the result of the last pair stays in `a`.
for storeId in data.store_nbr.unique():
    for familyId in data.family.unique():
        a = filterDataForOutliers(data, familyId, storeId, flippedPropDicts, render=True, saveFig = True)

### try augmented dickey-fuller test to find which targets have flaws

doesn't determine whether we have dropouts, but gives good info anyway
-> only use data after July 2015; earlier data seems too different
- sometimes there is even a new trend in the new year (2017)

Problems:
- lots of portions with 0s
- hard to determine which portions to drop
- not stationary
- some fat outliers

In [None]:
# Augmented Dickey-Fuller test on the sales of selected families of store 1.
# `adfuller` is not imported anywhere in the visible notebook (it may come in via
# `from baseFunctions import *`); importing it here makes the cell self-contained.
from statsmodels.tsa.stattools import adfuller

# The loop variable used to be `id`, which shadowed the builtin.
for family_id in [19]:  # other ids noted by the author: 24, 28, 30, 31
    familyDf = data.loc[(data.dataT == 'train') & (data.store_nbr == 1) & (data.family == family_id)].copy()
    familyDf = familyDf.set_index('date')
    dftest = adfuller(familyDf.sales, autolag='AIC')
    p_value = dftest[1]  # second element of the result tuple is the p-value

In [None]:
# plot distributions -> check whether a gaussian assumption works for the sales
# NOTE(review): plt.show() is commented out, so every plt.hist call draws into
# the current axes — presumably only the printed summary is wanted; confirm.
for familyId in data.family.unique():
    familyDf = data.loc[(data.dataT == 'train') & (data.store_nbr == 1) & (data.family == familyId)]
    counts, bins, patches = plt.hist(familyDf.sales, bins=50, edgecolor='black')
    binZero = counts[0]
    countsSorted = np.sort(counts)[::-1]
    # true only when the lowest bin is strictly larger than the second-largest count
    significantZeroPart = binZero > countsSorted[1]

    # best-effort stationarity test; `except Exception` (not a bare except)
    # so that interrupting the loop still works
    try:
        p_value = test_stationarity(familyDf.sales, 12, True)
    except Exception:
        p_value = 1e6
    print(familyId, p_value, 'is stationary?: ', p_value < 0.05, ' has lots of 0:', significantZeroPart)
    #plt.show()

# individual prediction

some approaches work for large values but not for small and vice versa
predicting large & small values at the same time is hard

In [None]:
flippedPropDicts['family']

In [None]:
train = data1.loc[(data1.dataT == 'train') & (data1.store_nbr == 1) & (data1.family == 4)]# & (data1.date > idx)]
fig = subplots.make_subplots(rows=2, cols=1, shared_xaxes='all')
fig.add_trace(go.Scattergl(x=train.date, y=train.sales), col=1, row = 1)
fig.add_trace(go.Scattergl(x=train.date, y=train.weekday), col=1, row = 2)
fig.add_trace(go.Scattergl(x=train.date, y=train.holidayType), col=1, row = 2)

In [None]:
cv_scores = cross_val_score(model, X, y, cv=5)

# Print the cross-validation scores
print("Cross-validation scores:", cv_scores)

print("Mean CV score:", cv_scores.mean())
print("Standard deviation of CV scores:", cv_scores.std())

### LGBM

In [None]:
# Feature columns, in the order in which they are laid out in the model input.
# Not used as features: store_nbr, family, sales, dataT, city, state, type, cluster.
_BASE_FEATURES = ['onpromotion', 'dcoilwtico', 'holidayType', 'description', 'transferred']
_TIME_FEATURES = ['linear_time', 'day_of_year', 'weekday', 'month']
# Fourier features: only the 0 degree phase of the frequencies 12, 104, 24 and 52 is active.
# Disabled variants: frequencies 1, 2, 3, 4, 6 (all phases) and the
# 60/120/180/240/300 degree phases of 12, 104, 24 and 52.
_FOURIER_FEATURES = [f'day_of_year_f{_freq}_0' for _freq in (12, 104, 24, 52)]

trainF = _BASE_FEATURES + _TIME_FEATURES + _FOURIER_FEATURES

# look-back windows additionally carry the target itself as last column
train0 = trainF + ['sales']

# Single series: store 1, family 18. Copied because the optional scaling below
# assigns to the frame, which would otherwise write into a slice of data1.
train = data1.loc[(data1.dataT == 'train') & (data1.store_nbr == 1) & (data1.family == 18)].copy() # family 18

n_predictedValues = 16
look_back = 100
zScoreNorm = False

sequence0 = []
sequence1 = []
labels = []

# zscore over all values -> not ideal bc test data
if zScoreNorm:
    mean = 0  # modified z-score: scale only, no centering
    std = max(train.sales.std(), 1)
    train['sales'] = (train.sales - mean) / std

# Column selections are loop-invariant, so take them once.
_history = train[train0]   # features + sales of the look-back window
_future = train[trainF]    # features only for the rows to be predicted
_sales = train['sales']

# "+ 1" so the newest window (the one ending on the last row) is used as well.
for i in range(train.shape[0] - look_back - n_predictedValues + 1):
    startS0 = i
    endS0 = startS0 + look_back
    endS1 = endS0 + n_predictedValues
    # allow at most one zero-sales row per window
    isSequenceValid = (_sales.iloc[startS0:endS1] == 0).sum() < 2 #newyear is included!
    if isSequenceValid:
        sequence0.append(_history.iloc[startS0:endS0].to_numpy().flatten())
        sequence1.append(_future.iloc[endS0:endS1].to_numpy().flatten())
        labels.append(_sales.iloc[endS0:endS1])
sequence0 = np.stack(sequence0, axis = 0)
sequence1 = np.stack(sequence1, axis=0)
labels    = np.stack(labels, axis = 0)

# one flat row per sample: flattened look-back window followed by the flattened horizon features
X = np.concatenate((sequence0, sequence1), axis=1)
y = labels

# Chronological 80/20 split: shuffle=False keeps the sample order (random_state has no effect then).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

#train_data = lgb.Dataset(X_train, label=y_train)
#test_data = lgb.Dataset(X_test, label=y_test)

# Set parameters for LGBM model
params = {
    'objective': 'regression',  # plain regression objective
    'metric': 'rmsle',  # NOTE(review): 'rmsle' is not among LightGBM's documented metric names — confirm it is evaluated
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}


# Train the model: num_round boosting rounds per model
num_round = 10

# One independent model per forecast step: model i is fit on column i of the labels.
gbms = [lgb.train(params, lgb.Dataset(X_train, label=y_train[:, i]),num_round, valid_sets=[lgb.Dataset(X_test, label=y_test[:,i])]) for i in range(y_train.shape[1])]

# Evaluate the per-horizon models on the training and on the hold-out samples.
def _predict_horizons(models, features):
    """Stack one prediction column per horizon model and clip negatives to 0."""
    stacked = np.column_stack([m.predict(features, num_iteration=m.best_iteration) for m in models])
    if (stacked < 0).any():
        print('negative values!!!')
        stacked = np.clip(stacked, 0, 1e29)
    return stacked


forecast = _predict_horizons(gbms, X_train)
if zScoreNorm:
    # back to the original scale before computing the error
    forecast = forecast * std + mean
    y_train = y_train * std + mean
rmsleTrain = np.sqrt(mean_squared_log_error(y_train, forecast))

# The hold-out forecast was previously not clipped, unlike the training forecast
# and the per store/family cell; a negative prediction then makes
# mean_squared_log_error raise.
forecast = _predict_horizons(gbms, X_test)
if zScoreNorm:
    forecast = forecast * std + mean
    y_test = y_test * std + mean
rmsleTest = np.sqrt(mean_squared_log_error(y_test, forecast))
print('errors:  ', round(rmsleTrain,3), round(rmsleTest,3), y_train.shape[0], y_test.shape[0])

### LGBM per store/family

In [None]:
# Feature columns, in the order in which they are laid out in the model input.
# Not used as features: store_nbr, family, sales, dataT, city, state, type, cluster.
_BASE_FEATURES = ['onpromotion', 'dcoilwtico', 'holidayType', 'description', 'transferred']
_TIME_FEATURES = ['linear_time', 'day_of_year', 'weekday', 'month']
# Fourier features: only the 0 degree phase of the frequencies 12, 104, 24 and 52 is active.
# Disabled variants: frequencies 1, 2, 3, 4, 6 (all phases) and the
# 60/120/180/240/300 degree phases of 12, 104, 24 and 52.
_FOURIER_FEATURES = [f'day_of_year_f{_freq}_0' for _freq in (12, 104, 24, 52)]

trainF = _BASE_FEATURES + _TIME_FEATURES + _FOURIER_FEATURES

# look-back windows additionally carry the target itself as last column
train0 = trainF + ['sales']

n_predictedValues = 16
look_back = 100
zScoreNorm = False

errors = []
predictions = []

# LightGBM settings are identical for every store/family pair, so build them once.
params = {
    'objective': 'regression',
    # NOTE(review): 'rmsle' is not among LightGBM's documented metric names — confirm it is evaluated.
    'metric': 'rmsle',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1
}
num_round = 10


def _predict_horizons(models, features):
    """Stack one prediction column per horizon model and clip negatives to 0."""
    stacked = np.column_stack([m.predict(features, num_iteration=m.best_iteration) for m in models])
    if (stacked < 0).any():
        print('negative values!!!')
        stacked = np.clip(stacked, 0, 1e29)
    return stacked


# Train one set of per-horizon models for every store/family pair, record the
# train/hold-out RMSLE and forecast the 'test' rows of the pair.
# tic/toc1..toc6 are timing checkpoints for the profiling cell below.
for storeId in data1.store_nbr.unique():
    for familyId in data1.family.unique():
        tic = time.time()
        pair_mask = (data1.store_nbr == storeId) & (data1.family == familyId)
        # copies: both frames get a column assigned further down
        familyDf = data1.loc[(data1.dataT == 'train') & pair_mask].copy()
        compDf = data1.loc[(data1.dataT == 'test') & pair_mask].copy()
        toc1 = time.time()
        sequence0 = []
        sequence1 = []
        labels = []
        nInvalidSequences = 0

        # zscore over all values -> not ideal bc test data
        if zScoreNorm:
            mean = 0  # modified z-score: scale only, no centering
            std = max(familyDf.sales.std(), 1)
            familyDf['sales'] = (familyDf.sales - mean) / std

        history = familyDf[train0]   # features + sales of the look-back window
        future = familyDf[trainF]    # features only for the rows to be predicted
        sales = familyDf['sales']

        # "+ 1" so the newest window (the one ending on the last row) is used as well.
        for i in range(familyDf.shape[0] - look_back - n_predictedValues + 1):
            startS0 = i
            endS0 = startS0 + look_back
            endS1 = endS0 + n_predictedValues
            window = sales.iloc[startS0:endS1]
            cumsum = window.cumsum()
            # invalid when a zero-sales row coincides with a running total that
            # did not change over the preceding 20 rows
            isSequenceValid = ((window == 0) & (cumsum.diff(20) == 0)).sum() < 1 #newyear is included!
            if isSequenceValid:
                sequence0.append(history.iloc[startS0:endS0].to_numpy().flatten())
                sequence1.append(future.iloc[endS0:endS1].to_numpy().flatten())
                labels.append(sales.iloc[endS0:endS1])
            else:
                nInvalidSequences = nInvalidSequences + 1

        if len(labels) == 0:
            # nothing to train on: predict 0 for this pair
            print('no valid sequence', storeId, familyId)
            compDf['sales'] = 0
            predictions.append(compDf[['id','sales']])
            continue
        sequence0 = np.stack(sequence0, axis = 0)
        sequence1 = np.stack(sequence1, axis=0)
        labels    = np.stack(labels, axis = 0)
        toc2 = time.time()

        X = np.concatenate((sequence0, sequence1), axis=1)
        y = labels

        # chronological 80/20 split (shuffle=False)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)
        toc3 = time.time()

        # one independent model per forecast step
        gbms = [lgb.train(params, lgb.Dataset(X_train, label=y_train[:, i]),num_round, valid_sets=[lgb.Dataset(X_test, label=y_test[:,i])]) for i in range(y_train.shape[1])]
        toc4 = time.time()

        forecast = _predict_horizons(gbms, X_train)
        if zScoreNorm:
            forecast = forecast * std + mean
            y_train = y_train * std + mean
        rmsleTrain = np.sqrt(mean_squared_log_error(y_train, forecast))

        forecast = _predict_horizons(gbms, X_test)
        if zScoreNorm:
            forecast = forecast * std + mean
            y_test = y_test * std + mean
        rmsleTest = np.sqrt(mean_squared_log_error(y_test, forecast))

        print('errors:  ', round(rmsleTrain,3), round(rmsleTest,3), familyId, storeId,flippedPropDicts['family'][familyId])
        print(y_train.shape[0],y_test.shape[0],nInvalidSequences)
        toc5 = time.time()

        # Final forecast: last look_back training rows + features of the test rows.
        X_comp = np.concatenate(
            (history.tail(look_back).to_numpy().reshape(1, -1),
             compDf[trainF].to_numpy().reshape(1, -1)),
            axis=1)
        y_pred = _predict_horizons(gbms, X_comp)
        if zScoreNorm:
            y_pred = y_pred * std + mean
        # n_predictedValues instead of the former hard-coded 16
        compDf['sales'] = np.reshape(y_pred, (n_predictedValues,))
        predictions.append(compDf[['id','sales']])
        toc6 = time.time()
        errors.append({'store':storeId,'fam':flippedPropDicts['family'][familyId],'trainE':round(rmsleTrain,3), 'testE':round(rmsleTest,3),'dt':toc6-tic})

In [None]:
dt = toc6-tic
def calcTimeDiff(tic,toc,dt):
    """Print which percentage of ``dt`` the interval from ``tic`` to ``toc`` takes."""
    elapsed = toc - tic
    share_percent = elapsed * 100 / dt
    print(share_percent)

calcTimeDiff(tic,toc1,dt)
calcTimeDiff(toc1,toc2,dt)
calcTimeDiff(toc2,toc3,dt)
calcTimeDiff(toc3,toc4,dt)
calcTimeDiff(toc4,toc5,dt)
calcTimeDiff(toc5,toc6,dt)

In [None]:
# Combine the per-pair forecasts into one table ordered and indexed by id.
predDf = pd.concat(predictions)
# drop=True discards the old index directly; the former
# reset_index().drop('index', axis=1) fails when the index carries a name.
predDf = predDf.sort_values(by='id').reset_index(drop=True)
predDf = predDf.set_index('id')
predDf.to_csv("predictions_individual_lgbm.csv")

In [None]:
errorsDf = pd.DataFrame(errors)
errorsDf.to_csv('errors_predictions_individual_lgbm.csv')

In [None]:
# stores 52, 18 & 25 have issues,
# families: lawn and garden; liquor, wine, beer; school and office supplies
errorsDf.loc[(errorsDf.testE > 1) & (errorsDf.store == 52)]

In [None]:
errorsDf[['trainE','testE']].plot()

### LSTM approach
only works either for small OR for large values

In [None]:
# Feature columns, in the order in which they are laid out in the model input.
# Not used as features: store_nbr, family, sales, dataT, city, state, type, cluster.
_BASE_FEATURES = ['onpromotion', 'dcoilwtico', 'holidayType', 'description', 'transferred']
_TIME_FEATURES = ['linear_time', 'day_of_year', 'weekday', 'month']
# Fourier features: only the 0 degree phase of the frequencies 12, 104, 24 and 52 is active.
# Disabled variants: frequencies 1, 2, 3, 4, 6 (all phases) and the
# 60/120/180/240/300 degree phases of 12, 104, 24 and 52.
_FOURIER_FEATURES = [f'day_of_year_f{_freq}_0' for _freq in (12, 104, 24, 52)]

trainF = _BASE_FEATURES + _TIME_FEATURES + _FOURIER_FEATURES

# look-back windows additionally carry the target itself as last column
train0 = trainF + ['sales']

# Single series: store 1, family 18. Copied because the scaling below assigns
# to the frame, which would otherwise write into a slice of data1.
train = data1.loc[(data1.dataT == 'train') & (data1.store_nbr == 1) & (data1.family == 18)].copy() # family 18

n_predictedValues = 16
look_back = 100
zScoreNorm = True

sequence0 = []
sequence1 = []
labels = []

# zscore over all values -> not ideal bc test data
if zScoreNorm:
    mean = 0  # modified z-score: scale only, no centering
    # Same guard as in the LGBM cells: never divide by a std below 1
    # (a constant series would otherwise produce inf/NaN).
    std = max(train.sales.std(), 1)
    train['sales'] = (train.sales - mean) / std

# Column selections are loop-invariant, so take them once.
_history = train[train0]   # features + sales of the look-back window
_future = train[trainF]    # features only for the rows to be predicted
_sales = train['sales']

# Unlike the LGBM cells, the windows are kept 2-D (rows x features) and no
# window is filtered out. "+ 1" so the newest window is used as well.
for i in range(train.shape[0] - look_back - n_predictedValues + 1):
    startS0 = i
    endS0 = startS0 + look_back
    endS1 = endS0 + n_predictedValues
    sequence0.append(_history.iloc[startS0:endS0])
    sequence1.append(_future.iloc[endS0:endS1])
    labels.append(_sales.iloc[endS0:endS1])
sequence0 = np.stack(sequence0, axis = 0)
sequence1 = np.stack(sequence1, axis=0)
labels    = np.stack(labels, axis = 0)


# Seed via the Keras helper where the installed TensorFlow provides it.
# Only a missing attribute is tolerated; a bare `except:` would also hide
# unrelated errors and KeyboardInterrupt.
try:
    tf.keras.utils.set_random_seed(42)
except AttributeError:
    print('using new tf')

# NOTE: this sets the global TF seed to 0 after the call above used 42.
tf.random.set_seed(0)

n_features = len(train0)

# Two inputs: the look-back window (features + sales) and the forecast horizon,
# which has one column less because it carries the features only.
input1 = Input(shape=(look_back, n_features))
input2 = Input(shape=(n_predictedValues, n_features-1))

# one LSTM encoder per input; only the last state is kept
lstm1 = LSTM(64, activation='relu', return_sequences=False)(input1)
lstm2 = LSTM(64, activation='relu', return_sequences=False)(input2)

x = tf.keras.layers.concatenate([lstm1, lstm2])
x = Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01))(x)
# The former `if zScoreNorm: ... else: ...` built the identical layer in both
# branches, so the condition was dropped. ReLU keeps the outputs non-negative.
output = Dense(n_predictedValues, activation='relu')(x)

# Define the model
model = Model(inputs=[input1, input2], outputs=output)

# NOTE(review): `optimizer` is created but never passed to compile(); the string
# 'adam' selects a default-configured Adam instead. Passing `optimizer` would
# change the learning rate — confirm which one is intended.
optimizer = Adam(learning_rate=0.00001)
model.compile(optimizer='adam', loss='mae', metrics=[tf.keras.losses.MSLE])
#model.compile(optimizer='adam', loss=tf.keras.losses.MSLE, metrics=['mae'])

# Expanding-window cross validation over the samples (TimeSeriesSplit keeps the order).
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)

# X/y are assigned here but the loop below indexes sequence0/sequence1/labels directly.
X = [sequence0,sequence1]
y = labels

# NOTE(review): `model` is not rebuilt inside this loop, so every fold continues
# training the weights left by the previous fold — confirm this is intended.
for train_index, test_index in tscv.split(sequence0):
    X_train = [sequence0[train_index],sequence1[train_index]]
    X_test  = [sequence0[test_index], sequence1[test_index]]
    y_train, y_test = labels[train_index], labels[test_index]

    # NOTE(review): 20 epochs here, while the results log below mentions 10 epochs per split.
    model.fit(X_train, y_train, epochs=20, batch_size=32,validation_data=(X_test, y_test))


    # Errors are computed on the original scale: undo the scaling first.
    forecast = model.predict(X_train, verbose=False)
    if zScoreNorm:
        forecast = forecast *std  + mean
        y_train = y_train *std + mean
    # arguments are passed as (prediction, truth); sklearn's signature is (y_true, y_pred)
    rmsleTrain = np.sqrt(mean_squared_log_error(forecast, y_train))
    forecast = model.predict(X_test, verbose=False)
    if zScoreNorm:
        forecast = forecast *std  + mean
        y_test = y_test*std + mean
    rmsleTest = np.sqrt(mean_squared_log_error(forecast, y_test))
    # train RMSLE, hold-out RMSLE, number of train samples, number of hold-out samples
    print('errors:  ', round(rmsleTrain,3), round(rmsleTest,3), y_train.shape[0], y_test.shape[0])

In [None]:
y_test, forecast
# --------------seed = 0 -------------------------------------------------------------------------------------------------------------
# --------------familyId = 3 (Beverages), store id = 1 -----------(train 1307 test 261)---------- 7.219 7.652 == all 0 ----------------
# --------------5 splits, 10 epochs per split, 32 batch size---------------------------------------------------------------------------
# errors:   0.856 0.740     without all the time features: 'dcoilwtico', 'holidayType','description', 'transferred', 'linear_time', 'day_of_year','weekday', 'month',
# errors:   7.219 7.652     with all the time features
# errors:   2.885 4.493     without oil a lot worse :o     'holidayType','description', 'transferred', 'linear_time', 'day_of_year','weekday', 'month',
# errors:   1.533 2.071     without descrip.transf         'dcoilwtico', 'holidayType', 'linear_time', 'day_of_year','weekday', 'month',
# errors:   1.262 1.156     without time features          'dcoilwtico', 'holidayType','description', 'transferred',
# errors:   7.219 7.653     without day of year & month    'dcoilwtico', 'holidayType','description', 'transferred', 'linear_time','weekday',
# errors:   7.219 7.653     without month                  'dcoilwtico', 'holidayType','description', 'transferred', 'linear_time', 'day_of_year','weekday',
# errors:   1.247 1.128     without lin time & day of y    'dcoilwtico', 'holidayType','description', 'transferred','weekday', 'month',
# errors:   7.219 7.653     without day of year            'dcoilwtico', 'holidayType','description', 'transferred', 'linear_time','weekday', 'month',

# testing fourier features, lock those features (always use) 'dcoilwtico', 'holidayType','description', 'transferred', 'linear_time', 'day_of_year','weekday', 'month',
# errors:   0.624 0.694     frequency: 12, 24, 52, 104 only 0 phase diff:       day_of_year_f12_0, day_of_year_f104_0, day_of_year_f24_0, day_of_year_f52_0
# errors:   3.162 3.661     frequency: 12, 24, 52, 104  6x 60° phase diff:
# errors ->nans  frequency: 12, 24, 52, 104 only 0 & 180°:
# errors:   0.705 0.731     frequency: 12, 24, 52, 104 only 0, 120, 240° phase diff:
# errors:   6.92  7.155     frequency: 1,2,3,4,6,12, 24, 52, 104 only 0 phase diff:
# errors:   7.211 7.595     frequency: 1,2,6,12, 24, 52, 104 only 0 phase diff:
# errors:   4.929 5.107     frequency: 6,12, 24, 52, 104 only 0 phase diff:
# errors ->nans  frequency: 4, 12, 24, 52, 104 only 0:
# errors:   2.155 1.544     frequency: 3,12, 24, 52, 104 only 0 phase diff:
# errors ->nans  frequency: 1, 12, 24, 52, 104 only 0:
# errors ->nans  frequency: 2, 12, 24, 52, 104 only 0:

# --------------familyId = 3 (Beverages), store id = 18 ----------(train 1307 test 261)---------- 3.873 5.123 == all 0 ----------------
# --------------5 splits, 10 epochs per split, 32 batch size---------------------------------------------------------------------------
#errors:   3.873 5.123      'dcoilwtico', 'holidayType','description', 'transferred', 'linear_time', 'day_of_year','weekday', 'month', frequency: 12, 24, 52, 104 only 0 phase diff
# -> zscaling doesn't help, nothing really works for this one
# 

In [None]:
(forecast != 0).any()

# predict in one big dataframe

In [None]:
grouped = (data1.loc[data1.dataT == 'train'].pivot(index='date', columns=['store_nbr', 'family']))#.transpose#()