In [None]:
import tensorflow as tf
tf.random.set_seed(42)
import numpy as np
np.random.seed(42)
import time

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime
import plotly.offline as pyo
from plotly import subplots
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.tsa.api as smt

from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Model
from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import LSTM, Dense,Input,concatenate
from tensorflow.keras.layers import *
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
import lightgbm as lgb

from datetime import datetime, timedelta


from baseFunctions import *
from data_helpers import processData6, featureEngineering, getSequencesFast, removeOutliers

In [None]:
# Load the dataset through the project helper processData6 (imported from data_helpers).
# NOTE(review): propDicts / flippedPropDicts look like value<->code lookup tables
# for encoded columns — confirm against data_helpers.processData6.
data, propDicts, flippedPropDicts = processData6()

In [None]:
# Run the project's featureEngineering helper on `data`; it returns the updated
# frame together with `timeFeatures`.
# NOTE: `data` is overwritten with a function of itself, so re-running this cell
# without first re-running the loading cell applies the step a second time.
data, timeFeatures = featureEngineering(data)

# train 1 lgbm per family

In [None]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Feature columns handed to getSequencesFast.
# 'sales', 'dataT' and 'transactions' are excluded from this list.
trainF = [
    'store_nbr', 'family',
    'onpromotion',
    'city', 'state', 'type', 'cluster', 'dcoilwtico', 'holidayType',
    'description', 'transferred',
    'store_closed',
    'linear_time', 'day_of_year', 'day_of_year_f12_0', 'day_of_year_f104_0',
    'day_of_year_f24_0', 'day_of_year_f52_0', 'weekday', 'month',
]

# Same list plus the target column (not used in this cell, kept for other cells).
trainF2 = trainF + ['sales']

n_predictedValues = 16  # number of target columns -> one booster is trained per column
look_back = 42          # passed to getSequencesFast as the look-back length
zScoreNorm = True       # targets are z-scored per store by getSequencesFast

# ---------------------------------------------------------------------------
# Train / test masks
# ---------------------------------------------------------------------------
date_string = "2017-05-01"
date_object = datetime.strptime(date_string, '%Y-%m-%d')
# The test window starts look_back + n_predictedValues + 1 days before the split
# date ("2017-03-03" for the values above), so it overlaps the end of the train
# window: 42 days + 15 days in between (15 because we only want to iterate one
# value more from the test set).
days_ago = date_object - timedelta(days=(look_back + n_predictedValues - 1 + 2))
days_ago_string = days_ago.strftime('%Y-%m-%d')

maskTrain = data.date < date_string
maskTest = data.date > days_ago_string

# ---------------------------------------------------------------------------
# LightGBM settings (identical for every family, so defined once)
# ---------------------------------------------------------------------------
params = {
    'objective': 'regression',
    # 'rmsle' is not a metric LightGBM documents; the models are fitted on
    # z-scored targets, so RMSE is reported on the validation set instead.
    # The metric only affects evaluation output, not the fitted trees.
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
num_round = 10

# familyId -> {'trainE': train RMSLE, 'testE': test RMSLE, 'gbms': list of boosters}
log = {}

train = data.loc[data.dataT == 'train']

for familyId in data.family.unique():
    print(familyId)

    familyDf = train.loc[train.family == familyId]

    # Per-store normalisation statistics for this family.
    stdDict = {}
    meanDict = {}

    # Collect the per-store arrays in lists and stack them once afterwards;
    # concatenating inside the loop re-copies the growing arrays every iteration.
    X_train_parts, X_test_parts = [], []
    y_train_parts, y_test_parts = [], []
    std_train_parts, std_test_parts = [], []
    mean_train_parts, mean_test_parts = [], []

    for storeId in data.store_nbr.unique():
        storeDf = familyDf.loc[(familyDf.store_nbr == storeId)]

        # Train sequences are normalised with their own statistics; the test
        # sequences re-use the train mean/std of the same store.
        X_train0, y_train0, mean, std = getSequencesFast(
            storeDf.loc[maskTrain], trainF, look_back, n_predictedValues,
            zScoreNorm=zScoreNorm)
        X_test0, y_test0 = getSequencesFast(
            storeDf.loc[maskTest], trainF, look_back, n_predictedValues,
            zScoreNorm=False, applyZScoreNorm=True, meanZ=mean, stdZ=std)

        stdDict[storeId] = std
        meanDict[storeId] = mean

        X_train_parts.append(X_train0)
        X_test_parts.append(X_test0)
        y_train_parts.append(y_train0)
        y_test_parts.append(y_test0)
        # Broadcast the store statistics to the target shape so every sample
        # can be de-normalised element-wise later on.
        std_train_parts.append(np.ones(y_train0.shape) * std)
        std_test_parts.append(np.ones(y_test0.shape) * std)
        mean_train_parts.append(np.ones(y_train0.shape) * mean)
        mean_test_parts.append(np.ones(y_test0.shape) * mean)

    X_train = np.concatenate(X_train_parts, axis=0)
    X_test = np.concatenate(X_test_parts, axis=0)
    y_train = np.concatenate(y_train_parts, axis=0)
    y_test = np.concatenate(y_test_parts, axis=0)
    std_train = np.concatenate(std_train_parts, axis=0)
    std_test = np.concatenate(std_test_parts, axis=0)
    mean_train = np.concatenate(mean_train_parts, axis=0)
    mean_test = np.concatenate(mean_test_parts, axis=0)

    # One booster per target column.
    gbms = [
        lgb.train(
            params,
            lgb.Dataset(X_train, label=y_train[:, i]),
            num_round,
            valid_sets=[lgb.Dataset(X_test, label=y_test[:, i])],
        )
        for i in range(y_train.shape[1])
    ]

    # --- Train error -------------------------------------------------------
    forecast = np.column_stack([gbm.predict(X_train, num_iteration=gbm.best_iteration) for gbm in gbms])
    if zScoreNorm:
        forecast = forecast * std_train + mean_train
        y_train = y_train * std_train + mean_train
    # Clip only after returning to the original scale: negative z-scores are
    # legitimate, whereas mean_squared_log_error rejects negative values.
    if (forecast < 0).any():
        print('negative values!!!')
        forecast = np.clip(forecast, 0, None)
    rmsleTrain = np.sqrt(mean_squared_log_error(y_train, forecast))

    # --- Test error --------------------------------------------------------
    forecast = np.column_stack([gbm.predict(X_test, num_iteration=gbm.best_iteration) for gbm in gbms])
    if zScoreNorm:
        forecast = forecast * std_test + mean_test
        y_test = y_test * std_test + mean_test
    if (forecast < 0).any():
        print('negative values!!!')
        forecast = np.clip(forecast, 0, None)
    rmsleTest = np.sqrt(mean_squared_log_error(y_test, forecast))

    print('familyId:', familyId, 'errors:  ', round(rmsleTrain,3), round(rmsleTest,3), y_train.shape[0], y_test.shape[0])

    log[familyId] = {'trainE': round(rmsleTrain, 3), 'testE': round(rmsleTest, 3), 'gbms': gbms}

In [None]:
import pickle

# Persist the per-family results (errors and trained boosters held in `log`)
# so they can be reloaded later without retraining.
log_pickle_path = 'log_moedle_per_family_S42_16_normalized.pkl'
with open(log_pickle_path, 'wb') as pickle_out:
    pickle.dump(log, pickle_out)

In [None]:
# Imported here as well so this cell runs on a fresh kernel even when the
# cell that writes the pickle file has been skipped.
import pickle

# To load the dictionary back from the file.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load pickle files you created yourself or otherwise trust.
with open('log_moedle_per_family_S42_16_normalized.pkl', 'rb') as f:
    log = pickle.load(f)

# train 1 lgbm for everything

In [None]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Feature columns handed to getSequencesFast.
# 'sales', 'dataT' and 'transactions' are excluded from this list.
trainF = [
    'store_nbr', 'family',
    'onpromotion',
    'city', 'state', 'type', 'cluster', 'dcoilwtico', 'holidayType',
    'description', 'transferred',
    'store_closed',
    'linear_time', 'day_of_year', 'day_of_year_f12_0', 'day_of_year_f104_0',
    'day_of_year_f24_0', 'day_of_year_f52_0', 'weekday', 'month',
]

# Same list plus the target column (not used in this cell, kept for other cells).
trainF2 = trainF + ['sales']

n_predictedValues = 16  # number of target columns -> one booster is trained per column
look_back = 42          # passed to getSequencesFast as the look-back length
zScoreNorm = True       # targets are z-scored per (family, store) series by getSequencesFast

# ---------------------------------------------------------------------------
# Train / test masks
# ---------------------------------------------------------------------------
date_string = "2017-05-01"
date_object = datetime.strptime(date_string, '%Y-%m-%d')
# The test window starts look_back + n_predictedValues + 1 days before the split
# date ("2017-03-03" for the values above), so it overlaps the end of the train
# window: 42 days + 15 days in between (15 because we only want to iterate one
# value more from the test set).
days_ago = date_object - timedelta(days=(look_back + n_predictedValues - 1 + 2))
days_ago_string = days_ago.strftime('%Y-%m-%d')

maskTrain = data.date < date_string
maskTest = data.date > days_ago_string

# ---------------------------------------------------------------------------
# Build one pooled train / test set over every (family, store) series
# ---------------------------------------------------------------------------
# NOTE(review): these dicts are keyed by store id only, so the statistics of a
# store are overwritten by each subsequent family and only the last family's
# values survive. The per-sample std_*/mean_* arrays below are not affected.
stdDict = {}
meanDict = {}

# Collect the per-series arrays in lists and stack them once afterwards;
# concatenating inside the loop re-copies the growing arrays every iteration.
X_train_parts, X_test_parts = [], []
y_train_parts, y_test_parts = [], []
std_train_parts, std_test_parts = [], []
mean_train_parts, mean_test_parts = [], []

train = data.loc[data.dataT == 'train']

for familyId in data.family.unique():
    familyDf = train.loc[train.family == familyId]
    for storeId in data.store_nbr.unique():
        storeDf = familyDf.loc[(familyDf.store_nbr == storeId)]

        # Train sequences are normalised with their own statistics; the test
        # sequences re-use the train mean/std of the same series.
        X_train0, y_train0, mean, std = getSequencesFast(
            storeDf.loc[maskTrain], trainF, look_back, n_predictedValues,
            zScoreNorm=zScoreNorm)
        X_test0, y_test0 = getSequencesFast(
            storeDf.loc[maskTest], trainF, look_back, n_predictedValues,
            zScoreNorm=False, applyZScoreNorm=True, meanZ=mean, stdZ=std)

        stdDict[storeId] = std
        meanDict[storeId] = mean

        X_train_parts.append(X_train0)
        X_test_parts.append(X_test0)
        y_train_parts.append(y_train0)
        y_test_parts.append(y_test0)
        # Broadcast the series statistics to the target shape so every sample
        # can be de-normalised element-wise later on.
        std_train_parts.append(np.ones(y_train0.shape) * std)
        std_test_parts.append(np.ones(y_test0.shape) * std)
        mean_train_parts.append(np.ones(y_train0.shape) * mean)
        mean_test_parts.append(np.ones(y_test0.shape) * mean)

X_train = np.concatenate(X_train_parts, axis=0)
X_test = np.concatenate(X_test_parts, axis=0)
y_train = np.concatenate(y_train_parts, axis=0)
y_test = np.concatenate(y_test_parts, axis=0)
std_train = np.concatenate(std_train_parts, axis=0)
std_test = np.concatenate(std_test_parts, axis=0)
mean_train = np.concatenate(mean_train_parts, axis=0)
mean_test = np.concatenate(mean_test_parts, axis=0)

print('processed data: done')

# ---------------------------------------------------------------------------
# LightGBM settings
# ---------------------------------------------------------------------------
params = {
    'objective': 'regression',
    # 'rmsle' is not a metric LightGBM documents; the models are fitted on
    # z-scored targets, so RMSE is reported on the validation set instead.
    # The metric only affects evaluation output, not the fitted trees.
    'metric': 'rmse',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}
num_round = 10

# One booster per target column.
gbms = [
    lgb.train(
        params,
        lgb.Dataset(X_train, label=y_train[:, i]),
        num_round,
        valid_sets=[lgb.Dataset(X_test, label=y_test[:, i])],
    )
    for i in range(y_train.shape[1])
]

# --- Train error -----------------------------------------------------------
forecast = np.column_stack([gbm.predict(X_train, num_iteration=gbm.best_iteration) for gbm in gbms])
if zScoreNorm:
    forecast = forecast * std_train + mean_train
    # NOTE: from here on y_train holds de-normalised values.
    y_train = y_train * std_train + mean_train
# Clip only after returning to the original scale: negative z-scores are
# legitimate, whereas mean_squared_log_error rejects negative values.
if (forecast < 0).any():
    print('negative values!!!')
    forecast = np.clip(forecast, 0, None)
rmsleTrain = np.sqrt(mean_squared_log_error(y_train, forecast))

# --- Test error ------------------------------------------------------------
forecast = np.column_stack([gbm.predict(X_test, num_iteration=gbm.best_iteration) for gbm in gbms])
if zScoreNorm:
    forecast = forecast * std_test + mean_test
    # NOTE: from here on y_test holds de-normalised values.
    y_test = y_test * std_test + mean_test
if (forecast < 0).any():
    print('negative values!!!')
    forecast = np.clip(forecast, 0, None)
rmsleTest = np.sqrt(mean_squared_log_error(y_test, forecast))

print('errors:  ', round(rmsleTrain,3), round(rmsleTest,3), y_train.shape[0], y_test.shape[0])

In [None]:
# Store the pooled train/test arrays in a single .npz archive.
# The existing keys arr1..arr6 are unchanged; arr7/arr8 additionally hold the
# per-sample means, without which the std arrays alone cannot invert the z-score.
# NOTE: when zScoreNorm is True the training cell overwrites y_train / y_test
# with de-normalised values, so that is what gets written here if it has run.
np.savez(
    'train_test_Sequences_42_tp_16_normalized.npz',
    arr1=X_train, arr2=y_train, arr3=std_train,
    arr4=X_test, arr5=y_test, arr6=std_test,
    arr7=mean_train, arr8=mean_test,
)