In [145]:
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
from datetime import timedelta
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import TimeseriesGenerator
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import ConvLSTM2D

%matplotlib inline
import matplotlib.pyplot as plt

# Load DataFrames

In [146]:
oil = pd.read_csv("oil.csv")
holiday_events = pd.read_csv("holidays_events.csv")
stores = pd.read_csv("stores.csv")
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
transactions = pd.read_csv("transactions.csv")

In [147]:
test.dtypes

id              int64
date           object
store_nbr       int64
family         object
onpromotion     int64
dtype: object

In [148]:
full_df = pd.concat([train, test], sort=True)

In [149]:
full_df = full_df.merge(stores, how = 'left', on = 'store_nbr')
full_df = full_df.merge(oil, how = 'left', on = 'date')
full_df = full_df.merge(transactions, how = 'left', on = ['date','store_nbr'])
full_df = full_df.merge(holiday_events, how = 'left', on = 'date')

In [150]:
full_df.date = pd.to_datetime(full_df.date)
full_df['dayofwk'] = full_df['date'].dt.day_name()
full_df['week'] = full_df['date'].dt.isocalendar().week
full_df['month'] = full_df['date'].dt.month
full_df['year'] = full_df['date'].dt.year
full_df = full_df.rename(columns={'type_x' : 'store_type','type_y':'holiday_type'})

# Dealing with Null Values

## Oil

In [162]:
full_df['dcoilwtico'] = full_df['dcoilwtico'].interpolate(limit_direction='both')   

In [163]:
full_df['dcoilwtico']

0          93.14
1          93.14
2          93.14
3          93.14
4          93.14
           ...  
3082855    47.26
3082856    47.26
3082857    47.26
3082858    47.26
3082859    47.26
Name: dcoilwtico, Length: 3082860, dtype: float64

## Transactions

In [152]:
full_df['transactions'] = full_df['transactions'].fillna(0)

## Holiday_events

In [153]:
full_df['holiday_type'] = full_df['holiday_type'].fillna('Work Day')
full_df['transferred'] = full_df['transferred'].fillna(False)

# Label Encoding Categorical Variables

In [9]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3082860 entries, 0 to 3082859
Data columns (total 21 columns):
 #   Column        Dtype         
---  ------        -----         
 0   date          datetime64[ns]
 1   family        object        
 2   id            int64         
 3   onpromotion   int64         
 4   sales         float64       
 5   store_nbr     int64         
 6   city          object        
 7   state         object        
 8   store_type    object        
 9   cluster       int64         
 10  dcoilwtico    float64       
 11  transactions  float64       
 12  holiday_type  object        
 13  locale        object        
 14  locale_name   object        
 15  description   object        
 16  transferred   bool          
 17  dayofwk       object        
 18  week          UInt32        
 19  month         int64         
 20  year          int64         
dtypes: UInt32(1), bool(1), datetime64[ns](1), float64(3), int64(6), object(9)
memory usage: 488.0+ MB


In [10]:
full_df

Unnamed: 0,date,family,id,onpromotion,sales,store_nbr,city,state,store_type,cluster,...,transactions,holiday_type,locale,locale_name,description,transferred,dayofwk,week,month,year
0,2013-01-01,AUTOMOTIVE,0,0,0.0,1,Quito,Pichincha,D,13,...,0.0,Holiday,National,Ecuador,Primer dia del ano,False,Tuesday,1,1,2013
1,2013-01-01,BABY CARE,1,0,0.0,1,Quito,Pichincha,D,13,...,0.0,Holiday,National,Ecuador,Primer dia del ano,False,Tuesday,1,1,2013
2,2013-01-01,BEAUTY,2,0,0.0,1,Quito,Pichincha,D,13,...,0.0,Holiday,National,Ecuador,Primer dia del ano,False,Tuesday,1,1,2013
3,2013-01-01,BEVERAGES,3,0,0.0,1,Quito,Pichincha,D,13,...,0.0,Holiday,National,Ecuador,Primer dia del ano,False,Tuesday,1,1,2013
4,2013-01-01,BOOKS,4,0,0.0,1,Quito,Pichincha,D,13,...,0.0,Holiday,National,Ecuador,Primer dia del ano,False,Tuesday,1,1,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3082855,2017-08-31,POULTRY,3029395,1,,9,Quito,Pichincha,B,6,...,0.0,Work Day,,,,False,Thursday,35,8,2017
3082856,2017-08-31,PREPARED FOODS,3029396,0,,9,Quito,Pichincha,B,6,...,0.0,Work Day,,,,False,Thursday,35,8,2017
3082857,2017-08-31,PRODUCE,3029397,1,,9,Quito,Pichincha,B,6,...,0.0,Work Day,,,,False,Thursday,35,8,2017
3082858,2017-08-31,SCHOOL AND OFFICE SUPPLIES,3029398,9,,9,Quito,Pichincha,B,6,...,0.0,Work Day,,,,False,Thursday,35,8,2017


In [154]:
# label encoding all object type variables
lbl_encoder = LabelEncoder()
for col in full_df.columns:
    if full_df[col].dtype == 'object':
        full_df[col] = lbl_encoder.fit_transform(full_df[col])

In [155]:
# one hot encode variables that are not large 

onehot_cols = ['store_type', 'holiday_type', 'dayofwk']

for col in onehot_cols:
    k = full_df[col].nunique()
    new_col = ['{}_{}'.format(col,k) for k in range(1,k+1)]
    tmp_df = pd.get_dummies(full_df[col]).set_axis(new_col, axis = 1, inplace = False)
    full_df = pd.concat([full_df,tmp_df], axis = 1)
    full_df = full_df.drop(columns = col)   

In [156]:
# Drop the id from the dataframe

full_df = full_df.drop(['id'], axis = 1)

In [161]:
train.groupby(['date']).agg({'sales':'mean', 'onpromotion':'mean'})

Unnamed: 0_level_0,sales,onpromotion
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-01,1.409438,0.000000
2013-01-02,278.390807,0.000000
2013-01-03,202.840197,0.000000
2013-01-04,198.911154,0.000000
2013-01-05,267.873244,0.000000
...,...,...
2017-08-11,463.733851,7.956790
2017-08-12,444.798280,4.664422
2017-08-13,485.768618,5.209315
2017-08-14,427.004717,4.513468


In [67]:
## group by date and store number, aggregate sales

sale_store_df = full_df[['date', 'sales', 'store_nbr']]
sale_store_df = sale_store_df.loc[sale_store_df['date'] < pd.to_datetime('2017/08/16', format = '%Y/%m/%d')]
sale_store_df1 = sale_store_df.groupby(['date', 'store_nbr']).sum()
sale_store_df1 = sale_store_df1.reset_index()

Unnamed: 0,date,sales,store_nbr
0,2013-01-01,0.000,1
1,2013-01-01,0.000,1
2,2013-01-01,0.000,1
3,2013-01-01,0.000,1
4,2013-01-01,0.000,1
...,...,...,...
3054343,2017-08-15,438.133,9
3054344,2017-08-15,154.553,9
3054345,2017-08-15,2419.729,9
3054346,2017-08-15,121.000,9


In [94]:
## Make store_nbr array of all the store numbers
store_nbr = sale_store_df1['store_nbr'].unique()

54

In [138]:
## Initialize empty time series dataframe
ts_data = []

In [136]:
sale_store_df1.loc[sale_store_df1['store_nbr']==1][['store_nbr','sales']].to_numpy().tolist()

[[1.0, 0.0],
 [1.0, 7417.147999999999],
 [1.0, 5873.244001],
 [1.0, 5919.879001],
 [1.0, 6318.78501],
 [1.0, 2199.0869999999995],
 [1.0, 6150.924],
 [1.0, 5597.181],
 [1.0, 6808.57899],
 [1.0, 4757.714999999999],
 [1.0, 5494.015979999999],
 [1.0, 5467.782980000001],
 [1.0, 2693.878004],
 [1.0, 5651.68501],
 [1.0, 5204.174],
 [1.0, 6733.596002],
 [1.0, 4942.473999999999],
 [1.0, 5923.124001],
 [1.0, 5895.146000000001],
 [1.0, 2371.133995],
 [1.0, 5227.602019999999],
 [1.0, 5298.986],
 [1.0, 6259.346],
 [1.0, 4861.524],
 [1.0, 5577.701],
 [1.0, 5290.32701],
 [1.0, 2439.3779999999997],
 [1.0, 4976.808000000001],
 [1.0, 4681.511000499999],
 [1.0, 6118.887],
 [1.0, 4712.576999999999],
 [1.0, 5782.779000000001],
 [1.0, 5706.9190100000005],
 [1.0, 2267.9829999999997],
 [1.0, 5574.040999999999],
 [1.0, 5080.786000000001],
 [1.0, 6107.432001],
 [1.0, 4452.242995],
 [1.0, 5593.812],
 [1.0, 4435.572],
 [1.0, 1199.1350000000002],
 [1.0, 1344.749998],
 [1.0, 1852.0150099999998],
 [1.0, 6121.725],
 

In [139]:
## each column is the store number and each row is the sales at each timestep
## from i = 0 to i = 53 different stores
for i in range(len(store_nbr)):
    ts_data.append(sale_store_df1.loc[sale_store_df1['store_nbr']==store_nbr[i]][['store_nbr','sales']].to_numpy().tolist())

In [131]:
ts_gen1 = TimeseriesGenerator()

IndexError: list index out of range

# Training

In [13]:
features = full_df.drop('sales', axis = 1).to_numpy().tolist()
target = full_df.sales.tolist()

n_features=full_df.shape[1]-1
n_steps = 3

# MultiOutput Data

We have to forecast multioutput data

In [14]:
ts_gen = TimeseriesGenerator(features, target, length = 6, sampling_rate = 1, batch_size=1)

In [15]:
multi_target = pd.concat([full_df['sales'],full_df['sales'].shift(-1), full_df['sales'].shift(-2)], axis = 1).to_numpy().tolist()

In [16]:
ts_generator = TimeseriesGenerator(features, multi_target, length = 6, sampling_rate = 1, batch_size = 1, stride = 6)

In [17]:
ts_generator[0]

(array([[[Timestamp('2013-01-01 00:00:00'), 0, 0, 0, 1, 18, 12, 13,
          93.14, 0.0, 1, 4, 50, False, 1, 1, 2013, 0, 0, 0, 1, 0, 0, 0,
          0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
         [Timestamp('2013-01-01 00:00:00'), 1, 1, 0, 1, 18, 12, 13,
          93.14, 0.0, 1, 4, 50, False, 1, 1, 2013, 0, 0, 0, 1, 0, 0, 0,
          0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
         [Timestamp('2013-01-01 00:00:00'), 2, 2, 0, 1, 18, 12, 13,
          93.14, 0.0, 1, 4, 50, False, 1, 1, 2013, 0, 0, 0, 1, 0, 0, 0,
          0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
         [Timestamp('2013-01-01 00:00:00'), 3, 3, 0, 1, 18, 12, 13,
          93.14, 0.0, 1, 4, 50, False, 1, 1, 2013, 0, 0, 0, 1, 0, 0, 0,
          0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
         [Timestamp('2013-01-01 00:00:00'), 4, 4, 0, 1, 18, 12, 13,
          93.14, 0.0, 1, 4, 50, False, 1, 1, 2013, 0, 0, 0, 1, 0, 0, 0,
          0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
         [Timestamp('2013-01-01 00:00:00'), 5, 5, 0, 1, 18, 12, 13,
          93

# Split Train and Test Data

In [18]:
clean_test_df = full_df.loc[full_df['sales'].isnull()]
clean_train_df = full_df.loc[full_df['sales'] >= 0]

In [19]:
# 4 years of training for training split, 2 years for validation split

train_split = clean_train_df.loc[clean_train_df['date'] < pd.to_datetime('2017/01/01', format = '%Y/%m/%d')]
valid_split = clean_train_df.loc[clean_train_df['date'] >= pd.to_datetime('2017/01/01', format = '%Y/%m/%d')]

train_X, train_y = train_split.loc[:, train_split.columns != 'sales'], train_split[['sales', 'date']]
valid_X, valid_y = valid_split.loc[:, valid_split.columns != 'sales'], valid_split[['sales', 'date']]

In [20]:
train_X = train_X.set_index('date').to_numpy().tolist()
train_y = train_y.set_index('date').to_numpy().tolist()
valid_X = valid_X.set_index('date').to_numpy().tolist()
valid_y = valid_y.set_index('date').to_numpy().tolist()

In [21]:
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape = (train_X.shape[0], train_X.shape[1])))
model.add(Dense(1))
model.compile(optimizer='adam', loss = 'mse')

history = model.fit(train_X, train_y, epochs = 10, batch_size = 1, 
                    validation_data = (valid_X, valid_y), verbose = 2, shuffle = False)

AttributeError: 'list' object has no attribute 'shape'