In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras import optimizers
from keras.utils import plot_model
from keras.models import Sequential, Model
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Flatten
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from IPython.display import Image

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
train = pd.read_csv('D:\\dataset\\Deeplearninforecast\\train.csv', parse_dates=['date'])

In [3]:
train['date'].min(), train['date'].max()

(Timestamp('2013-01-01 00:00:00'), Timestamp('2017-12-31 00:00:00'))

In [4]:
train_gp = train.sort_values('date').groupby(['item', 'store', 'date'], as_index=False)
train_gp = train_gp.agg({'sales':['mean']})
train_gp.columns = ['item', 'store', 'date', 'sales']
train_gp.head()

Unnamed: 0,item,store,date,sales
0,1,1,2013-01-01,13
1,1,1,2013-01-02,11
2,1,1,2013-01-03,14
3,1,1,2013-01-04,13
4,1,1,2013-01-05,10


Transform the data into a time series problem

In [5]:
def series_to_supervised(data, window=1, lag=1, dropnan=True):
    
    cols, names = list(), list()
    
    # Input sequence (t-n, ... t-1)
    for i in range(window, 0, -1):
        cols.append(data.shift(i))
        names += [('%s(t-%d)' % (col, i)) for col in data.columns]
    # Current timestep (t=0)
    cols.append(data)
    names += [('%s(t)' % (col)) for col in data.columns]
    
    
    # Target timestep (t=lag)
    cols.append(data.shift(-lag))
    names += [('%s(t+%d)' % (col, lag)) for col in data.columns]
    
    
    # Put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    
    # Drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

We will use the current timestep and the last 29 to forecast 10 days ahead

In [6]:
from PIL import Image
import requests
from io import BytesIO

https://drive.google.com/file/d/1w-delBp9zaLkxyv4yb1IgryexI10JNXn/view?usp=sharing

In [7]:
window = 29
lag = 10
series = series_to_supervised(train_gp.drop('date', axis=1), window=window, lag=lag)
series.head()

Unnamed: 0,item(t-29),store(t-29),sales(t-29),item(t-28),store(t-28),sales(t-28),item(t-27),store(t-27),sales(t-27),item(t-26),...,sales(t-2),item(t-1),store(t-1),sales(t-1),item(t),store(t),sales(t),item(t+10),store(t+10),sales(t+10)
29,1.0,1.0,13.0,1.0,1.0,11.0,1.0,1.0,14.0,1.0,...,11.0,1.0,1.0,6.0,1,1,9,1.0,1.0,14.0
30,1.0,1.0,11.0,1.0,1.0,14.0,1.0,1.0,13.0,1.0,...,6.0,1.0,1.0,9.0,1,1,13,1.0,1.0,11.0
31,1.0,1.0,14.0,1.0,1.0,13.0,1.0,1.0,10.0,1.0,...,9.0,1.0,1.0,13.0,1,1,11,1.0,1.0,16.0
32,1.0,1.0,13.0,1.0,1.0,10.0,1.0,1.0,12.0,1.0,...,13.0,1.0,1.0,11.0,1,1,21,1.0,1.0,11.0
33,1.0,1.0,10.0,1.0,1.0,12.0,1.0,1.0,10.0,1.0,...,11.0,1.0,1.0,21.0,1,1,15,1.0,1.0,14.0


Drop rows with different item or store values than the shifted columns


In [8]:
last_item = 'item(t-%d)' % window
last_store = 'store(t-%d)' % window
series = series[(series['store(t)'] == series[last_store])]
series = series[(series['item(t)'] == series[last_item])]

https://drive.google.com/file/d/1OdA46aEiumk6qqcCMejmPmNnDDuyMwfZ/view?usp=sharing

In [9]:
series

Unnamed: 0,item(t-29),store(t-29),sales(t-29),item(t-28),store(t-28),sales(t-28),item(t-27),store(t-27),sales(t-27),item(t-26),...,sales(t-2),item(t-1),store(t-1),sales(t-1),item(t),store(t),sales(t),item(t+10),store(t+10),sales(t+10)
29,1.0,1.0,13.0,1.0,1.0,11.0,1.0,1.0,14.0,1.0,...,11.0,1.0,1.0,6.0,1,1,9,1.0,1.0,14.0
30,1.0,1.0,11.0,1.0,1.0,14.0,1.0,1.0,13.0,1.0,...,6.0,1.0,1.0,9.0,1,1,13,1.0,1.0,11.0
31,1.0,1.0,14.0,1.0,1.0,13.0,1.0,1.0,10.0,1.0,...,9.0,1.0,1.0,13.0,1,1,11,1.0,1.0,16.0
32,1.0,1.0,13.0,1.0,1.0,10.0,1.0,1.0,12.0,1.0,...,13.0,1.0,1.0,11.0,1,1,21,1.0,1.0,11.0
33,1.0,1.0,10.0,1.0,1.0,12.0,1.0,1.0,10.0,1.0,...,11.0,1.0,1.0,21.0,1,1,15,1.0,1.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
912985,50.0,10.0,99.0,50.0,10.0,88.0,50.0,10.0,62.0,50.0,...,72.0,50.0,10.0,52.0,50,10,86,50.0,10.0,63.0
912986,50.0,10.0,88.0,50.0,10.0,62.0,50.0,10.0,81.0,50.0,...,52.0,50.0,10.0,86.0,50,10,53,50.0,10.0,59.0
912987,50.0,10.0,62.0,50.0,10.0,81.0,50.0,10.0,81.0,50.0,...,86.0,50.0,10.0,53.0,50,10,54,50.0,10.0,74.0
912988,50.0,10.0,81.0,50.0,10.0,81.0,50.0,10.0,87.0,50.0,...,53.0,50.0,10.0,54.0,50,10,51,50.0,10.0,62.0


Remove unwanted columns(item & store)

In [10]:
columns_to_drop = [('%s(t+%d)' % (col, lag)) for col in ['item', 'store']]
for i in range(window, 0, -1):
    columns_to_drop += [('%s(t-%d)' % (col, i)) for col in ['item', 'store']]
series.drop(columns_to_drop, axis=1, inplace=True)
series.drop(['item(t)', 'store(t)'], axis=1, inplace=True)

Data split

In [11]:
# Label
labels_col = 'sales(t+%d)' % lag
labels = series[labels_col]
series = series.drop(labels_col, axis=1)

X_train, X_test, Y_train, Y_test = train_test_split(series, labels.values, test_size=0.3, random_state=0)
print('Train set shape', X_train.shape)
print('Test set shape', X_test.shape)
X_train.head()

Train set shape (628943, 30)
Test set shape (269547, 30)


Unnamed: 0,sales(t-29),sales(t-28),sales(t-27),sales(t-26),sales(t-25),sales(t-24),sales(t-23),sales(t-22),sales(t-21),sales(t-20),...,sales(t-9),sales(t-8),sales(t-7),sales(t-6),sales(t-5),sales(t-4),sales(t-3),sales(t-2),sales(t-1),sales(t)
710568,52.0,62.0,47.0,48.0,60.0,32.0,57.0,49.0,48.0,56.0,...,24.0,46.0,46.0,47.0,51.0,50.0,34.0,40.0,38.0,50
222494,93.0,102.0,112.0,114.0,137.0,107.0,86.0,112.0,99.0,119.0,...,89.0,98.0,120.0,109.0,116.0,151.0,134.0,89.0,96.0,116
677599,131.0,73.0,71.0,97.0,106.0,112.0,123.0,123.0,73.0,77.0,...,114.0,104.0,86.0,88.0,91.0,92.0,109.0,113.0,132.0,81
624357,124.0,144.0,132.0,90.0,126.0,108.0,153.0,116.0,117.0,164.0,...,115.0,126.0,126.0,121.0,98.0,96.0,93.0,114.0,126.0,129
681390,78.0,77.0,56.0,69.0,62.0,76.0,62.0,76.0,87.0,40.0,...,71.0,82.0,88.0,66.0,65.0,66.0,58.0,78.0,66.0,76
