# Unsupervised deep learning for time-series analysis.

# **Task**
To train a time-series forecasting model to predict the demand of various products across various stores using historical data. Run inference to forecast demand for all the products across all of the stores.

 General steps to train a time-series forecasting model in this project:


> Data Preparation.

> Feature engineering.

> Train-Test split.

> Model Selection.

> Model Training.
               
               *   Time-series prediction sequence modeling
               *   CNN-LSTM



> Model Evaluation.

> Inference and Forecasting.

> Visualization and Interpretation.




















 General steps to train a time-series forecasting model in this project:


> Data Preparation.

> Feature engineering.

> Train-Test split.

> Model Selection.

> Model Training.
               
               *   Time-series prediction sequence modeling
               *   CNN-LSTM



> Model Evaluation.

> Inference and Forecasting.

> Visualization and Interpretation.




















# Data preparation

In [None]:
#Load necessary Libraries.
import pathlib
import warnings

import numpy as np

# Seed NumPy's global RNG. This has to run after numpy is imported; calling
# it before the import raises NameError because `np` is not yet bound.
np.random.seed(0)
import pandas as pd
import matplotlib.pyplot as plt
from keras import optimizers
from keras.utils import plot_model
from keras.models import Sequential, Model
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Flatten
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

%matplotlib inline
warnings.filterwarnings("ignore")
init_notebook_mode(connected=True)

# Set seeds to make the experiment more reproducible.
#from tensorflow import set_random_seed
from numpy.random import seed
#set_random_seed(1)
seed(1)

In [None]:

# Generate the synthetic demand dataset (seeded so the simulation is reproducible)
np.random.seed(0)


def simulate_seasonal_term_frequency_domain(
        period,
        num_simulations,
        harmonics_sin=(1,),
        harmonics_cos=(1,),
        noise_std=5,
        ar=0.8,
        ma=0.5,
        trend=0.005,
        offset=10):
    """Generate a synthetic seasonal series for testing.

    The series follows an ARMA(1,1)-style recursion driven by Gaussian
    innovations, with a Fourier seasonal term (one sine and one cosine
    coefficient per harmonic) added at every step. A linear trend and a
    constant offset are added afterwards and the result is rounded up.

    Args:
        period (int): period of seasonality
        num_simulations (int): number of points to simulate
        harmonics_sin (sequence, optional): sin coefs, one per harmonic.
            Defaults to (1,).
        harmonics_cos (sequence, optional): cos coefs, one per harmonic.
            Needs at least as many entries as ``harmonics_sin``; entries
            beyond that length are not used. Defaults to (1,).
        noise_std (int, optional): noise level. Defaults to 5.
        ar (float, optional): ar(1) coeff. Defaults to 0.8.
        ma (float, optional): ma(1) coeff. Defaults to 0.5.
        trend (float, optional): linear trend coeff. Defaults to 0.005.
        offset (int, optional): constant base offset. Defaults to 10.

    Returns:
        np.array: Array of ``num_simulations`` values following the above
        process, each rounded up with ``np.ceil``.
    """
    # The defaults are tuples rather than lists so that no mutable object is
    # shared between calls; they are only read via len() and indexing.
    innovations = np.random.normal(0, noise_std, size=num_simulations)

    # The recursion is started at `offset`; `offset` is added a second time
    # together with the trend after the loop.
    series = offset + np.zeros(num_simulations)
    num_harmonics = len(harmonics_sin)
    for t in range(1, len(series)):
        seasonal = 0
        for k in range(1, num_harmonics + 1):
            # Phase of the k-th harmonic at time t, shifted by pi/2.
            angle = (2 * np.pi * k * t / period) + np.pi / 2
            seasonal += harmonics_sin[k - 1] * np.sin(angle)
            seasonal += harmonics_cos[k - 1] * np.cos(angle)
        series[t] = ar * series[t - 1] + ma * \
            innovations[t - 1] + seasonal + innovations[t]

    # yt = c + at + ARMA(1,1) + fourier
    series = offset + trend * np.arange(0, len(series)) + series
    return np.ceil(series)


def main():
    """Simulate daily sales for every store/item pair and write two CSV files.

    Store-level noise and baseline shifts and item-level baselines and trends
    are drawn at random, then one series is simulated per (store, item) pair.
    The simulated history is written to ``demand/train.csv``; a placeholder
    frame covering the prediction dates, with ``sales`` set to -1, is written
    to ``demand/test_full.csv``.
    """
    n_stores = 10
    n_items = 50
    burn_in = 180  # leading simulated points dropped from every series

    # Seasonality is fixed for all items.
    # Store level: noise level and baseline sales shift.
    # Item level: base sales and trend.
    # The four draws below are kept in this order so that a fixed seed yields
    # the same parameters.
    store_level_variance = [
        np.random.normal(4, 0.5) for _ in range(n_stores)]
    store_level_baseline_offset = [
        np.round(np.random.normal(0, 8)) for _ in range(n_stores)]
    item_level_baseline = [np.random.poisson(30) for _ in range(n_items)]
    item_level_trend = [
        np.random.normal(0.005, 0.003) for _ in range(n_items)]

    dates = pd.date_range(start="2013-01-01", end="2017-12-31")
    pred_dates = pd.date_range(start="2018-01-01", end="2018-03-31")
    n_pred = len(pred_dates)

    train_frames = []
    pred_frames = []
    store_params = zip(store_level_variance, store_level_baseline_offset)
    for store_idx, (noise, store_shift) in enumerate(store_params, start=1):
        item_params = zip(item_level_baseline, item_level_trend)
        for item_idx, (base, slope) in enumerate(item_params, start=1):
            simulated = simulate_seasonal_term_frequency_domain(
                period=375,
                num_simulations=len(dates) + burn_in,
                harmonics_sin=[1.2],
                harmonics_cos=[1.2],
                noise_std=noise,
                ar=0.7,
                ma=0.1,
                trend=slope,
                offset=base + store_shift,
            )
            kept = simulated[burn_in:]
            n_kept = len(kept)
            train_frames.append(pd.DataFrame({
                'date': dates,
                'store': [store_idx] * n_kept,
                'item': [item_idx] * n_kept,
                # Sales are whole, non-negative integers.
                'sales': np.ceil(kept).clip(min=0).astype(int),
            }))
            pred_frames.append(pd.DataFrame({
                'date': pred_dates,
                'store': [store_idx] * n_pred,
                'item': [item_idx] * n_pred,
                'sales': [-1] * n_pred,
            }))

    final_df = pd.concat(train_frames)
    pred_df = pd.concat(pred_frames)

    pathlib.Path("demand").mkdir(parents=True, exist_ok=True)
    final_df.to_csv("demand/train.csv", index=False)
    pred_df.to_csv("demand/test_full.csv", index=False)


# Run the data generation only when this code is executed directly
# (i.e. when __name__ is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

In [None]:
# View the train dataset
train = pd.read_csv("demand/train.csv")
train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,28
1,2013-01-02,1,1,30
2,2013-01-03,1,1,26
3,2013-01-04,1,1,19
4,2013-01-05,1,1,28


In [None]:
train.describe()

Unnamed: 0,store,item,sales
count,913000.0,913000.0,913000.0
mean,5.5,25.5,40.366946
std,2.872283,14.430878,11.154197
min,1.0,1.0,0.0
25%,3.0,13.0,33.0
50%,5.5,25.5,40.0
75%,8.0,38.0,48.0
max,10.0,50.0,91.0


In [None]:
#import matplotlib as plt
#train.plot(kind='hist')


In [None]:
# Test dataset
test = pd.read_csv("demand/test_full.csv")
test.head()


Unnamed: 0,date,store,item,sales
0,2018-01-01,1,1,-1
1,2018-01-02,1,1,-1
2,2018-01-03,1,1,-1
3,2018-01-04,1,1,-1
4,2018-01-05,1,1,-1


In [None]:
train.isnull().sum()

date     0
store    0
item     0
sales    0
dtype: int64

In [None]:
train.duplicated().sum()

0

In [None]:
#Check for datatypes
train.dtypes

date     object
store     int64
item      int64
sales     int64
dtype: object

# **Feature engineering.**

Data Normalization
- Min-Max scaling

In [None]:
# The time period of the train dataset
training1 = train[['date']].max()
training1

date    2017-12-31
dtype: object

In [None]:
# The time period of the train dataset
training2 = train[['date']].min()
training2

date    2013-01-01
dtype: object

Performed Data Normalization

The Min date from train set: 2013-01-01

The Max date from train set: 2017-12-31

In [None]:
# The time period of the test dataset
test1 = test[['date']].max()
test1

date    2018-03-31
dtype: object

In [None]:
test2 = test[['date']].min()
test2

date    2018-01-01
dtype: object

In [None]:
# Forecast horizon: the number of days between the last day of the training
# set and the last day of the test set.
from datetime import date, datetime, timedelta

testmax = '2018/03/31'
trainmax = '2017/12/31'

d1 = datetime.strptime(testmax, "%Y/%m/%d")
d2 = datetime.strptime(trainmax, "%Y/%m/%d")

# Keep the horizon as a plain int number of days. It is used later as a shift
# size and in '%d' string formatting, neither of which accepts a timedelta.
lag_size = (d1 - d2).days
print(lag_size)

90


The max date from the test set: 2018-03-31

The max date from the train set: 2017-12-31

Forecast lag size is 90

VISUALIZATION

In [None]:
# Total sales per day: overall, per store and per item.
daily_sales = train[['date', 'sales']].groupby('date', as_index=False).sum()
store_daily_sales = (
    train[['store', 'date', 'sales']]
    .groupby(['store', 'date'], as_index=False)
    .sum()
)
item_daily_sales = (
    train[['item', 'date', 'sales']]
    .groupby(['item', 'date'], as_index=False)
    .sum()
)


In [None]:
# Line chart of the total sales over all stores and items for each day.
daily_sales_sc = go.Scatter(
    x=daily_sales['date'],
    y=daily_sales['sales'],
)
layout = go.Layout(
    title='Daily sales',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Sales'},
)
fig = go.Figure(data=[daily_sales_sc], layout=layout)
iplot(fig)

In [None]:
# One trace per item, in the order the items first appear in the aggregate.
item_daily_sales_sc = [
    go.Scatter(
        x=item_daily_sales.loc[item_daily_sales['item'] == item_id, 'date'],
        y=item_daily_sales.loc[item_daily_sales['item'] == item_id, 'sales'],
        name=('Item %s' % item_id),
    )
    for item_id in item_daily_sales['item'].unique()
]

layout = go.Layout(
    title='Item daily sales',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Sales'},
)
fig = go.Figure(data=item_daily_sales_sc, layout=layout)
iplot(fig)

In [None]:
# One trace per store, in the order the stores first appear in the aggregate.
store_daily_sales_sc = [
    go.Scatter(
        x=store_daily_sales.loc[store_daily_sales['store'] == store_id, 'date'],
        y=store_daily_sales.loc[store_daily_sales['store'] == store_id, 'sales'],
        name=('Store %s' % store_id),
    )
    for store_id in store_daily_sales['store'].unique()
]

layout = go.Layout(
    title='Store daily sales',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Sales'},
)
fig = go.Figure(data=store_daily_sales_sc, layout=layout)
iplot(fig)

Tried to visualize the raw time series data to identify patterns, trends, seasonality, and potential anomalies.

In [None]:
train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,28
1,2013-01-02,1,1,30
2,2013-01-03,1,1,26
3,2013-01-04,1,1,19
4,2013-01-05,1,1,28


# **TRAIN- TEST SPLIT**

In [None]:
# To reduce training time, sub-sample the training set and keep only the
# rows dated 2017-01-01 or later (the last year of data).
train = train.loc[train['date'].ge('2017-01-01')]


In [None]:
# Mean sales for every (item, store, date) combination, returned as a flat
# frame with the columns item, store, date, sales.
train_gp = (
    train.sort_values('date')
    .groupby(['item', 'store', 'date'], as_index=False)['sales']
    .mean()
)
train_gp.head()

Unnamed: 0,item,store,date,sales
0,1,1,2013-01-01,28.0
1,1,1,2013-01-02,30.0
2,1,1,2013-01-03,26.0
3,1,1,2013-01-04,19.0
4,1,1,2013-01-05,28.0


In [None]:
#Transform the data into a time series problem
def series_to_supervised(data, window=1, lag=1, dropnan=True):
    """Reframe a DataFrame as a table of time-shifted copies of itself.

    Args:
        data: DataFrame whose rows are consecutive time steps.
        window: number of past steps to include as input columns.
        lag: number of steps ahead used for the target columns.
        dropnan: when true, drop every row that contains a NaN (such rows
            arise at both ends of the table because of the shifting).

    Returns:
        DataFrame whose columns are, in order, '<col>(t-window)' ...
        '<col>(t-1)', then '<col>(t)', then '<col>(t+lag)' for every column
        of ``data``.
    """
    steps_back = range(window, 0, -1)

    # Input sequence: t-window, ..., t-1
    frames = [data.shift(step) for step in steps_back]
    labels = ['%s(t-%d)' % (col, step)
              for step in steps_back
              for col in data.columns]

    # Current timestep (t=0)
    frames.append(data)
    labels.extend('%s(t)' % (col) for col in data.columns)

    # Target timestep (t=lag)
    frames.append(data.shift(-lag))
    labels.extend('%s(t+%d)' % (col, lag) for col in data.columns)

    # Put it all together
    table = pd.concat(frames, axis=1)
    table.columns = labels

    # Drop rows with NaN values
    return table.dropna() if dropnan else table

In [None]:
# Use the current timestep and the previous 29 to forecast `lag` days ahead.
from datetime import datetime
from datetime import date
window = 29
# `lag_size` may hold a timedelta (the difference of two datetimes); the
# shift and the '%d' column naming in series_to_supervised need a plain
# number of days, so take `.days` when it is available.
lag = getattr(lag_size, 'days', lag_size)
# DataFrame has no `.float` attribute; `.astype(float)` is the cast to float.
series = series_to_supervised(train.drop(['date'], axis=1).astype(float), window=window, lag=lag)


AttributeError: ignored

In [None]:
# Label
# '%d' cannot format a timedelta, which is what `lag_size` holds when it is
# the raw difference of two datetimes, so format the integer day count.
labels_col = 'sales(t+%d)' % getattr(lag_size, 'days', lag_size)
labels = series[labels_col]
series = series.drop(labels_col, axis=1)

# Hold out 40% of the rows for validation; the fixed random_state keeps the
# split reproducible.
X_train, X_valid, Y_train, Y_valid = train_test_split(series, labels.values, test_size=0.4, random_state=0)
print('Train set shape', X_train.shape)
print('Validation set shape', X_valid.shape)
X_train.head()

TypeError: ignored