In [14]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
os.getcwd()

#using For graph libraries
import chart_studio.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go

In [3]:
df = pd.read_csv('sales_train_v2.csv')
shop_59 = df[df['shop_id'].notnull() & (df['shop_id'] == 59)]
shop_59

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
40084,10.01.2013,0,59,22151,399.0,1.0
77502,04.01.2013,0,59,5603,699.0,1.0
77503,19.01.2013,0,59,5587,199.0,2.0
77504,31.01.2013,0,59,5613,5571.0,1.0
77505,10.01.2013,0,59,5623,699.0,1.0
77506,14.01.2013,0,59,5623,699.0,1.0
77507,10.01.2013,0,59,5629,2390.0,1.0
77508,04.01.2013,0,59,5643,2390.0,1.0
77509,17.01.2013,0,59,5643,2390.0,2.0


In [8]:
# Grouping by sales data by dates of ony shop 59;

df_sales = shop_59.groupby('date_block_num').item_cnt_day.sum().reset_index()
df_sales.head(100)

Unnamed: 0,date_block_num,item_cnt_day
0,0,2017.0
1,1,1897.0
2,2,2028.0
3,3,1388.0
4,4,1374.0
5,5,1707.0
6,6,1747.0
7,7,2048.0
8,8,2008.0
9,9,1751.0


In [17]:
#plot monthly sales
plot_data = [
    go.Scatter(
        x=df_sales['date_block_num'],
        y=df_sales['item_cnt_day'],
    )
]
plot_layout = go.Layout(
        title='Montly Sales for shop 59',
        xaxis_title="months",
        yaxis_title="sum of items sold that month",
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)

In [20]:
#create a new dataframe to model the difference
df_diff = df_sales.copy()

#add previous sales to the next row
df_diff['prev_sales'] = df_diff['item_cnt_day'].shift(1)

#drop the null values and calculate the difference
df_diff = df_diff.dropna()
df_diff['diff'] = (df_diff['item_cnt_day'] - df_diff['prev_sales'])
df_diff.head(100)

Unnamed: 0,date_block_num,item_cnt_day,prev_sales,diff
1,1,1897.0,2017.0,-120.0
2,2,2028.0,1897.0,131.0
3,3,1388.0,2028.0,-640.0
4,4,1374.0,1388.0,-14.0
5,5,1707.0,1374.0,333.0
6,6,1747.0,1707.0,40.0
7,7,2048.0,1747.0,301.0
8,8,2008.0,2048.0,-40.0
9,9,1751.0,2008.0,-257.0
10,10,1953.0,1751.0,202.0


In [38]:
#create dataframe for transformation from time series to supervised
df_supervised = df_diff.drop(['prev_sales'],axis=1)
#adding lags
for inc in range(1,15):
    field_name = 'lag_' + str(inc)
    df_supervised[field_name] = df_supervised['diff'].shift(inc)
    
df_supervised = df_supervised.reset_index(drop=True)
df_supervised

Unnamed: 0,date_block_num,item_cnt_day,diff,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,lag_10,lag_11,lag_12,lag_13,lag_14
0,1,1897.0,-120.0,,,,,,,,,,,,,,
1,2,2028.0,131.0,-120.0,,,,,,,,,,,,,
2,3,1388.0,-640.0,131.0,-120.0,,,,,,,,,,,,
3,4,1374.0,-14.0,-640.0,131.0,-120.0,,,,,,,,,,,
4,5,1707.0,333.0,-14.0,-640.0,131.0,-120.0,,,,,,,,,,
5,6,1747.0,40.0,333.0,-14.0,-640.0,131.0,-120.0,,,,,,,,,
6,7,2048.0,301.0,40.0,333.0,-14.0,-640.0,131.0,-120.0,,,,,,,,
7,8,2008.0,-40.0,301.0,40.0,333.0,-14.0,-640.0,131.0,-120.0,,,,,,,
8,9,1751.0,-257.0,-40.0,301.0,40.0,333.0,-14.0,-640.0,131.0,-120.0,,,,,,
9,10,1953.0,202.0,-257.0,-40.0,301.0,40.0,333.0,-14.0,-640.0,131.0,-120.0,,,,,


In [39]:
#drop null values
df_supervised = df_supervised.dropna().reset_index(drop=True)
df_supervised

Unnamed: 0,date_block_num,item_cnt_day,diff,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,lag_8,lag_9,lag_10,lag_11,lag_12,lag_13,lag_14
0,15,1228.0,-239.0,47.0,-71.0,-899.0,437.0,202.0,-257.0,-40.0,301.0,40.0,333.0,-14.0,-640.0,131.0,-120.0
1,16,1184.0,-44.0,-239.0,47.0,-71.0,-899.0,437.0,202.0,-257.0,-40.0,301.0,40.0,333.0,-14.0,-640.0,131.0
2,17,1157.0,-27.0,-44.0,-239.0,47.0,-71.0,-899.0,437.0,202.0,-257.0,-40.0,301.0,40.0,333.0,-14.0,-640.0
3,18,1082.0,-75.0,-27.0,-44.0,-239.0,47.0,-71.0,-899.0,437.0,202.0,-257.0,-40.0,301.0,40.0,333.0,-14.0
4,19,1244.0,162.0,-75.0,-27.0,-44.0,-239.0,47.0,-71.0,-899.0,437.0,202.0,-257.0,-40.0,301.0,40.0,333.0
5,20,1161.0,-83.0,162.0,-75.0,-27.0,-44.0,-239.0,47.0,-71.0,-899.0,437.0,202.0,-257.0,-40.0,301.0,40.0
6,21,1211.0,50.0,-83.0,162.0,-75.0,-27.0,-44.0,-239.0,47.0,-71.0,-899.0,437.0,202.0,-257.0,-40.0,301.0
7,22,1610.0,399.0,50.0,-83.0,162.0,-75.0,-27.0,-44.0,-239.0,47.0,-71.0,-899.0,437.0,202.0,-257.0,-40.0
8,23,1939.0,329.0,399.0,50.0,-83.0,162.0,-75.0,-27.0,-44.0,-239.0,47.0,-71.0,-899.0,437.0,202.0,-257.0
9,24,1293.0,-646.0,329.0,399.0,50.0,-83.0,162.0,-75.0,-27.0,-44.0,-239.0,47.0,-71.0,-899.0,437.0,202.0


In [40]:
# Import statsmodels.formula.api
import statsmodels.formula.api as smf
# Define the regression formula
model = smf.ols(formula='diff ~ lag_1 + lag_2 + lag_3 + lag_4 + lag_5 + lag_6 + lag_7 + lag_8 + lag_9 + lag_10 + lag_11 + lag_12 + lag_13 + lag_14  ', data=df_supervised)
# Fit the regression
model_fit = model.fit()
# Extract the adjusted r-squared
regression_adj_rsq = model_fit.rsquared_adj
print(regression_adj_rsq)

0.8155473462738888


In [44]:
#import MinMaxScaler and create a new dataframe for LSTM model
from sklearn.preprocessing import MinMaxScaler

df_model = df_supervised.drop(['date_block_num','item_cnt_day'],axis=1)

#split train and test set
train_set, test_set = df_model[0:-6].values, df_model[-6:].values

In [45]:
#apply Min Max Scaler
scaler = MinMaxScaler(feature_range=(-1, 1))
scaler = scaler.fit(train_set)
# reshape training set
train_set = train_set.reshape(train_set.shape[0], train_set.shape[1])
train_set_scaled = scaler.transform(train_set)
# reshape test set
test_set = test_set.reshape(test_set.shape[0], test_set.shape[1])
test_set_scaled = scaler.transform(test_set)

In [46]:
X_train, y_train = train_set_scaled[:, 1:], train_set_scaled[:, 0:1]
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test, y_test = test_set_scaled[:, 1:], test_set_scaled[:, 0:1]
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

In [56]:
#import Keras
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.optimizers import Adam 
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.layers import LSTM
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from tensorflow.python.keras import datasets

model = Sequential()
model.add(LSTM(4, batch_input_shape=(1, X_train.shape[1], X_train.shape[2]), stateful=True))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train, y_train, nb_epoch=100, batch_size=1, verbose=1, shuffle=False)


The `nb_epoch` argument in `fit` has been renamed `epochs`.



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.callbacks.History at 0x1dd343d2780>

In [62]:
y_pred = model.predict(X_test,batch_size=1)
#for multistep prediction, you need to replace X_test values with the predictions coming from t-1

In [64]:
#reshape y_pred
y_pred = y_pred.reshape(y_pred.shape[0], 1, y_pred.shape[1])
#rebuild test set for inverse transform
pred_test_set = []
for index in range(0,len(y_pred)):
    print(np.concatenate([y_pred[index],X_test[index]],axis=1))
    pred_test_set.append(np.concatenate([y_pred[index],X_test[index]],axis=1))
#reshape pred_test_set
pred_test_set = np.array(pred_test_set)
pred_test_set = pred_test_set.reshape(pred_test_set.shape[0], pred_test_set.shape[2])
#inverse transform
pred_test_set_inverted = scaler.inverse_transform(pred_test_set)

[[ 0.29183453 -0.04114833  0.22296651  0.25115562 -0.62125749  0.83832335
   0.94311377  0.42065868  0.22155689  0.58832335  0.23353293  0.30538922
   0.27994012 -0.01197605  0.41616766]]
[[ 0.13027361 -0.0277512  -0.04114833  0.37442219  0.21556886 -0.62125749
   0.83832335  0.94311377  0.42065868  0.22155689  0.58832335  0.23353293
   0.30538922  0.27994012 -0.01197605]]
[[ 0.12693852  0.23062201 -0.0277512   0.16178737  0.33532934  0.21556886
  -0.62125749  0.83832335  0.94311377  0.42065868  0.22155689  0.58832335
   0.23353293  0.30538922  0.27994012]]
[[ 0.29525387  0.38755981  0.23062201  0.17257319  0.12874251  0.33532934
   0.21556886 -0.62125749  0.83832335  0.94311377  0.42065868  0.22155689
   0.58832335  0.23353293  0.30538922]]
[[-0.03978531  0.66124402  0.38755981  0.38058552  0.13922156  0.12874251
   0.33532934  0.21556886 -0.62125749  0.83832335  0.94311377  0.42065868
   0.22155689  0.58832335  0.23353293]]
[[ 0.35349643 -0.33779904  0.66124402  0.50693374  0.3413173

In [70]:
#create dataframe that shows the predicted sales
result_list = []
sales_dates = list(df_sales[-7:].date_block_num)
act_sales = list(df_sales[-7:].item_cnt_day)
for index in range(0,len(pred_test_set_inverted)):
    result_dict = {}
    result_dict['pred_value'] = int(pred_test_set_inverted[index][0] + act_sales[index])
    result_dict['date_block_num'] = sales_dates[index+1]
    result_list.append(result_dict)
df_result = pd.DataFrame(result_list)
#for multistep prediction, replace act_sales with the predicted sales
df_result

Unnamed: 0,date_block_num,pred_value
0,28,1082
1,29,860
2,30,855
3,31,1022
4,32,1069
5,33,975


In [75]:
#merge with actual sales dataframe
df_sales_pred = pd.merge(df_sales,df_result,on='date_block_num',how='left')
#plot actual and predicted
plot_data = [
    go.Scatter(
        x=df_sales_pred['date_block_num'],
        y=df_sales_pred['item_cnt_day'],
        name='actual'
    ),
        go.Scatter(
        x=df_sales_pred['date_block_num'],
        y=df_sales_pred['pred_value'],
        name='predicted'
    )
    
]
plot_layout = go.Layout(
        title='Sales Prediction'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)