<a href="https://colab.research.google.com/github/moumahan1990/retail_timeseries_forecasting/blob/master/Prophet_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import warnings
warnings.filterwarnings("ignore")

# loading packages
# basic + dates 
import numpy as np
import pandas as pd
from pandas import datetime

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns # advanced vizs
%matplotlib inline

# statistics
from statsmodels.distributions.empirical_distribution import ECDF

# time series analysis
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# prophet by Facebook
from fbprophet import Prophet

In [0]:
# importing data
train = pd.read_csv("https://raw.githubusercontent.com/moumahan1990/retail_timeseries_forecasting/master/datasets/train.csv", 
                    parse_dates = True, low_memory = False, index_col = 'Date')
# additional store data
store = pd.read_csv("https://raw.githubusercontent.com/moumahan1990/retail_timeseries_forecasting/master/datasets/store.csv",low_memory = False)

In [0]:
train.head()

Unnamed: 0_level_0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-07-31,1,5,5263,555,1,1,0,1
2015-07-31,2,5,6064,625,1,1,0,1
2015-07-31,3,5,8314,821,1,1,0,1
2015-07-31,4,5,13995,1498,1,1,0,1
2015-07-31,5,5,4822,559,1,1,0,1


In [7]:
# Adding new features based on date
train['Year'] = train.index.year
train['Month'] = train.index.month
train['Day'] = train.index.day
train['WeekOfYear'] = train.index.weekofyear

# adding new variable
train['SalePerCustomer'] = train['Sales']/train['Customers']
train['SalePerCustomer'].describe()

count    844340.000000
mean          9.493619
std           2.197494
min           0.000000
25%           7.895563
50%           9.250000
75%          10.899729
max          64.957854
Name: SalePerCustomer, dtype: float64

In [0]:
train.head()

Unnamed: 0_level_0,Store,DayOfWeek,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Year,Month,Day,WeekOfYear,SalePerCustomer
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2015-07-31,1,5,5263,555,1,1,0,1,2015,7,31,31,9.482883
2015-07-31,2,5,6064,625,1,1,0,1,2015,7,31,31,9.7024
2015-07-31,3,5,8314,821,1,1,0,1,2015,7,31,31,10.126675
2015-07-31,4,5,13995,1498,1,1,0,1,2015,7,31,31,9.342457
2015-07-31,5,5,4822,559,1,1,0,1,2015,7,31,31,8.626118


In [66]:
# remove closed stores and those with no sales
df = train[(train["Open"] != 0) & (train['Sales'] != 0)]
df['Date'] = df.index
# sales for the store number 1 (StoreType C)
sales = df[df.Store == 1]
sales = sales[['Sales','Date']]
sales.head()

Unnamed: 0_level_0,Sales,Date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-07-31,5263,2015-07-31
2015-07-30,5020,2015-07-30
2015-07-29,4782,2015-07-29
2015-07-28,5011,2015-07-28
2015-07-27,6102,2015-07-27


In [67]:
# from the prophet documentation every variables should have specific names
sales = sales.rename(columns = {'Date': 'ds','Sales': 'y'})
sales.head()

Unnamed: 0_level_0,y,ds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-07-31,5263,2015-07-31
2015-07-30,5020,2015-07-30
2015-07-29,4782,2015-07-29
2015-07-28,5011,2015-07-28
2015-07-27,6102,2015-07-27


In [0]:
import plotly.express as px

In [68]:
fig = px.line(sales, x='ds', y='y')
fig.show()

In [69]:
# create holiday dataframe
state_dates = df[(df.StateHoliday == 'a') | (df.StateHoliday == 'b') & (df.StateHoliday == 'c')].loc[:, 'Date'].values
school_dates = df[df.SchoolHoliday == 1].loc[:, 'Date'].values

state = pd.DataFrame({'holiday': 'state_holiday',
                      'ds': pd.to_datetime(state_dates)})
school = pd.DataFrame({'holiday': 'school_holiday',
                      'ds': pd.to_datetime(school_dates)})

holidays = pd.concat((state, school))      
holidays.head()

Unnamed: 0,holiday,ds
0,state_holiday,2015-06-04
1,state_holiday,2015-06-04
2,state_holiday,2015-06-04
3,state_holiday,2015-06-04
4,state_holiday,2015-06-04


In [0]:
#Create train test split
sales_train= sales.head(730)
len(sales_train)
sales_train.head()
sales_test= sales.tail(50)

In [76]:
# Create model
my_model = Prophet(interval_width = 0.95, daily_seasonality=True, seasonality_prior_scale= 0.1,
                   holidays = holidays)
my_model.fit(sales_train)

<fbprophet.forecaster.Prophet at 0x7fc04fe5e208>

In [77]:
# predictions
# dataframe that extends into future 6 weeks 
# future_dates = my_model.make_future_dataframe(periods = 6*7)
future_dates = sales_test[['ds']]
forecast = my_model.predict(future_dates)
# preditions for last week
forecast[['yhat']]

Unnamed: 0,yhat
0,5454.57714
1,5264.065549
2,5402.083262
3,5667.096365
4,5610.406665
5,5058.489392
6,4865.48946
7,4748.776548
8,4963.074128
9,4917.30329


In [30]:
print("First week to forecast.")
future_dates.tail(7)

First week to forecast.


Unnamed: 0_level_0,ds
Date,Unnamed: 1_level_1
2013-01-09,2013-01-09
2013-01-08,2013-01-08
2013-01-07,2013-01-07
2013-01-05,2013-01-05
2013-01-04,2013-01-04
2013-01-03,2013-01-03
2013-01-02,2013-01-02


In [31]:
sales_test.tail(7)

Unnamed: 0_level_0,y,ds
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-09,5471,2013-01-09
2013-01-08,5580,2013-01-08
2013-01-07,7176,2013-01-07
2013-01-05,4997,2013-01-05
2013-01-04,4486,2013-01-04
2013-01-03,4327,2013-01-03
2013-01-02,5530,2013-01-02


In [0]:
result = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]

In [34]:
result.head(7)

Unnamed: 0,ds,yhat,yhat_lower,yhat_upper
0,2013-01-02,5552.348454,3999.312975,7112.836301
1,2013-01-03,5361.560844,3661.598389,6973.482543
2,2013-01-04,5499.424113,3758.162922,7219.093019
3,2013-01-05,5757.717955,4254.726936,7419.845323
4,2013-01-07,5703.51752,4048.908065,7316.068682
5,2013-01-08,5150.768667,3496.32315,6746.479637
6,2013-01-09,4956.218307,3217.502698,6554.017263


In [44]:
(sales_test.y.values - result.yhat.values)

array([ -876.34845425,  -803.56084392, -1705.42411285, -1719.71795532,
        -466.51751976,  -150.7686675 ,   304.78169318,   821.48115925,
         333.98033618,  1398.44966495,  1049.12823472,   185.78255952,
        -254.32575585,  -186.77942579,  -692.02159239,  -680.58357137,
         112.27163434,   881.19887199,   811.49555768,  1501.35598931,
        1141.94475308,  1749.1854607 ,   550.7373799 ,   694.89330282,
         -89.89089949,  -497.82404229, -1610.86632721, -1246.06345225,
        -155.05156401,   343.19425551,   117.96438983,   602.88512432,
         527.65844032,   252.31077221,  -392.68952644,  -934.41002051,
        -852.79344558,  -791.99546012, -1126.00745504,  -270.42167247,
        -500.59149188,   -77.41815284,    77.77265609,   732.3457944 ,
         593.81293324,  2207.05488459,   -99.16239202,  -134.41282611,
        -166.06093313,  1097.46176919])

In [78]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
mean_absolute_error(sales_test.y, result.yhat)

691.4171035541901

In [79]:
from math import sqrt
rmse = sqrt(mean_squared_error(sales_test.y, result.yhat))
print(rmse)
print(mean_absolute_error(sales_test.y, result.yhat))

867.7962892900767
691.4171035541901


In [80]:
MAPE= np.mean(np.abs((sales_test.y.values - result.yhat.values) / sales_test.y)) * 100
print(MAPE)

13.90538645588595
