In [1]:
# imports
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.stattools import acf, pacf
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from sktime.performance_metrics.forecasting import mean_absolute_error, mean_absolute_percentage_error
from pmdarima.preprocessing import FourierFeaturizer
from pmdarima import auto_arima

# Use plotly as the pandas plotting backend so that `Series.plot()` returns a plotly Figure.
pd.options.plotting.backend = 'plotly'

# read data
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
df_store = pd.read_pickle('data/df_daily.pkl')

# Company-wide daily sales, scaled by 1e6.
# The `sales` column is selected BEFORE aggregating so that only that column is summed; summing the
# whole frame first is wasteful and can fail on non-numeric columns in pandas >= 2.0.
ts_company = df_store.groupby('date')['sales'].sum() / 1e6

# Declare the index as daily. Assumes `date` is a gap-free datetime index -- TODO confirm.
ts_company.index.freq = 'D'

---
# Visual inspection
## Time plot
The dataset spans August 2017 through January 2021, inclusive. Although more recent data is available, we exclude it because, from February 2021 onwards, Vietnam went through consecutive COVID lockdowns during which non-essential activities (including fashion retailing) were largely inactive.

Insights from time plot:
* TREND: There is no obvious upward or downward trend in the dataset.
* SEASONALITY:
    * WEEKLY seasonality is very clear. Most peaks occur on Sundays. 
    * ANNUAL seasonality also exists:
        * The highest peaks take place at the end of November due to the major promotions of Black Friday every year.
        * The troughs occur in January/February due to the Lunar New Year holidays. Total sales remain low for the following two months.

In [2]:
# Company-level time plot with the Lunar New Year and COVID periods shaded.
fig = ts_company.plot()

# One row per shaded band, in drawing order: (x0, x1, fill colour, opacity).
shaded_periods = [
    # Lunar New Year holiday (darker) followed by the 2 following months (lighter)
    ('2018-02-15', '2018-02-21', "black", 0.3),
    ('2018-02-21', '2018-04-21', "black", 0.1),
    ('2019-02-04', '2019-02-10', "black", 0.3),
    ('2019-02-10', '2019-04-10', "black", 0.1),
    ('2020-01-24', '2020-01-30', "black", 0.3),
    ('2020-01-30', '2020-03-31', "black", 0.1),
    # COVID
    ('2020-04-01', '2020-04-16', "red", 0.3),  # whole country
    ('2020-07-28', '2020-09-04', "red", 0.3),  # Da Nang
]
for band_x0, band_x1, band_fill, band_opacity in shaded_periods:
    fig.add_vrect(x0=band_x0, x1=band_x1, line_width=0, fillcolor=band_fill, opacity=band_opacity)

# Titles, size and hover behaviour are set in a single layout update.
fig.update_layout(
    title_text='Company sales',
    legend_title_text='Legend:',
    width=1000,
    hovermode="x unified",
)
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Sum of sales')

fig.show()

In [3]:
# differencing
# `.diff(k).dropna()` removes the first k observations, so the differenced series are
# shorter than `ts_company` (by 1 and 7 rows respectively).
ts_company_dif = ts_company.diff(periods=1).dropna()
ts_company_sdif = ts_company.diff(periods=7).dropna()

# differencing plots
fig = make_subplots(rows=3, cols=1, subplot_titles=['Original values','First differencing','First seasonal differencing'])

# Each series is plotted against its OWN index. Plotly pairs x and y by position, so using
# `ts_company.index` for the shorter differenced series would shift every point 1 (or 7) days
# too early on the date axis.
for row, series in enumerate([ts_company, ts_company_dif, ts_company_sdif], start=1):
    fig.add_trace(
        go.Scatter(x=series.index, y=series),
        row=row, col=1
    )

fig.update_layout(width=1000, height=700, showlegend=False)
fig.show()

## Decompose


In [4]:
# Two-pass STL: fit the weekly pattern (period=7) first, then decompose the
# weekly-adjusted series again with an annual period (period=365).
stl_7 = STL(ts_company, period=7).fit()
stl_365 = STL(ts_company - stl_7.seasonal, period=365).fit()

# One subplot per component, top to bottom: (title, series, extra Scatter options).
stl_panels = [
    ('Trend', stl_365.trend, {}),
    ('Weekly seasonality', stl_7.seasonal, {}),
    ('Annual seasonality', stl_365.seasonal, {}),
    ('Residuals', stl_365.resid, {'mode': 'markers'}),  # residuals drawn as points only
]

fig = make_subplots(rows=4, cols=1, subplot_titles=[panel[0] for panel in stl_panels])
for panel_row, (panel_title, component, scatter_opts) in enumerate(stl_panels, start=1):
    fig.add_trace(
        go.Scatter(x=component.index, y=component, **scatter_opts),
        row=panel_row, col=1
    )
fig.update_layout(width=1000, height=800, title_text="STL decomposition", showlegend=False)
fig.show()

### Seasonality


In [5]:
# One coloured line segment per ISO week of 2019.
ts = ts_company.loc['2019']

# Group by ISO year AND ISO week, not the week number alone: 2019-12-30 and 2019-12-31 belong to
# ISO week 1 of 2020, so keying on `week` only would merge them with the first days of January 2019
# and draw a stray line across the whole chart.
iso = ts.index.isocalendar()
week_label = (iso.year.astype(str) + '-W' + iso.week.astype(str).str.zfill(2)).rename('week')

fig = px.line(ts,
              color=week_label,
              width=1000, height=400,
              color_discrete_sequence=px.colors.sequential.Emrld
              )

fig.update_layout(title_text='Weekly seasonality',
                  legend_title_text='Week')
fig.update_xaxes(title='Date')
fig.update_yaxes(title='Sum of sales')

fig.show()

In [6]:
# Monthly sales totals for the three complete calendar years, one line per year.
# NOTE(review): the 'M' (month-end) alias is deprecated in pandas >= 2.2 in favour of 'ME';
# kept here for compatibility with older pandas versions.
df = pd.DataFrame(ts_company.loc['2018':'2020'].resample('M').sum())
df['year'] = df.index.year
fig = px.line(df,
              x=df.index.month,
              y=df['sales'],
              color=df.year,
              width=1000, height=400,
              # 'lightblue' is the valid CSS colour name ('light blue' with a space is not)
              color_discrete_sequence=['lightblue','green','orange'],
              )

fig.update_layout(title_text='Annual seasonality',
                  legend_title_text='Year')
fig.update_xaxes(title='Month')
fig.update_yaxes(title='Sum of sales')

fig.show()

## Autocorrelation

In [7]:
# Autocorrelation for lags 0..400 (presumably chosen to cover the 365-day cycle).
# With `alpha` set, `acf` returns (estimates, confidence intervals).
ACF_array = acf(ts_company, alpha=0.05, nlags=400)
# Subtract the estimate from each bound so the confidence band is centred on zero.
ACF_lower = ACF_array[1][:,0] - ACF_array[0]
ACF_upper = ACF_array[1][:,1] - ACF_array[0]

acf_values = ACF_array[0]
acf_lags = np.arange(len(acf_values))
invisible_line = 'rgba(255,255,255,0)'

fig = go.Figure()
# Stems: one vertical segment from 0 to the ACF value at every lag.
for lag in range(len(acf_values)):
    fig.add_scatter(x=(lag, lag), y=(0, acf_values[lag]), mode='lines', line_color='#3f3f3f')
# Markers on top of the stems.
fig.add_scatter(x=acf_lags, y=acf_values, mode='markers', marker_color='#1f77b4',
                marker_size=5)
# Confidence band: invisible upper edge, then the lower edge filled up to it ('tonexty').
fig.add_scatter(x=acf_lags, y=ACF_upper, mode='lines', line_color=invisible_line)
fig.add_scatter(x=acf_lags, y=ACF_lower, mode='lines', fillcolor='rgba(32, 146, 230,0.3)',
                fill='tonexty', line_color=invisible_line)
fig.update_traces(showlegend=False)
fig.update_yaxes(zerolinecolor='#000000')
fig.update_layout(title='ACF', width=1000)
fig.show()

In [8]:
# Partial autocorrelation for lags 0..400.
# With `alpha` set, `pacf` returns (estimates, confidence intervals).
PACF_array = pacf(ts_company, alpha=0.05, nlags=400)
# Subtract the estimate from each bound so the confidence band is centred on zero.
PACF_lower = PACF_array[1][:,0] - PACF_array[0]
PACF_upper = PACF_array[1][:,1] - PACF_array[0]

pacf_values = PACF_array[0]
n_pacf_lags = len(pacf_values)
pacf_lags = np.arange(n_pacf_lags)
hidden_edge = 'rgba(255,255,255,0)'

fig = go.Figure()
# Stems: one vertical segment from 0 to the PACF value at every lag.
for lag in range(n_pacf_lags):
    fig.add_scatter(x=(lag, lag), y=(0, pacf_values[lag]), mode='lines', line_color='#3f3f3f')
# Markers on top of the stems.
fig.add_scatter(x=pacf_lags, y=pacf_values, mode='markers', marker_color='#1f77b4',
                marker_size=5)
# Confidence band: invisible upper edge, then the lower edge filled up to it ('tonexty').
fig.add_scatter(x=pacf_lags, y=PACF_upper, mode='lines', line_color=hidden_edge)
fig.add_scatter(x=pacf_lags, y=PACF_lower, mode='lines', fillcolor='rgba(32, 146, 230,0.3)',
                fill='tonexty', line_color=hidden_edge)
fig.update_traces(showlegend=False)
fig.update_yaxes(zerolinecolor='#000000')
fig.update_layout(title='PACF', width=1000)
fig.show()

---
# Descriptive statistics

In [29]:
# Daily sales per store level in wide format (one column per `store_level`), scaled by 1e6.
# `pivot` arguments are passed by keyword: they are keyword-only since pandas 2.0, where the
# positional form raises TypeError.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
df_lev = pd.read_pickle('data/df_lev.pkl').pivot(index='date', columns='store_level', values='sales')/1e6
df_lev['A']


date
2017-08-07      1.527650
2017-08-08      0.000000
2017-08-09     34.942320
2017-08-10     20.151840
2017-08-11     24.305380
                 ...    
2021-01-27    348.163092
2021-01-28    365.668841
2021-01-29    455.651168
2021-01-30    668.920990
2021-01-31    968.229512
Name: A, Length: 1274, dtype: float64

In [37]:
# violin plots: the company-wide series next to each store level (A, B, C)

fig = go.Figure()
fig.add_trace(go.Violin(y=ts_company, name='Company'))
for store_level in ('A', 'B', 'C'):
    fig.add_trace(go.Violin(y=df_lev[store_level], name=f'Level {store_level}'))

fig.update_layout(yaxis_zeroline=False, height=600, title='Data distribution')
fig.show()

In [62]:
# Daily sales per store in wide format (one column per `store_id`), scaled by 1e6.
# `pivot` arguments are passed by keyword: they are keyword-only since pandas 2.0, where the
# positional form raises TypeError.
df_store_wide = df_store.loc[:,:'sales'].pivot(index='date', columns='store_id', values='sales')/1e6
df_store_wide


store_id,307222,307244,307248,320264,328165,349920,349924,349952,349958,349962,...,461349,464495,471477,476061,480733,528854,536898,536902,566790,566792
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-07,1.527650,,,,,,,,,,...,,,,,,,,,,
2017-08-08,0.000000,,,,,,,,,,...,,,,,,,,,,
2017-08-09,26.627420,,,,8.314900,,,,,,...,,,,,,,,,,
2017-08-10,10.755720,2.358650,,,7.037470,,,,,,...,,,,,,,,,,
2017-08-11,10.216950,0.389000,,,13.699430,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-01-27,31.418629,10.709426,18.767801,14.902600,56.495801,46.394401,19.679010,17.582000,29.217695,14.1437,...,6.9210,3.539000,12.158002,5.9524,18.9484,29.114201,11.240401,10.6010,26.766327,1.1254
2021-01-28,27.555219,18.901200,21.299800,15.535000,34.744840,33.087529,19.292955,21.157200,31.648258,13.3234,...,4.1772,9.193451,9.812000,7.2276,35.7360,57.254400,8.649200,11.3546,34.114200,1.6668
2021-01-29,35.746445,17.948036,21.749000,13.488100,64.115900,36.954200,28.693600,25.557095,34.560800,18.5998,...,9.0310,2.045000,22.367600,0.3890,40.0356,47.493600,13.645000,12.2770,44.485787,3.3424
2021-01-30,52.543000,24.353200,31.193200,33.112934,125.637810,64.823900,23.483000,35.432800,50.211980,18.7886,...,11.5150,10.820000,26.827169,2.5674,39.5706,64.744200,18.044000,10.9660,62.963800,1.6222
