In [1]:

# imports
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from pmdarima import auto_arima
from pmdarima.preprocessing import FourierFeaturizer
from sktime.performance_metrics.forecasting import mean_absolute_error, mean_absolute_percentage_error
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.stattools import acf, pacf

# Make pandas `.plot()` calls render through plotly instead of the default backend.
pd.set_option('plotting.backend', 'plotly')

import plotly.io as pio

# Defaults for plotly's orca static-image exporter.
# NOTE(review): timeout is presumably in seconds and width in pixels — confirm
# against the plotly.io.orca documentation.
pio.orca.config.default_width = 800
pio.orca.config.default_scale = 4
pio.orca.config.timeout = 3600

  pd.Int64Index,


In [2]:
# read data
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
df_store = pd.read_pickle('data/df_daily.pkl')

# Rescale sales by 1e6 (presumably to millions of the currency unit — TODO confirm).
df_store['sales'] = df_store['sales'] / 1e6

# Company-level series: total sales across all stores for each date.
# Select 'sales' BEFORE aggregating so that only this column is summed; calling
# `.sum()` on the whole group would also try to sum the string/date columns,
# which is wasteful and fails or warns on recent pandas versions.
ts_company = df_store.groupby('date')['sales'].sum()

# Declare the index as daily. Assumes a DatetimeIndex with no missing days;
# pandas rejects the assignment if the dates do not conform to this frequency.
ts_company.index.freq = 'D'

---
# Visual inspection
## Time plot
The dataset spans August 2017 through January 2021, inclusive. Although more recent data is available, we exclude it because, from February 2021 onward, Vietnam went through consecutive COVID-19 lockdowns during which non-essential activities (including fashion retailing) were largely inactive.

Insights from time plot:
* TREND: There is no obvious upward or downward trend in the dataset.
* SEASONALITY:
    * WEEKLY seasonality is very clear. Most peaks occur on Sundays. 
    * ANNUAL seasonality also exists:
        * The highest peaks occur at the end of November each year, driven by the major Black Friday promotions.
        * The troughs occur in January/February due to the Lunar New Year holidays. Total sales remain low for the following two months.

In [3]:
# Preview the first rows of the store-level frame to check its columns and values.
df_store.head()

Unnamed: 0,date,store_id,sales,promo_count,store_level,store_group,store_format,store_segment,opening_date,status,store_area,number_of_staff,province,channel
0,2017-08-07,307222,1.52765,1,A,Trực thuộc,Phố,Mainstream,2011-01-11,Active,112.0,6.0,Hồ Chí Minh,Retail
1,2017-08-08,307222,0.0,0,A,Trực thuộc,Phố,Mainstream,2011-01-11,Active,112.0,6.0,Hồ Chí Minh,Retail
2,2017-08-09,307222,26.62742,14,A,Trực thuộc,Phố,Mainstream,2011-01-11,Active,112.0,6.0,Hồ Chí Minh,Retail
3,2017-08-10,307222,10.75572,9,A,Trực thuộc,Phố,Mainstream,2011-01-11,Active,112.0,6.0,Hồ Chí Minh,Retail
4,2017-08-11,307222,10.21695,10,A,Trực thuộc,Phố,Mainstream,2011-01-11,Active,112.0,6.0,Hồ Chí Minh,Retail


In [32]:
# Per-store sales frame, indexed by date.
# Columns are selected by name rather than by position (`iloc[:, :3]`) so the
# cell does not silently pick the wrong columns if `df_store` is reordered.
df = (
    df_store[['date', 'store_id', 'sales']]
    .set_index('date')
    # Store ids become strings; later cells filter with "store_id=='307222'".
    .assign(store_id=lambda frame: frame['store_id'].astype('str'))
)

# One box per store, showing the distribution of its sales values.
fig = px.box(
    df,
    x="store_id",
    y="sales",
)

fig.update_layout(title_text='Sales of all stores',
                  width=800)

fig.show()

# Create the output directory first so the export also works on a fresh checkout.
Path("results/plots").mkdir(parents=True, exist_ok=True)
fig.write_image(file="results/plots/fig1.png")

In [36]:
# Time plot of the sales series for store 307222.
df.loc[df['store_id'] == '307222', 'sales'].plot()

In [37]:
# Histogram of the sales values for store 307222.
px.histogram(df.loc[df['store_id'] == '307222', 'sales'])

In [38]:
# Histogram of the row-to-row change (first difference) in sales for store 307222.
px.histogram(df.loc[df['store_id'] == '307222', 'sales'].diff())