In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
pip install pmdarima

## Step 1. Convert into Time Series Data
> Let's first grab the demand data from the pizza sales data in this directory

In [None]:
# Read the pizza sales workbook from the Kaggle input folder into a DataFrame.
_ = pd.read_excel(io="../input/pizza-sales/Data Model - Pizza Sales.xlsx")

In [None]:
# Keep only the two columns needed to build the demand series.
_ = _.loc[:, ['order_date', 'quantity']]

In [None]:
# Preview the first rows of the two-column frame.
_.head()

In [None]:
# Collapse the rows into one total quantity per order_date,
# keeping order_date as a regular column.
_ = _.groupby('order_date', as_index=False)['quantity'].sum()

In [None]:
# Parse order_date into real datetimes, then put the rows in chronological order.
# The preceding groupby sorted on the pre-conversion values, which is only
# chronological if they were already datetimes rather than strings. The ADF test,
# ACF/PACF plots and ARIMA fits below are given the quantity column alone, so they
# rely on the row order being the time order.
_['order_date'] = pd.to_datetime(_['order_date'])
_ = _.sort_values('order_date').reset_index(drop=True)

In [None]:
# Confirm order_date is now a datetime column and check the non-null counts.
_.info()

In [None]:
# Summary statistics for the aggregated frame.
_.describe()

>Our data is now good to go!

In [None]:
# Preview the prepared series: one row per order_date with its total quantity.
_.head()

## Step 2. Visualize Plot
> We are visual human beings, so let's appreciate what our time series graph looks like...

In [None]:
# Plot the total quantity per order date on an explicitly created, labelled figure.
# `fig` is the Figure itself (plt.plot would return a list of Line2D objects).
fig, ax = plt.subplots(figsize=(24, 8))
ax.plot(_['order_date'], _['quantity'])
ax.set(title='Total quantity sold per day', xlabel='Order date', ylabel='Quantity')
plt.show()

> Looks like a doctor's handwriting!

## Step 3. Dickey-Fuller Test
> Stats time! This test is one way of assessing whether a time series is **stationary or not**.

> A time series is called **stationary** if it has no trend and exhibits constant variance over time, i.e. its statistical properties do not depend on when you observe it.

> The test assumes the following null and alternative hypotheses:

        - H0: The time series is non-stationary.
        - HA: The time series is stationary.

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
# Augmented Dickey-Fuller test on the quantity series.
# The cell output is the raw result tuple; its second element is the p-value.
adfuller(_['quantity'])

> The 2nd value in the returned tuple is the most important one to look at, because it's the **p-value**.

> Since the p-value < .05, we reject the null hypothesis and conclude that the time series is **stationary.** That means less work for us.

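> However, if the time series had turned out not to be stationary, an additional step would be needed. This step is known as **differencing**: replacing each value with its change from the previous value, repeated until the series becomes stationary (the number of repetitions is the `d` in ARIMA).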

## Step 4. Autocorrelation and Partial Autocorrelation
> Before you can fit an **ARIMA** model, you need the following three parameters:

        - p: autoregressive: the number of autoregressive terms
        - d: difference: the number of nonseasonal differences needed to attain stationarity.
        - q: moving average: number of lagged forecast errors in the prediction equation

In [None]:
from pandas.plotting import autocorrelation_plot
# Draw pandas' autocorrelation plot for the quantity series and render it.
autocorrelation_plot(_.quantity)
plt.show()

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm

> To determine p value for our ARIMA model, we look at **Partial Autocorrelation Function (PACF) plot.**

In [None]:
# Draw the PACF on axes we create ourselves. plot_pacf opens its own new figure
# unless an `ax` is passed, so the sized axes must be handed in explicitly for the
# figsize to take effect (and to avoid leaving an empty figure behind).
fig, ax = plt.subplots(figsize=(12, 8))
sm.graphics.tsa.plot_pacf(_['quantity'], ax=ax)
plt.show()

> To determine q value for our ARIMA model, we look at **Autocorrelation Function (ACF) plot.**

In [None]:
# Draw the ACF on axes we create ourselves. plot_acf opens its own new figure
# unless an `ax` is passed, so the sized axes must be handed in explicitly for the
# figsize to take effect (and to avoid leaving an empty figure behind).
fig, ax = plt.subplots(figsize=(12, 8))
sm.graphics.tsa.plot_acf(_['quantity'], ax=ax)
plt.show()

How do we interpret this? 

> Highly recommend you read this: [How to Interpret ACF and PACF plots for Dummies like you :3](https://medium.com/@ooemma83/how-to-interpret-acf-and-pacf-plots-for-identifying-ar-ma-arma-or-arima-models-498717e815b6#:~:text=The%20basic%20guideline%20for%20interpreting,q%20for%20MA(q).)

> It's clearly shown in both ACF and PACF plots that they start to cut off at **1 (lag score).**

> It's generally advised to select a lower lag score.

> Since our data is already stationary from the get-go, we didn't perform any differencing, so our d remains 0.

> Our final ARIMA model will take the following parameters then **ARIMA(1,0,1).**

## Step 5. ARIMA time baby

In [None]:
from statsmodels.tsa.arima.model import ARIMA

In [None]:
# Fit the ARIMA model with (p, d, q) = (1, 0, 1) on the quantity series.
model = ARIMA(endog=_['quantity'], order=(1, 0, 1))
model_fit = model.fit()

In [None]:
# Display the fit report for the ARIMA(1,0,1) model fitted above.
model_fit.summary()

In [None]:
# Store the model's in-sample predictions next to the actual quantities,
# then plot the two columns together against the row index.
_['forecast'] = model_fit.predict()
_.loc[:, ['quantity', 'forecast']].plot(figsize=(12, 8))

## Step 6. ARIMA, but more efficient...

> This works kind of like **hyperparameter tuning**. Essentially, you test many combinations of ARIMA parameters, then select the combination that yields the best-performing ARIMA model.

In [None]:
from pmdarima import auto_arima
import warnings
# NOTE(review): this suppresses all Python warnings from here on, not only those raised by pmdarima.
warnings.filterwarnings('ignore')

In [None]:
# Let pmdarima search ARIMA(p, d, q) candidates with p and q capped at 7 and
# d fixed at 0; trace=True prints the candidates as they are fitted.
run_tests = auto_arima(
    _['quantity'],
    d=0,
    max_p=7,
    max_q=7,
    trace=True,
    suppress_warnings=True,
)
run_tests.summary()

In [None]:
# Refit with the order selected by auto_arima instead of a hand-copied tuple,
# so this cell stays in sync with the search result if the data or search changes.
best_order = run_tests.order  # (p, d, q) of the model auto_arima selected
model = ARIMA(_['quantity'], order=best_order)
model_fit = model.fit()

# Overwrite the forecast column with this model's in-sample predictions and plot
# them against the actual quantities.
_['forecast'] = model_fit.predict()
_[['quantity', 'forecast']].plot(figsize=(12, 8))

In [None]:
# Tag each row with the calendar month number of its order_date, then total
# the quantity per month number.
_['order_month'] = _['order_date'].dt.month
monthly_sales = _.groupby('order_month', as_index=False)['quantity'].sum()

# Labelled line chart of the monthly totals at the default figure size.
# `fig` is the Figure itself (plt.plot would return a list of Line2D objects).
fig, ax = plt.subplots()
ax.plot(monthly_sales['order_month'], monthly_sales['quantity'])
ax.set(title='Total quantity by month', xlabel='Month', ylabel='Quantity')
plt.show()

In [None]:
nan