# Accounting for seasonality

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose, STL
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima_process import ArmaProcess
from statsmodels.graphics.gofplots import qqplot
from statsmodels.tsa.stattools import adfuller
from tqdm.notebook import tqdm
from itertools import product
from typing import Union

## Examining the SARIMA(p,d,q)(P,D,Q)m model
- SARIMA = Seasonal ARIMA = Seasonal Autoregressive Integrated Moving Average model

**The parameter m stands for the frequency.**
- spodaj tabela m parametrov (v SARIMA funkciji je označeno kot parameter s) glede na observacije in velikosti podatkov. Vrednost pove koliko naših vzorcev tvori periodo.

                     Minute           Hour       Day       Week         Year
    Monthly                                                              12 -> primer v nadaljevanju
    Daily                                                    7           365
    Hourly                                        24        168          8766
    Every minute                       60        1440      10080        525960
    Every second        60            3600       86400     604800       31557600

In [None]:
# Read the air-passengers CSV into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='data/air-passengers.csv')
df.head(n=5)

In [None]:
# Line chart of the passenger counts against the 'Month' column.
fig, ax = plt.subplots()
ax.plot(df['Month'], df['Passengers'])
ax.set(xlabel='Date', ylabel='Number of air passengers')

# The data are recorded once per month -> m = 12, so one tick every 12 rows,
# labelled with the years 1949..1961.
plt.xticks(
    np.arange(0, 145, 12),
    np.arange(1949, 1962, 1),
)

fig.autofmt_xdate()
plt.tight_layout()

In [None]:
# Same line chart, with a circle on every 12th point starting at row 6
# (presumably July of each year, if the series starts in January - confirm).
fig, ax = plt.subplots()
ax.plot(
    df['Month'],
    df['Passengers'],
    markevery=np.arange(6, 145, 12),
    marker='o',
)
ax.set(xlabel='Date', ylabel='Number of air passengers')

# One tick every 12 rows, labelled with the years 1949..1961.
plt.xticks(
    np.arange(0, 145, 12),
    np.arange(1949, 1962, 1),
)

fig.autofmt_xdate()
plt.tight_layout()

## Identifying seasonal patterns in a time series

**Usually, plotting the time series data is enough to observe periodic patterns.**

In [None]:
# Line chart with a dashed vertical line every 12 rows, so the pattern that
# repeats inside each 12-row window is easy to compare.
fig, ax = plt.subplots()
ax.plot(df['Month'], df['Passengers'])

i = 0
while i < 145:
    ax.axvline(x=i, linestyle='--', color='black', linewidth=1)
    i += 12

ax.set(xlabel='Date', ylabel='Number of air passengers')

# One tick every 12 rows, labelled with the years 1949..1961.
plt.xticks(
    np.arange(0, 145, 12),
    np.arange(1949, 1962, 1),
)

fig.autofmt_xdate()
plt.tight_layout()

**Time series decomposition (dekompozicija časovne vrste) is
a statistical task that separates the time series into its three main components: a trend
component, a seasonal component, and the residuals.**

- **The trend component** represents the long-term change in the time series. This
component is responsible for time series that increase or decrease over time.
- **The seasonal component** is, of course, the seasonal pattern in the time series. It represents
repeated fluctuations that occur over a fixed period of time.
- **The residuals**, or the noise, express any irregularity that cannot be explained by the trend or the seasonal
component.

In [None]:
# STL (from statsmodels) decomposes the series using a period of 12 rows.
decomposition = STL(df['Passengers'], period=12).fit()

# Four stacked panels sharing the x axis: observed, trend, seasonal, residuals.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(nrows=4, ncols=1, sharex=True, figsize=(10,8))

panels = (
    (ax1, decomposition.observed, 'Observed'),
    (ax2, decomposition.trend, 'Trend'),
    (ax3, decomposition.seasonal, 'Seasonal'),
    (ax4, decomposition.resid, 'Residuals'),
)
for panel_ax, component, panel_label in panels:
    panel_ax.plot(component)
    panel_ax.set_ylabel(panel_label)

# One tick every 12 rows, labelled with the years 1949..1961.
plt.xticks(
    np.arange(0, 145, 12),
    np.arange(1949, 1962, 1),
)

fig.autofmt_xdate()
plt.tight_layout()

# If the data contained no seasonality, the Seasonal panel would be a flat line.

## Forecasting the number of monthly air passengers

Novo - poleg parametra d imamo tudi parameter D (sezonski)

<img src="images/tsf_08.png">

### Forecasting with an ARIMA(p,d,q) model
- poglejmo kako tipično sezonske podatke napove ARIMA model

In [None]:
# Augmented Dickey-Fuller test on the raw series; element 0 of the result is
# the test statistic and element 1 is the p-value.
ad_fuller_result = adfuller(df['Passengers'])

print(
    f'ADF Statistic: {ad_fuller_result[0]}',
    f'p-value: {ad_fuller_result[1]}',
    sep='\n',
)

In [None]:
# First-order difference of the series (y_t - y_{t-1}), then re-run the ADF test.
df_diff = np.diff(df['Passengers'], n=1)
ad_fuller_result = adfuller(df_diff)

print(
    f'ADF Statistic: {ad_fuller_result[0]}',
    f'p-value: {ad_fuller_result[1]}',
    sep='\n',
)

In [None]:
# Seasonal (lag-12) difference of the first-differenced series:
# each value minus the value 12 steps earlier.
# np.diff(x, n=12) would instead apply first differencing 12 times in a row
# (12th-order differencing), which is not a seasonal difference.
# NOTE(review): the ARIMA grid search in this section uses d=2, D=0. If this
# cell is meant to justify d=2, the matching check is np.diff(df_diff, n=1) - confirm.
df_diff_seasonal_diff = df_diff[12:] - df_diff[:-12]

ad_fuller_result = adfuller(df_diff_seasonal_diff)

print(f'ADF Statistic: {ad_fuller_result[0]}')
print(f'p-value: {ad_fuller_result[1]}')

In [None]:
# Line chart of the full series with rows 132..143 (the last 12 rows, later
# used as the test set) shaded in grey.
fig, ax = plt.subplots()

ax.plot(df['Month'], df['Passengers'])
ax.set_xlabel('Date')
# Fixed: the label previously ended with a stray ')'.
ax.set_ylabel('Number of air passengers')
ax.axvspan(132, 143, color='#808080', alpha=0.2)

# One tick every 12 rows, labelled with the years 1949..1961.
plt.xticks(np.arange(0, 145, 12), np.arange(1949, 1962, 1))

fig.autofmt_xdate()
plt.tight_layout()

In [None]:
def optimize_SARIMA(endog: Union[pd.Series, list], order_list: list, d: int, D: int, s: int) -> pd.DataFrame:
    """Fit one SARIMAX model per (p, q, P, Q) combination and rank them by AIC.

    Parameters
    ----------
    endog : pd.Series or list
        Series the models are fitted on.
    order_list : list
        Candidate orders; each item is indexed as (p, q, P, Q).
    d : int
        Non-seasonal differencing order, shared by all candidates.
    D : int
        Seasonal differencing order, shared by all candidates.
    s : int
        Seasonal period passed as the last element of ``seasonal_order``.

    Returns
    -------
    pd.DataFrame
        Columns ``'(p,q,P,Q)'`` and ``'AIC'``, sorted by ascending AIC (lower
        is better). Combinations whose fit raised are left out; if none
        succeeded, the frame is empty but still has both columns.
    """
    results = []

    for order in tqdm(order_list):
        try:
            model = SARIMAX(
                endog,
                order=(order[0], d, order[1]),
                seasonal_order=(order[2], D, order[3], s),
                simple_differencing=False).fit(disp=False)
        except Exception:
            # Skip combinations that cannot be fitted. Catching Exception rather
            # than using a bare except lets KeyboardInterrupt / SystemExit
            # through, so a long grid search can still be interrupted.
            continue

        results.append([order, model.aic])

    # Passing the column names to the constructor also works when `results` is
    # empty; assigning `.columns` afterwards raises a length-mismatch error then.
    result_df = pd.DataFrame(results, columns=['(p,q,P,Q)', 'AIC'])

    # Sort in ascending order, lower AIC is better
    return result_df.sort_values(by='AIC', ascending=True).reset_index(drop=True)

In [None]:
import warnings

# Suppress every warning from this point on.
warnings.filterwarnings('ignore')

# Candidate non-seasonal orders p, q in 0..12. The seasonal orders P and Q are
# pinned to 0 so that only plain ARIMA models are tested.
ps = range(13)
qs = range(13)
Ps = [0]
Qs = [0]

d, D, s = 2, 0, 12

ARIMA_order_list = list(product(ps, qs, Ps, Qs))

# Training set: everything except the last 12 observations.
train = df['Passengers'][:-12]

ARIMA_result_df = optimize_SARIMA(train, ARIMA_order_list, d, D, s)
ARIMA_result_df

Glede na zgornje rezultate: ARIMA(11,2,3)

In [None]:
# Fit the selected ARIMA(11,2,3) on the training set and show the residual
# diagnostics panel.
ARIMA_model = SARIMAX(
    train,
    order=(11, 2, 3),
    simple_differencing=False,
)
ARIMA_model_fit = ARIMA_model.fit(disp=False)
ARIMA_model_fit.plot_diagnostics(figsize=(10, 8));

In [None]:
# Ljung-Box test on the ARIMA residuals for lags 1..10; the p-values are read
# from the 'lb_pvalue' column of the result.
# (acorr_ljungbox is already imported in the notebook's import cell, so the
# duplicate import that used to sit here was dropped.)
residuals = ARIMA_model_fit.resid
res = acorr_ljungbox(residuals, np.arange(1, 11, 1))
print(list(res["lb_pvalue"]))

In [None]:
# Hold-out set: the last 12 rows. .copy() makes `test` an independent frame, so
# the columns added to it are not written to a slice of `df` (which triggers
# pandas' SettingWithCopyWarning and has ambiguous semantics).
test = df.iloc[-12:].copy()

# Naive seasonal baseline: reuse the 12 observations at positions 120..131.
# .values drops the index so the numbers are assigned positionally.
test['naive_seasonal'] = df['Passengers'].iloc[120:132].values
test

In [None]:
# Predicted mean of the ARIMA model for positions 132..143
# (presumably the 12 test rows, assuming the series has 144 rows - confirm).
ARIMA_pred = ARIMA_model_fit.get_prediction(start=132, end=143).predicted_mean
test['ARIMA_pred'] = ARIMA_pred

- dobljeni rezultat bomo v nadaljevanju primerjali s predikcijo SARIMA modela.

### Forecasting with a SARIMA(p,d,q)(P,D,Q)m model

In [None]:
# Augmented Dickey-Fuller test on the raw series; element 0 of the result is
# the test statistic and element 1 is the p-value.
ad_fuller_result = adfuller(df['Passengers'])
print(
    f'ADF Statistic: {ad_fuller_result[0]}',
    f'p-value: {ad_fuller_result[1]}',
    sep='\n',
)

In [None]:
# First-order difference of the series (y_t - y_{t-1}), then re-run the ADF test.
df_diff = np.diff(df['Passengers'], n=1)
ad_fuller_result = adfuller(df_diff)
print(
    f'ADF Statistic: {ad_fuller_result[0]}',
    f'p-value: {ad_fuller_result[1]}',
    sep='\n',
)

# The data are still not stationary.

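V tem primeru uporabimo sezonsko diferenciranje -> zamik (lag) diferenciranja je enak periodi sezonskosti m (tukaj 12): od vsake vrednosti odštejemo vrednost izpred 12 korakov, y_t - y_{t-12}. (To ni isto kot diferenciranje 12. reda.)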

In [None]:
# SEASONAL DIFFERENCE: each value of the first-differenced series minus the
# value 12 steps earlier (lag equal to the seasonal period m = 12).
# np.diff(x, n=12) would instead apply first differencing 12 times in a row
# (12th-order differencing), which is not a seasonal difference.
df_diff_seasonal_diff = df_diff[12:] - df_diff[:-12]

ad_fuller_result = adfuller(df_diff_seasonal_diff)
print(f'ADF Statistic: {ad_fuller_result[0]}')
print(f'p-value: {ad_fuller_result[1]}')

- m = 12... sezonskost, perioda (glej tabelo zgoraj)
- d = 1 ... navaden diff ... df_diff = np.diff(df['Passengers'], n=1)
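- D = 1 ... sezonski diff ... razlika z zamikom m = 12 (y_t - y_{t-12}), torej df_diff[12:] - df_diff[:-12]; pozor: np.diff(df_diff, n=12) je diferenciranje 12. reda in ne sezonski diff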

In [None]:
# Candidate orders: p, q, P and Q each range over 0..3.
ps = range(4)
qs = range(4)
Ps = range(4)
Qs = range(4)

SARIMA_order_list = list(product(ps, qs, Ps, Qs))

# Training set: everything except the last 12 observations.
train = df['Passengers'][:-12]

# One regular difference, one seasonal difference, seasonal period 12.
d, D, s = 1, 1, 12

SARIMA_result_df = optimize_SARIMA(train, SARIMA_order_list, d, D, s)
SARIMA_result_df

In [None]:
# Fit SARIMA(2,1,1)(1,1,2,12) on the training set and show the residual
# diagnostics panel. order = (p, d, q); seasonal_order = (P, D, Q, s=m).
SARIMA_model = SARIMAX(
    train,
    order=(2, 1, 1),
    seasonal_order=(1, 1, 2, 12),
    simple_differencing=False,
)
SARIMA_model_fit = SARIMA_model.fit(disp=False)
SARIMA_model_fit.plot_diagnostics(figsize=(10, 8));

In [None]:
# Ljung-Box test on the SARIMA residuals for lags 1..10; the p-values are read
# from the 'lb_pvalue' column of the result.
# (acorr_ljungbox is already imported in the notebook's import cell, so the
# duplicate import that used to sit here was dropped.)
residuals = SARIMA_model_fit.resid
res = acorr_ljungbox(residuals, np.arange(1, 11, 1))
print(list(res["lb_pvalue"]))

In [None]:
# Predicted mean of the SARIMA model for positions 132..143
# (presumably the 12 test rows, assuming the series has 144 rows - confirm).
SARIMA_pred = SARIMA_model_fit.get_prediction(start=132, end=143).predicted_mean
test['SARIMA_pred'] = SARIMA_pred
test

In [None]:
# Overlay the actual test values and the three forecasts on the full series,
# then zoom the x axis to positions 120..143.
fig, ax = plt.subplots()

ax.plot(df['Month'], df['Passengers'])

# (column in `test`, matplotlib format string, legend label)
forecast_lines = (
    ('Passengers', 'b-', 'actual'),
    ('naive_seasonal', 'r:', 'naive seasonal'),
    ('ARIMA_pred', 'k--', 'ARIMA(11,2,3)'),
    ('SARIMA_pred', 'g-.', 'SARIMA(2,1,1)(1,1,2,12)'),
)
for column, line_format, legend_label in forecast_lines:
    ax.plot(test[column], line_format, label=legend_label)

ax.set(xlabel='Date', ylabel='Number of air passengers')
# Grey band over positions 132..143.
ax.axvspan(132, 143, color='#808080', alpha=0.2)

ax.legend(loc=2)

# One tick every 12 rows, labelled with the years 1949..1961.
plt.xticks(
    np.arange(0, 145, 12),
    np.arange(1949, 1962, 1),
)
ax.set_xlim(120, 143)

fig.autofmt_xdate()
plt.tight_layout()

In [None]:
def mape(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y_true``.

    Computed as the mean of ``|(y_true - y_pred) / y_true|`` multiplied by 100.
    """
    relative_error = (y_true - y_pred) / y_true
    absolute_relative_error = np.abs(relative_error)
    return np.mean(absolute_relative_error) * 100

In [None]:
# MAPE of each forecast column against the actual passenger counts in the test set.
mape_naive_seasonal, mape_ARIMA, mape_SARIMA = (
    mape(test['Passengers'], test[forecast_column])
    for forecast_column in ('naive_seasonal', 'ARIMA_pred', 'SARIMA_pred')
)

print(mape_naive_seasonal, mape_ARIMA, mape_SARIMA)

In [None]:
# Bar chart comparing the MAPE of the three forecasting approaches.
fig, ax = plt.subplots()

x = ['naive seasonal', 'ARIMA(11,2,3)', 'SARIMA(2,1,1)(1,1,2,12)']
y = [mape_naive_seasonal, mape_ARIMA, mape_SARIMA]

ax.bar(x, y, width=0.4)
ax.set(xlabel='Models', ylabel='MAPE (%)', ylim=(0, 15))

# Write each value, rounded to two decimals, one unit above its bar.
for index, value in zip(range(len(y)), y):
    ax.text(index, value + 1, str(round(value, 2)), ha='center')

plt.tight_layout()

## Summary

- The seasonal autoregressive integrated moving average model, denoted as
SARIMA(p,d,q)(P,D,Q)m, adds seasonal properties to the ARIMA(p,d,q) model.
- P is the order of the seasonal autoregressive process, D is the order of seasonal
integration, Q is the order of the seasonal moving average process, and m is the
frequency of the data.
- The frequency m corresponds to the number of observations in a cycle. If the
data is collected every month, then m = 12. If data is collected every quarter,
then m = 4.
- Time series decomposition can be used to identify seasonal patterns in a time
series.