# <center>Class 18: Forecasting Time Series</center>
# <center>Part Two:  Stochastic Modelling</center>

In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import sys
import statsmodels
import patsy
import statsmodels.formula.api as smf
import warnings
from sklearn.metrics import mean_squared_error

warnings.filterwarnings("ignore")

In [None]:
try:
    from pmdarima.arima import auto_arima
except:
    !pip install pmdarima
    from pmdarima.arima import auto_arima

In [None]:
%matplotlib inline

## Data

In [None]:
# Location of the CSV, one directory above the notebook, in `data/`;
# os.sep keeps the separator correct for the current operating system.
path = os.sep.join([os.pardir, "data", "homeprices-data-2000-2018.csv"])
path

In [None]:
# DATA IMPORT - FROM FILE
df = pd.read_csv(path)

In [None]:
df.head()

In [None]:
df.info()

### EDA

In [None]:
# Keep only the first seven characters of the date string and shorten the
# names of the price, unemployment and employment columns.
df = (
    df.assign(date=df["date"].str.slice(0, 7))
    .rename(columns={"pn": "p", "us": "u", "emps": "emp"})
)

In [None]:
# Chronological order is required by the diff/shift features built next.
df = df.sort_values(by="date")

In [None]:
# Derived series used by the models: first differences, one-period lags,
# logs and log differences, plus a linear trend counter starting at 1.
# Later entries may refer to columns created by earlier entries.
df = df.assign(
    dp=lambda d: d["p"].diff(1),
    p_lag=lambda d: d["p"].shift(1),
    lnp=lambda d: np.log(d["p"]),
    dlnp=lambda d: d["lnp"].diff(1),
    lnp_lag=lambda d: d["lnp"].shift(1),
    dlnp_lag=lambda d: d["dlnp"].shift(1),
    du=lambda d: d["u"].diff(1),
    lnemp=lambda d: np.log(d["emp"]),
    dlnemp=lambda d: d["lnemp"].diff(1),
    trend=lambda d: range(1, d.shape[0] + 1),
)

In [None]:
df.tail()

In [None]:
pd.to_datetime(df.date)

In [None]:
# Level of the price index over time.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(pd.to_datetime(df.date), df.p)
ax.set_ylabel('Case-shiller Price index')
ax.grid(True, linestyle=':');

**First difference of price index**

In [None]:
# Month-on-month change of the price index, with a zero reference line.
dates = pd.to_datetime(df.date)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(dates, df.dp)
ax.set_ylabel('first difference of the price index')
ax.hlines(0, xmin=dates.min(), xmax=dates.max(), color='k')
ax.grid(True, linestyle=':');

Log difference of price index

In [None]:
# Log first difference of the price index, with a zero reference line.
dates = pd.to_datetime(df.date)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(dates, df.dlnp)
ax.set_ylabel('log first difference of the price index')
ax.hlines(0, xmin=dates.min(), xmax=dates.max(), color='k')
ax.grid(True, linestyle=':');

**Employment**

In [None]:
# Employment level over time.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(pd.to_datetime(df.date), df.emp)
ax.set_ylabel('employment (in thousands)')
ax.grid(True, linestyle=':');

Log diff employment

In [None]:
# Log change in employment, with a zero reference line.
dates = pd.to_datetime(df.date)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(dates, df.dlnemp)
ax.set_ylabel('log change in employment')
ax.hlines(0, xmin=dates.min(), xmax=dates.max(), color='k')
ax.grid(True, linestyle=':');

Unemployment rate

In [None]:
# Unemployment rate over time.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(pd.to_datetime(df.date), df.u)
ax.set_ylabel('unemployment rate (in pct)')
ax.grid(True, linestyle=':');

Unemployment 1st diff

In [None]:
# Change in the unemployment rate, with a zero reference line.
dates = pd.to_datetime(df.date)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(dates, df.du)
ax.set_ylabel('change in the unemployment rate')
ax.hlines(0, xmin=dates.min(), xmax=dates.max(), color='k')
ax.grid(True, linestyle=':');

### Create work set and holdout set

- we start after the [GFC](https://en.wikipedia.org/wiki/2007%E2%80%932008_financial_crisis)

In [None]:
df["date"] = pd.to_datetime(df["date"])

**create work and holdout sets**

In [None]:
# Holdout set: observations from 2018. Work set: everything before 2018.
# .copy() detaches both frames from `df`; the fold-indicator columns added to
# df_work afterwards are then written to a real frame instead of to a slice
# (which triggers pandas' SettingWithCopyWarning).
df_holdout = df[df.year == 2018].copy()
df_work = df[df.year < 2018].copy()

**create training and test sets for the cross-validation folds** (the loop below builds five folds; the models use folds 1–4)

In [None]:
# Boolean fold indicators on the work set: fold k (= test year - 2012) tests on
# one calendar year and trains on the years from 13 years before it up to the
# year before it. Folds 1-5 are created here; the model loops use folds 1-4.
for test_year in range(2013, 2018):
    k = test_year - 2012
    years = df_work["year"]
    df_work[f"test{k}"] = years == test_year
    df_work[f"train{k}"] = years.between(test_year - 13, test_year - 1)

In [None]:
df_work.columns

In [None]:
df_work[df_work.train1]

In [None]:
df_work[df_work.test1]

In [None]:
df_work[df_work.train2]

In [None]:
df_work[df_work.test2]

### Modelling

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.api import VAR

**Model M1: OLS on trend & seasonality**

In [None]:
# M1: cross-validated OLS of the price level on a linear trend and month dummies.
mse_1 = []
for i in range(1, 5):

    # rows flagged as training / test for fold i
    df_train = df_work[df_work[f"train{i}"] == 1]
    df_test = df_work[df_work[f"test{i}"] == 1]

    model1 = smf.ols(formula="p ~ trend + C(month)", data=df_train).fit()

    phat = model1.predict(df_test)

    # squared forecast errors on the fold's test rows
    errsq = (df_test["p"].to_numpy() - phat) ** 2

    mse_1.append(errsq.mean())

# CV RMSE = square root of the average fold MSE
rmse_cv_m1 = np.sqrt(np.mean(mse_1))

In [None]:
rmse_cv_m1

In [None]:
print(model1.summary())

**Model M2: simple ARIMA(1,1,2)**

- get order from auto_arima

In [None]:
# Automatic order selection for a non-seasonal ARIMA on the price level.
# The AR order is capped at 1; without this constraint a higher AR order is returned.
auto_arima_m2 = auto_arima(
    df_work["p"],
    seasonal=False,
    start_p=0,
    max_p=1,
)

In [None]:
auto_arima_m2

In [None]:
auto_arima_m2.get_params()

In [None]:
# M2: cross-validated ARIMA on the price level, order taken from auto_arima_m2.
mse_2 = []

for i in range(1, 5):

    # rows flagged as training / test for fold i
    df_train = df_work[df_work[f"train{i}"] == 1]
    df_test = df_work[df_work[f"test{i}"] == 1]

    model2 = ARIMA(endog=df_train["p"], order=auto_arima_m2.get_params()["order"]).fit()

    # forecast 12 steps beyond the end of the training sample
    phat = model2.forecast(steps=12)

    errsq = (df_test["p"].to_numpy() - phat) ** 2

    mse_2.append(errsq.mean())

# CV RMSE = square root of the average fold MSE
rmse_cv_m2 = np.sqrt(np.mean(mse_2))

In [None]:
rmse_cv_m2

**Model M3: p ARIMA(1,1,0) + seasonality (month dummies)**

- get order from auto_arima

How to create dummies with Pandas

In [None]:
pd.get_dummies(df_work.month).iloc[0:12]

In the backend calculations `numpy` cannot correctly handle booleans, so a type conversion is necessary.

In [None]:
pd.get_dummies(df_work.month).iloc[0:12].astype(int)

In [None]:
# Automatic order selection for the price level with month dummies as
# exogenous regressors; the AR order is capped at 1.
auto_arima_m3 = auto_arima(
    df_work["p"],
    X=pd.get_dummies(df_work["month"]).astype(int),
    start_p=0,
    max_p=1,
    seasonal=False,
)

In [None]:
auto_arima_m3.get_params()

In [None]:
# M3: cross-validated ARIMA on the price level with month dummies as exogenous
# regressors; the order comes from auto_arima_m3.
mse_3 = []
for i in range(1, 5):

    df_train = df_work.loc[lambda x: x["train" + str(i)] == 1]
    df_test = df_work.loc[lambda x: x["test" + str(i)] == 1]

    # Cast the dummies to int for both fitting and forecasting, matching the
    # auto_arima_m3 call and the M4/final-model cells, so the exogenous matrix
    # is numeric rather than boolean.
    model3 = ARIMA(
        df_train.p,
        exog=pd.get_dummies(df_train.month).astype(int),
        order=auto_arima_m3.get_params()["order"],
    ).fit()

    phat = model3.forecast(
        steps=12, exog=pd.get_dummies(df_test.month).astype(int)
    )

    errsq = np.square(df_test.p.values - phat)

    mse_3.append(np.mean(errsq))

# CV RMSE = square root of the average fold MSE
rmse_cv_m3 = np.sqrt(np.mean(mse_3))

In [None]:
rmse_cv_m3

In [None]:
print(model3.summary())

**Model M4: p ARIMA(2,0,0) + seasonality + trend**

In [None]:
# Regressor matrix for auto_arima: integer month dummies plus the linear trend.
# Column labels are converted to str because pmdarima cannot handle column
# names of mixed types.
X = (
    pd.get_dummies(df_work["month"])
    .astype(int)
    .rename(columns=str)
    .assign(trend=df_work["trend"])
)
X

In [None]:
# Automatic order selection for the price level with month dummies and trend
# (matrix X) as exogenous regressors; the AR order is capped at 2.
auto_arima_m4 = auto_arima(
    df_work["p"],
    X=X,
    start_p=0,
    max_p=2,
    seasonal=False,
)

In [None]:
auto_arima_m4.get_params()

In [None]:
# M4: cross-validated ARIMA on the price level with month dummies and a linear
# trend term; the order comes from auto_arima_m4.
mse_4 = []
for i in range(1, 5):

    df_train = df_work.loc[lambda x: x["train" + str(i)] == 1]
    df_test = df_work.loc[lambda x: x["test" + str(i)] == 1]

    model4 = ARIMA(
        df_train.p,
        exog = pd.get_dummies(df_train.month).astype(int),
        trend = 't', # 't' stands for a linear term
        order = auto_arima_m4.get_params()["order"],
    ).fit()

    # The forecast exog gets the same int cast as the exog used for fitting,
    # so both matrices have the same numeric dtype.
    phat = model4.forecast(
        steps=12,
        exog=pd.get_dummies(df_test.month).astype(int),
        trend="t",
    )

    errsq = np.square(df_test.p.values - phat)

    mse_4.append(np.mean(errsq))

# CV RMSE = square root of the average fold MSE
rmse_cv_m4 = np.sqrt(np.mean(mse_4))

In [None]:
rmse_cv_m4

In [None]:
print(model4.summary())

**Model M5: dp ~ month + trend, without any ARIMA**

In [None]:
# M5: cross-validated OLS of the price *change* on trend and month dummies;
# level forecasts are rebuilt by accumulating the predicted changes.
mse_5 = []
for i in range(1, 5):

    df_train = df_work.loc[lambda x: x["train" + str(i)] == 1]
    df_test = df_work.loc[lambda x: x["test" + str(i)] == 1]

    model5 = smf.ols("dp ~ trend + C(month)", df_train).fit()

    # predicted changes for the test rows
    dphat = model5.predict(df_test)

    # Level forecast = last training price + running sum of predicted changes.
    # A cumulative sum replaces the element-wise loop, which reused `i` (the
    # fold index) and wrote into an object-dtype column addressed by position.
    last_p = df_train["p"].values[-1]
    phat = np.cumsum(np.concatenate(([last_p], dphat.to_numpy())))[1:]

    # keep the forecast as a column, on an independent frame rather than a slice
    df_test = df_test.assign(phat=phat)

    errsq = np.square(df_test["p"] - df_test["phat"])

    mse_5.append(np.mean(errsq))

# CV RMSE = square root of the average fold MSE
rmse_cv_m5 = np.sqrt(np.mean(mse_5))

In [None]:
rmse_cv_m5

**Model M6: lnp ARIMA(1,2,2) + built-in seasonality using `auto_arima`**

In [None]:
# Automatic seasonal order selection (period 12) on the log price level.
# The differencing order is fixed at 2; without this constraint a different
# ARIMA order is returned.
auto_arima_m6 = auto_arima(
    df_work["lnp"],
    seasonal=True,
    m=12,
    d=2,
)

In [None]:
auto_arima_m6.get_params()

In [None]:
# M6: cross-validated seasonal ARIMA on the log price level, using both the
# non-seasonal and the seasonal order selected by auto_arima_m6.
mse_6 = []
for i in range(1, 5):

    df_train = df_work.loc[lambda x: x["train" + str(i)] == 1]
    df_test = df_work.loc[lambda x: x["test" + str(i)] == 1]

    # seasonal_order is passed as well: auto_arima_m6 was run with
    # seasonal=True, m=12, and using only `order` drops the seasonal part.
    model6 = ARIMA(
        df_train.lnp,
        order=auto_arima_m6.get_params()["order"],
        seasonal_order=auto_arima_m6.get_params()["seasonal_order"],
    ).fit()

    # the model is fitted without exogenous regressors, so none are passed here
    lnphat = model6.forecast(steps=12)

    # NOTE(review): the log-to-level correction term is computed from the
    # test-set errors, i.e. it uses the actual test values — confirm this is intended.
    corrb = mean_squared_error(df_test.lnp, lnphat)

    # back-transform the log forecast to the price level
    phat = np.exp((lnphat + corrb / 2))

    errsq = np.square(df_test.p.values - phat)

    mse_6.append(np.mean(errsq))

# CV RMSE = square root of the average fold MSE
rmse_cv_m6 = np.sqrt(np.mean(mse_6))

In [None]:
rmse_cv_m6

**Vector Autoregression**

In [None]:
# M7: cross-validated VAR(1) on the changes in price, unemployment rate and
# log employment; price-level forecasts are rebuilt from the predicted changes.
mse_var = []
for i in range(1, 5):

    # rows with any missing value are dropped
    df_train = df_work.loc[lambda x: x["train" + str(i)] == 1, :].dropna()
    df_test = df_work.loc[lambda x: x["test" + str(i)] == 1, :].dropna()

    model7 = VAR(df_train[["dp", "du", "dlnemp"]]).fit(1)

    # forecast from the last k_ar training observations; column 0 is dp
    dphat = model7.forecast(
        df_train[["dp", "du", "dlnemp"]].values[-model7.k_ar :], steps=12
    )[:, 0]

    # Level forecast = last training price + running sum of predicted changes.
    # A cumulative sum replaces the element-wise loop, which reused `i` (the
    # fold index) and wrote into an object-dtype column addressed by position.
    last_p = df_train["p"].values[-1]
    phat = np.cumsum(np.concatenate(([last_p], dphat)))[1:]
    df_test = df_test.assign(phat=phat)

    errsq = np.square(df_test["p"] - df_test["phat"])

    mse_var.append(np.mean(errsq))

# CV RMSE = square root of the average fold MSE
rmse_cv_m7 = np.sqrt(np.mean(mse_var))

In [None]:
model7.k_ar

In [None]:
df_train[["dp", "du", "dlnemp"]].values[-model7.k_ar :]

In [None]:
model7.forecast(
        df_train[["dp", "du", "dlnemp"]].values[-model7.k_ar :], steps=12
    )

In [None]:
model7.forecast(
        df_train[["dp", "du", "dlnemp"]].values[-model7.k_ar :], steps=12
    )[:, 0]

In [None]:
rmse_cv_m7

#### Summary

*Note: the CV RMSE of some models differs from the textbook.*

In [None]:
# Summary table: one row per model, fold-level RMSE (square root of each fold's
# MSE) in the Fold columns and the overall CV RMSE in `Average`.
pd.DataFrame(
    np.sqrt(
        np.array(
            [mse_1, mse_2, mse_3, mse_4, mse_5, mse_6, mse_var], dtype=float
        )
    ),
    columns=[f"Fold{k}" for k in range(1, 5)],
    index=pd.Index([f"M{k}" for k in range(1, 7)] + ["M7 (var)"], name="model"),
).assign(
    Average=[
        rmse_cv_m1,
        rmse_cv_m2,
        rmse_cv_m3,
        rmse_cv_m4,
        rmse_cv_m5,
        rmse_cv_m6,
        rmse_cv_m7,
    ]
).round(2)

### Predict for holdout

**The best model is M4.**

In [None]:
auto_arima_m4.get_params()

- What's inside the model?

In [None]:
print(auto_arima_m4.summary())

In [None]:
auto_arima_m4.plot_diagnostics(figsize = (9,7));

- Re-estimate best models on full work set

In [None]:
# Re-fit the M4 specification (month dummies, linear trend, auto_arima_m4 order)
# on the complete work set.
model_final = ARIMA(
    endog=df_work["p"],
    exog=pd.get_dummies(df_work["month"]).astype(int),
    order=auto_arima_m4.get_params()["order"],
    trend="t",
).fit()

In [None]:
print(model_final.summary())

In [None]:
# 12-step forecast from the final model, with the holdout rows' month dummies
# as exogenous regressors.
pred_final = model_final.get_forecast(
    12,
    exog=pd.get_dummies(df_holdout["month"]).astype(int),
    trend="t",
)

In [None]:
pred_final.predicted_mean

In [None]:
pred_final.conf_int()

In [None]:
# Holdout forecast table: point prediction plus the interval bounds returned by
# conf_int(alpha=0.2), joined on the index and reduced to four columns.
forecast_holdout_best = (
    df_holdout.join(pred_final.conf_int(alpha=0.2))
    .assign(model="best", p_pred=pred_final.predicted_mean.values)
    .filter(items=["model", "p_pred", "lower p", "upper p"])
)

In [None]:
forecast_holdout_best

In [None]:
# Full data joined with the holdout forecast (by index), from 2015 onwards.
df_plot = df.join(forecast_holdout_best).query("year >= 2015")

In [None]:
# Actual index vs. holdout prediction, with the prediction interval shaded.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(df_plot.date, df_plot.p, color='k')
ax.plot(df_plot.date, df_plot.p_pred, color='darkblue', linestyle='--')
ax.fill_between(
    df_plot.date, df_plot['lower p'], df_plot['upper p'], color='indianred', alpha=0.5
)
ax.legend(
    ['actual', 'predicted', 'prediction interval'],
    loc='upper left',
    labelcolor=['k', 'darkblue', 'indianred'],
)
ax.set_ylabel('2000 = 100')
ax.grid(True, linestyle=':')
ax.set_yticks(range(220, 340, 10))
ax.set_title('Case-Shiller Home Price Index: Actual vs Prediction');

In [None]:
# Squared forecast errors of the final model on the holdout set.
errsq = np.square(df_holdout.p.values - forecast_holdout_best.p_pred)

# RMSE is the square root of the mean squared error; without the square root
# this value would be the MSE and not comparable with the CV RMSE table.
rmse_holdout = np.sqrt(np.mean(errsq))
rmse_holdout