### Chapter 18
**CH18B Forecasting a home price index**

using the case-shiller-la dataset

version 1.1 2024-01-11

In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
from mizani.breaks import date_breaks
from mizani.formatters import date_format
from plotnine import *
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import sys
import statsmodels
import patsy
from pmdarima.arima import auto_arima
import statsmodels.formula.api as smf
import warnings
from sklearn.metrics import mean_squared_error

warnings.filterwarnings("ignore")

In [None]:
%matplotlib inline

### Get Data

In [None]:
# Working directory of the notebook and the repository root derived from it.
current_path = os.getcwd()
# Everything before the first 'Ch18' in the path is taken as the repository root;
# when 'Ch18' does not occur, the whole working directory is used.
repository_path = current_path.partition('Ch18')[0]

In [None]:
# Add the repository's utils folder to sys.path so modules stored there can be imported
# (presumably where py_helper_functions lives - verify against the repository layout).
# Note: os.path.join() creates a string with the right syntax for defining a path for your operating system.
sys.path.append(os.path.join(repository_path, 'utils'))

In [None]:
# Define data folder (local copy of the data inside the repository)
# NOTE(review): data_path is not used by the cells visible in this notebook; the CSV is read from GitHub instead.
data_path = os.path.join(repository_path, 'data')

In [None]:
# Import the prewritten helper functions
from py_helper_functions import *

In [None]:
# DATA IMPORT - FROM GITHUB
# The URL is split over two literals only for readability; the resulting string is unchanged.
homeprices_url = (
    'https://raw.githubusercontent.com/peterduronelly/DA3-Python-Codes/'
    'main/data/homeprices-data-2000-2018.csv'
)
data = pd.read_csv(homeprices_url)

In [None]:
data.head()

In [None]:
data.info()

### EDA

In [None]:
# Keep the first seven characters of the date string (presumably the YYYY-MM part)
# and shorten the raw column names used throughout the notebook.
data = (
    data
    .assign(date=data["date"].str.slice(0, 7))
    .rename(columns={"pn": "p", "us": "u", "emps": "emp"})
)

In [None]:
# Order the observations by date; the lag and difference columns built next rely on row order.
data = data.sort_values(by=["date"])

In [None]:
# Derived series: first differences, one-period lags and logs of the price index,
# unemployment and employment, plus a 1..N linear trend counter.
# assign() evaluates its keywords in order, so later columns can use earlier ones.
data = data.assign(
    dp=lambda d: d["p"].diff(1),
    p_lag=lambda d: d["p"].shift(1),
    lnp=lambda d: np.log(d["p"]),
    dlnp=lambda d: d["lnp"].diff(1),
    lnp_lag=lambda d: d["lnp"].shift(1),
    dlnp_lag=lambda d: d["dlnp"].shift(1),
    du=lambda d: d["u"].diff(1),
    lnemp=lambda d: np.log(d["emp"]),
    dlnemp=lambda d: d["lnemp"].diff(1),
    trend=range(1, data.shape[0] + 1),
)

In [None]:
data.tail()

In [None]:
# Parse the date strings into timestamps and store them back on the frame.
# Previously the parsed result was only displayed and then discarded, so the
# plots below applied scale_x_date to a column of plain strings.
data["date"] = pd.to_datetime(data["date"])
data["date"]

In [None]:
# Date-axis range and a yearly break generator; both names are reused by the plot cells that follow.
limits = datetime(2000, 1, 1), datetime(2019, 1, 1)
breaks = date_breaks("1 year")

# Level of the price index over time.
# `color` and `seq` are not defined in this notebook (presumably provided by py_helper_functions).
price_index_plot = (
    ggplot(data, aes(x="date", y="p", group=1))
    + geom_line(color=color[0], size=1)
    + scale_y_continuous(limits=[50, 300], breaks=seq(50, 301, 50))
    # keep every third yearly break on the x axis
    + scale_x_date(breaks=breaks(limits)[::3], labels=date_format("%b%Y"))
    + labs(y="Case-shiller Price index", x="Date (month)")
    + theme_bw()
)
price_index_plot

In [None]:
# Price index level over time, drawn with matplotlib's explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(pd.to_datetime(data.date), data.p)
ax.set_ylabel('Case-shiller Price index')
ax.grid(True);

**First difference of price index**

In [None]:
# Month-to-month change (first difference) of the price index.
# Uses `limits` and `breaks` defined in an earlier plot cell.
dp_plot = (
    ggplot(data, aes(x="date", y="dp", group=1))
    + geom_line(color=color[0], size=1)
    + scale_y_continuous(limits=[-10, 8], breaks=seq(-10, 9, 2))
    + scale_x_date(breaks=breaks(limits)[::3], labels=date_format("%b%Y"))
    + labs(y="First difference of price index", x="Date (month)")
    + theme_bw()
)
dp_plot

In [None]:
# First difference of the price index with a horizontal reference line at zero.
dates = pd.to_datetime(data.date)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(dates, data.dp)
ax.set_ylabel('first difference of the price index')
ax.hlines(0, xmin=dates.min(), xmax=dates.max(), color='k')
ax.grid(True);

Log difference of price index

In [None]:
# Redefine the date-axis range and break generator for the remaining EDA plots.
# NOTE(review): the upper limit here is 2018, whereas the earlier price plots used 2019 - confirm this is intended.
limits = datetime(2000, 1, 1), datetime(2018, 1, 1)
breaks = date_breaks("1 year")

# First difference of the log price index.
dlnp_plot = (
    ggplot(data, aes(x="date", y="dlnp", group=1))
    + geom_line(color=color[0], size=1)
    + scale_y_continuous(limits=[-0.04, 0.04], breaks=seq(-0.04, 0.05, 0.01))
    + scale_x_date(breaks=breaks(limits)[::3], labels=date_format("%b%Y"))
    + labs(y="Log first difference of price index", x="Date (month)")
    + theme_bw()
)
dlnp_plot

**Employment**

In [None]:
# Employment level over time (column `emp`); reuses `limits` and `breaks` from an earlier cell.
emp_plot = (
    ggplot(data, aes(x="date", y="emp", group=1))
    + geom_line(color=color[0], size=1)
    + scale_x_date(breaks=breaks(limits)[::3], labels=date_format("%b%Y"))
    + labs(y="Employment (in thousands)", x="Date (month)")
    + theme_bw()
)
emp_plot

Log diff employment

In [None]:
# First difference of log employment (column `dlnemp`).
# Note that the variable is named lnemp_plot although it shows the log change.
lnemp_plot = (
    ggplot(data, aes(x="date", y="dlnemp", group=1))
    + geom_line(color=color[0], size=1)
    + scale_x_date(breaks=breaks(limits)[::3], labels=date_format("%b%Y"))
    + labs(y="Log change in employment", x="Date (month)")
    + theme_bw()
)

lnemp_plot

Unemployment rate

In [None]:
# Unemployment rate level over time (column `u`).
u_plot = (
    ggplot(data, aes(x="date", y="u", group=1))
    + geom_line(color=color[0], size=1)
    + scale_x_date(breaks=breaks(limits)[::3], labels=date_format("%b%Y"))
    + labs(y="Unemployment rate (percent)", x="Date (month)")
    + theme_bw()
)

u_plot

Unemployment 1st diff

In [None]:
# Month-to-month change in the unemployment rate (column `du`).
du_plot = (
    ggplot(data, aes(x="date", y="du", group=1))
    + geom_line(color=color[0], size=1)
    + scale_x_date(breaks=breaks(limits)[::3], labels=date_format("%b%Y"))
    + labs(y="Change in unemployment rate", x="Date (month)")
    + theme_bw()
)

du_plot

### Create work set and holdout set

- we start after the [GFC](https://en.wikipedia.org/wiki/2007%E2%80%932008_financial_crisis)

In [None]:
# Store the date column as pandas timestamps so the forecast plots further down get a real date axis.
data["date"] = pd.to_datetime(data["date"])

**create work and holdout sets**

In [None]:
# Holdout set: year 2018; work set: all earlier years.
# .copy() gives each subset its own data, so the fold-indicator columns added to
# data_work afterwards are not written into a slice of `data`
# (which triggers pandas' SettingWithCopyWarning and is unreliable).
data_holdout = data[data.year == 2018].copy()
data_work = data[data.year < 2018].copy()

**create training and test sets for 4 folds**

In [None]:
# One (test, train) pair of boolean indicator columns per fold:
# the test set is a single calendar year, the training set the 13 years before it.
# NOTE(review): this creates five folds (test years 2013-2017), while the model
# cells in this notebook only loop over folds 1-4 - confirm which is intended.
for fold, year in enumerate(range(2013, 2018), start=1):
    in_test = data_work["year"] == year
    in_train = data_work["year"].between(year - 13, year - 1)
    data_work["test" + str(fold)] = in_test
    data_work["train" + str(fold)] = in_train

In [None]:
data_work.columns

In [None]:
data_work[data_work.train1]

In [None]:
data_work[data_work.test1]

In [None]:
data_work[data_work.train2]

In [None]:
data_work[data_work.test2]

### Modelling

In [None]:
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.api import VAR

**Model M1: OLS on trend & seasonality**

In [None]:
# M1: OLS of the price level on a linear trend and month dummies,
# scored by the mean squared forecast error on each of the four test years.
mse_1 = []
for i in range(1, 5):
    train_data = data_work[data_work["train" + str(i)] == 1]
    test_data = data_work[data_work["test" + str(i)] == 1]

    model1 = smf.ols("p ~ trend + C(month)", train_data).fit()
    phat = model1.predict(test_data)

    errsq = (test_data.p.values - phat) ** 2
    mse_1.append(np.mean(errsq))

# Cross-validated RMSE: square root of the average fold MSE.
rmse_cv_m1 = np.sqrt(np.mean(mse_1))

**Model M2: p ARIMA(1,1,2)**

- get order from auto_arima

In [None]:
# Let auto_arima choose the non-seasonal (p, d, q) order on the whole work set.
auto_arima_m2 = auto_arima(
    y=data_work.p,
    start_p=0,
    max_p=1,  # without this constraint, python returns a higher AR order
    seasonal=False,
)
# The selected order is the same for every fold, so look it up once.
order_m2 = auto_arima_m2.get_params()["order"]

mse_2 = []
for i in range(1, 5):
    train_data = data_work[data_work["train" + str(i)] == 1]
    test_data = data_work[data_work["test" + str(i)] == 1]

    # Re-fit on the fold's training window and forecast the 12 test months.
    model2 = ARIMA(train_data.p, order=order_m2).fit()
    phat = model2.forecast(steps=12)

    errsq = (test_data.p.values - phat) ** 2
    mse_2.append(np.mean(errsq))

rmse_cv_m2 = np.sqrt(np.mean(mse_2))

In [None]:
rmse_cv_m2

In [None]:
auto_arima_m2.get_params()

**Model M3: p ARIMA(1,1,0)**

- get order from auto_arima

How to create dummies with Pandas

In [None]:
pd.get_dummies(data_work.month).iloc[0:12]

In the backend calculations `numpy` cannot correctly handle booleans, so a type conversion is necessary.

In [None]:
pd.get_dummies(data_work.month).iloc[0:12].astype(int)

In [None]:
# Order selection for M3: price level with integer month dummies as exogenous regressors.
month_dummies_work = pd.get_dummies(data_work.month).astype(int)
auto_arima_m3 = auto_arima(y=data_work.p, X=month_dummies_work, seasonal=False)

In [None]:
auto_arima_m3.get_params()

In [None]:
# M3: ARIMA on the price level with month dummies as exogenous regressors,
# using the order selected by auto_arima_m3.
mse_3 = []
for i in range(1, 5):

    train_data = data_work.loc[lambda x: x["train" + str(i)] == 1]
    test_data = data_work.loc[lambda x: x["test" + str(i)] == 1]

    # Cast the dummies to int: get_dummies can return booleans, and the order was
    # selected on integer dummies, so the cross-validated fits should use the same.
    model3 = ARIMA(
        train_data.p,
        exog=pd.get_dummies(train_data.month).astype(int),
        order=auto_arima_m3.get_params()["order"],
    ).fit()

    # Forecast the 12 test months, supplying their month dummies in the same dtype.
    phat = model3.forecast(
        steps=12, exog=pd.get_dummies(test_data.month).astype(int)
    )

    errsq = np.square(test_data.p.values - phat)

    mse_3.append(np.mean(errsq))

# Cross-validated RMSE: square root of the average fold MSE.
rmse_cv_m3 = np.sqrt(np.mean(mse_3))

In [None]:
rmse_cv_m3

**Model M4: p ARIMA(2,0,0) + seasonality + trend**

In [None]:
# Exogenous matrix for M4: integer month dummies plus the linear trend.
# Column labels are converted to str because pmdarima cannot handle column names of mixed types.
X = (
    pd.get_dummies(data_work.month)
    .astype(int)
    .rename(columns=str)
    .assign(trend=data_work.trend)
)
X

In [None]:
# Order selection for M4: price level with the regressor matrix X
# (month dummies and trend, built in the previous cell); seasonal ARIMA terms are disabled.
auto_arima_m4 = auto_arima(
    y = data_work.p,
    X = X,
    seasonal = False,
)

In [None]:
auto_arima_m4.get_params()

In [None]:
# M4: ARIMA on the price level with month dummies and a linear trend,
# using the order selected by auto_arima_m4.
mse_4 = []
for i in range(1, 5):

    train_data = data_work.loc[lambda x: x["train" + str(i)] == 1]
    test_data = data_work.loc[lambda x: x["test" + str(i)] == 1]

    model4 = ARIMA(
        train_data.p,
        exog = pd.get_dummies(train_data.month).astype(int),
        trend = 't', # 't' stands for a linear term
        order = auto_arima_m4.get_params()["order"],
    ).fit()

    # The forecast dummies get the same int cast as the training dummies;
    # previously they were passed as produced by get_dummies (possibly boolean).
    phat = model4.forecast(
        steps=12,
        exog=pd.get_dummies(test_data.month).astype(int),
        trend="t",
    )

    errsq = np.square(test_data.p.values - phat)

    mse_4.append(np.mean(errsq))

# Cross-validated RMSE: square root of the average fold MSE.
rmse_cv_m4 = np.sqrt(np.mean(mse_4))

In [None]:
rmse_cv_m4

**Model M5: dp ~ month + trend, without any ARIMA**

In [None]:
# M5: OLS of the first difference of the price index on trend and month dummies.
# Predicted changes are accumulated onto the last training price to get level forecasts.
mse_5 = []
for i in range(1, 5):

    train_data = data_work.loc[lambda x: x["train" + str(i)] == 1]
    test_data = data_work.loc[lambda x: x["test" + str(i)] == 1]

    model5 = smf.ols("dp ~ trend + C(month)", train_data).fit()

    dphat = model5.predict(test_data)

    # Level forecast = last observed training price + running sum of predicted changes.
    # This replaces an element-by-element loop that reused the fold index `i`,
    # wrote into a slice of data_work and left `phat` as an object-dtype column.
    last_train_price = train_data["p"].values[-1]
    test_data = test_data.assign(phat=last_train_price + dphat.cumsum())

    errsq = np.square(test_data["p"] - test_data["phat"])

    mse_5.append(np.mean(errsq))

# Cross-validated RMSE: square root of the average fold MSE.
rmse_cv_m5 = np.sqrt(np.mean(mse_5))

In [None]:
rmse_cv_m5

**Model M6: lnp ARIMA(0,2,0) + built-in seasonality using `auto_arima`**

In [None]:
# Order selection for M6 on the log price index, with seasonal terms enabled
# (seasonal period m = 12) and the differencing order fixed at 2.
auto_arima_m6 = auto_arima(
    y = data_work.lnp,
    d = 2,  # without this constrain, python returns other ARIMA order
    seasonal=True,
    m = 12
)

In [None]:
auto_arima_m6.get_params()

In [None]:
# M6: ARIMA on the log price index with the seasonal structure chosen by auto_arima_m6.
# Log forecasts are converted back to levels before computing the squared errors.
mse_6 = []
params_m6 = auto_arima_m6.get_params()
for i in range(1, 5):

    train_data = data_work.loc[lambda x: x["train" + str(i)] == 1]
    test_data = data_work.loc[lambda x: x["test" + str(i)] == 1]

    # Pass the seasonal order as well; previously only the non-seasonal order was
    # used, so the seasonal part selected by auto_arima was silently dropped.
    model6 = ARIMA(
        train_data.lnp,
        order=params_m6["order"],
        seasonal_order=params_m6["seasonal_order"],
    ).fit()

    # The model has no exogenous regressors, so none are supplied to forecast().
    lnphat = model6.forecast(steps=12)

    # Log-to-level correction term.
    # NOTE(review): it is computed from the test-set errors of this fold - confirm
    # that using test data for the correction is intended.
    corrb = mean_squared_error(test_data.lnp, lnphat)

    phat = np.exp((lnphat + corrb / 2))

    errsq = np.square(test_data.p.values - phat)

    mse_6.append(np.mean(errsq))

# Cross-validated RMSE: square root of the average fold MSE.
rmse_cv_m6 = np.sqrt(np.mean(mse_6))

In [None]:
rmse_cv_m6

**Vector Autoregression**

In [None]:
# M7: VAR(1) on the changes in price index, unemployment rate and log employment.
# The forecast price changes are accumulated onto the last training price.
mse_var = []
for i in range(1, 5):

    train_data = data_work.loc[lambda x: x["train" + str(i)] == 1, :].dropna()
    test_data = data_work.loc[lambda x: x["test" + str(i)] == 1, :].dropna()

    model7 = VAR(train_data[["dp", "du", "dlnemp"]]).fit(1)

    # Forecast 12 steps from the last k_ar observations; column 0 is the dp forecast.
    dphat = model7.forecast(
        train_data[["dp", "du", "dlnemp"]].values[-model7.k_ar :], steps=12
    )[:, 0]

    # Level forecast = last observed training price + running sum of forecast changes.
    # This replaces an element-by-element loop that reused the fold index `i`,
    # wrote into a slice of data_work and left `phat` as an object-dtype column.
    last_train_price = train_data["p"].values[-1]
    test_data = test_data.assign(phat=last_train_price + np.cumsum(dphat))

    errsq = np.square(test_data["p"] - test_data["phat"])

    mse_var.append(np.mean(errsq))

# Cross-validated RMSE: square root of the average fold MSE.
rmse_cv_m7 = np.sqrt(np.mean(mse_var))

In [None]:
rmse_cv_m7

#### Summary

*Note: the cross-validated RMSE of some models differs from the textbook.*

In [None]:
# Summary table: per-fold RMSE (square root of each fold's MSE) and the
# cross-validated average RMSE for every model, rounded to two decimals.
fold_names = ["Fold" + str(i) for i in range(1, 5)]
model_names = ["M" + str(i) for i in range(1, 7)] + ["M7 (var)"]

cv_rmse = np.sqrt(
    pd.DataFrame(
        [mse_1, mse_2, mse_3, mse_4, mse_5, mse_6, mse_var],
        columns=fold_names,
    )
)
cv_rmse["Average"] = [
    rmse_cv_m1,
    rmse_cv_m2,
    rmse_cv_m3,
    rmse_cv_m4,
    rmse_cv_m5,
    rmse_cv_m6,
    rmse_cv_m7,
]
cv_rmse["model"] = model_names
cv_rmse.round(2).set_index("model")

### Predict for holdout

**The best model is M4.**

In [None]:
auto_arima_m4.get_params()

- What's inside the model?

In [None]:
print(auto_arima_m4.summary())

In [None]:
auto_arima_m4.plot_diagnostics(figsize = (9,7));

- Re-estimate best models on full work set

In [None]:
# Re-fit the M4 specification (month dummies, linear trend, selected ARIMA order)
# on the complete work set.
final_exog = pd.get_dummies(data_work.month).astype(int)
final_order = auto_arima_m4.get_params()["order"]

model_final = ARIMA(data_work.p, exog=final_exog, trend="t", order=final_order).fit()

In [None]:
print(model_final.summary())

In [None]:
# Forecast 12 steps past the end of the work set, supplying the holdout months'
# integer month dummies as the exogenous regressors.
# NOTE(review): trend="t" is passed through as an extra keyword here; confirm it has any effect at forecast time.
pred_final = model_final.get_forecast(
    steps=12, 
    exog = pd.get_dummies(data_holdout.month).astype(int),
    trend="t"
)

In [None]:
data_holdout

In [None]:
# Holdout rows with the point forecast and the interval from conf_int(alpha=0.2)
# (joined on the index); only the forecast-related columns are kept.
interval_bounds = pred_final.conf_int(alpha=0.2)
holdout_with_pred = data_holdout.assign(
    p_pred=pred_final.predicted_mean.values,
    model="best",
)
forecast_holdout_best = holdout_with_pred.join(interval_bounds).filter(
    ["model", "p_pred", "lower p", "upper p"]
)

In [None]:
forecast_holdout_best

In [None]:
# Attach the holdout forecast columns to the full data and keep 2015 onwards for plotting.
data_with_forecast = data.join(forecast_holdout_best)
data_plot = data_with_forecast[data_with_forecast["year"] >= 2015]

In [None]:
# Long format (one row per date and series) so actual and predicted values can share a colour legend.
data_plot_1 = data_plot[["date", "p", "p_pred"]].melt(id_vars=["date"])

In [None]:
data_plot_1

In [None]:
# Actual versus predicted price index from the long-format frame, one coloured line per series.
# `color` is not defined in this notebook (presumably provided by py_helper_functions).
pred_p_plot = (
    ggplot(data_plot_1, aes(x="date", y="value", color="variable"))
    + geom_line(size=1)
    + ylab("Case-Shiller Home Price Index")
    + xlab("Date (month)")
    + scale_color_manual(
        name=" ", values=(color[0], color[1]), labels=("Actual", "Prediction")
    )
    + scale_x_date(date_breaks="1 years", labels=date_format("%b%Y"))
    + theme_bw()
    + theme(legend_position=(0.7, 0.3), legend_direction="horizontal")
)
pred_p_plot

In [None]:
# Actual and predicted price index with the prediction interval drawn as a shaded ribbon.
# This rebinds pred_p_plot, replacing the plot object from the previous cell.
pred_p_plot = (
    ggplot(data_plot, aes(x="date"))
    + geom_line(aes(y="p"), color=color[0], size=0.7)
    + geom_line(aes(y="p_pred"), color=color[1], size=1)
    # the interval bounds are added as zero-width lines; the ribbon below shows the band
    + geom_line(aes(y="lower p"), color=color[1], size=0)
    + geom_line(aes(y="upper p"), color=color[1], size=0)
    + geom_ribbon(aes(ymin="lower p", ymax="upper p"), fill="green", alpha=0.3)
    + ylab("Case-Shiller Home Price Index")
    + xlab("Date (month)")
    + theme_bw()
    + scale_x_date(date_breaks="1 years", labels=date_format("%b%Y"))
)
pred_p_plot

In [None]:
# Actual vs. predicted index with the prediction interval, using explicit figure/axes objects.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(data_plot.date, data_plot.p, color='k')
ax.plot(data_plot.date, data_plot.p_pred, color='darkblue', linestyle='--')
ax.fill_between(
    data_plot.date,
    data_plot['lower p'],
    data_plot['upper p'],
    color='darkblue',
    alpha=0.5,
)
ax.legend(['actual', 'predicted', 'prediction interval'], loc='upper left')
ax.set_ylabel('2000 = 100')
ax.grid(True)
ax.set_yticks(range(220, 340, 10))
ax.set_title('Case-Shiller Home Price Index: Actual vs Prediction');

In [None]:
# Squared forecast errors on the holdout set.
errsq = np.square(data_holdout.p.values - forecast_holdout_best.p_pred)

# RMSE is the square root of the mean squared error; the square root was missing,
# so the value reported as rmse_holdout was actually the MSE.
rmse_holdout = np.sqrt(np.mean(errsq))
rmse_holdout