# <center>Class 18: Forecasting Time Series </center>
## <center>Part One </center>

In [None]:
import pandas as pd
import numpy as np
import warnings
import sys
import os
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from patsy import dmatrices
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

warnings.filterwarnings("ignore")

In [None]:
try: 
    import holidays
except:
    !pip install holidays
    import holidays

In [None]:
try:
    from prophet import Prophet
except: 
    !pip install Prophet
    from prophet import Prophet

from prophet.diagnostics import cross_validation
from prophet.diagnostics import performance_metrics

In [None]:
%matplotlib inline

## Data

In [None]:
path = os.path.join(os.pardir, 'data', 'swim_work.csv') # this will produce a path with the right syntax for your operating system
path

In [None]:
# DATA IMPORT - FROM FILE
df = pd.read_csv(path)

In [None]:
df.head()

In [None]:
df.head()

In [None]:
df.info()

### EDA

In [None]:
df_daily_agg = df.copy()

In [None]:
df_daily_agg.date = pd.to_datetime(df_daily_agg.date, format= '%Y-%m-%d')

In [None]:
df_daily_agg.info()

In [None]:
# Calendar features read off the parsed `date` column.
for part in ("year", "quarter", "month", "day"):
    df_daily_agg[part] = getattr(df_daily_agg["date"].dt, part)

# pandas numbers weekdays 0 (Monday) to 6 (Sunday); shift to 1-7.
df_daily_agg["dow"] = df_daily_agg["date"].dt.dayofweek + 1
# Days 6 and 7 of the shifted numbering are Saturday and Sunday.
df_daily_agg["weekend"] = df_daily_agg["dow"].isin([6, 7])

In [None]:
# Rule-based indicator for days on which school is assumed to be out.
day_of_month = df_daily_agg["day"]
month_no = df_daily_agg["month"]

# NOTE(review): the upper bound of 30 leaves 31 May unflagged - confirm this is intended.
late_may = (day_of_month > 15) & (month_no == 5) & (day_of_month <= 30)
june_or_july = (month_no == 6) | (month_no == 7)
early_august = (day_of_month < 15) & (month_no == 8)
late_december = (day_of_month > 20) & (month_no == 12)

df_daily_agg["school_off"] = late_may | june_or_july | early_august | late_december

In [None]:
# Linear time trend: row index + 1.
# NOTE(review): presumably relies on a default 0..n-1 index with rows in date order - verify.
df_daily_agg["trend"] = df_daily_agg.index + 1

In [None]:
# Get holiday calendar ----------------------------------

In [None]:
minyear = df_daily_agg.date.min().year
maxyear = df_daily_agg.date.max().year

In [None]:
# US holiday calendar covering every year present in the data (both ends inclusive).
usholidays = holidays.UnitedStates(years=list(range(minyear, maxyear + 1)))

In [None]:
for dat in usholidays.items():
    print(dat)

In [None]:
# Keep only the holiday dates (the calendar's keys), dropping the holiday names.
holiday_days = list(usholidays.keys())

In [None]:
holiday_days[0:10]

In [None]:
df_daily_agg.date[0]

In [None]:
# Flag rows whose date appears in the holiday list.
# NOTE(review): `holiday_days` comes from the `holidays` calendar keys while `date` is datetime64;
# confirm the membership test matches as expected (the next cells spot-check a few dates).
df_daily_agg["isHoliday"] = df_daily_agg["date"].isin(holiday_days)

Did we get the holidays right?

In [None]:
df_daily_agg[df_daily_agg.date.isin([datetime(2010,7,4), datetime(2012,12,4), datetime(2012,12,25)])]

Is *Mariä Himmelfahrt* (Assumption Day, 15 August) a holiday in the US?

In [None]:
df_daily_agg[df_daily_agg.date == datetime(2010,8,15)]

### Define vars for analysis

In [None]:
# Average sales per calendar month, broadcast back to every row.
df_daily_agg["q_month"] = df_daily_agg.groupby("month")["QUANTITY"].transform("mean")

# Floor sales at 1 so that the logarithm below is defined for every row.
quantity = df_daily_agg["QUANTITY"]
df_daily_agg["QUANTITY2"] = np.where(quantity < 1, 1, quantity)
df_daily_agg["q_ln"] = np.log(df_daily_agg["QUANTITY2"])

# Month x day-of-week cell means of sales and of log sales.
for target, source in (("tickets", "QUANTITY"), ("tickets_ln", "q_ln")):
    df_daily_agg[target] = df_daily_agg.groupby(["month", "dow"])[source].transform("mean")

# Three-letter weekday and month labels.
df_daily_agg["dow_abb"] = df_daily_agg["date"].dt.day_name().str.slice(0, 3)
df_daily_agg["month_abb"] = df_daily_agg["date"].dt.month_name().str.slice(0, 3)

In [None]:
df_daily_agg.head()

In [None]:
df_daily_agg.tail().T

## Descriptive graphs

In [None]:
df_daily_agg[df_daily_agg.year == 2015].plot(
    kind = 'line', figsize = (8,6),
    x = 'date', y = 'QUANTITY', 
    grid = True, legend = False, title = 'Daily ticket sales in 2015');

In [None]:
df_daily_agg[df_daily_agg.year < 2016].plot(
    kind = 'line', figsize = (8,6),
    x = 'date', y = 'QUANTITY', linewidth = 0.5,
    grid = True, legend = False, title = 'Daily ticket sales between 2010-2015');

In [None]:
ax = sns.boxplot(data= df_daily_agg, x = 'month', y = 'QUANTITY')
ax.set_ylabel('Daily ticket sales')
ax.set_title('Ticket sales distribution by month');

In [None]:
ax = sns.boxplot(data= df_daily_agg, x = 'dow', y = 'QUANTITY', color = 'grey')
ax.set_ylabel('Daily ticket sales')
ax.set_title('Ticket sales distribution by day of week');

In [None]:
df_daily_agg.pivot_table(index="month", columns="dow", values="QUANTITY", aggfunc='sum')

In [None]:
df_daily_agg.pivot_table(index="month", columns="dow", values="QUANTITY", aggfunc='mean')

In [None]:
sns.heatmap(
    data = df_daily_agg.pivot_table(index="month", columns="dow", values="QUANTITY", aggfunc='mean'), 
    annot = True, 
    # which colormap do you prefer?
    # cmap = 'turbo',
    cmap = 'coolwarm',
    fmt = '.0f');

`matplotlib` colormaps [here](https://matplotlib.org/stable/users/explain/colors/colormaps.html)

### Prediction

#### Create train/holdout data

In [None]:
df_daily_agg.info()

In [None]:
factor_cols = ["month", "dow", "isHoliday", "school_off"]

# Store the seasonal / indicator columns as categoricals so the formulas treat them as factors.
df_daily_agg[factor_cols] = df_daily_agg[factor_cols].astype("category")

# 2016 is the holdout year; everything before it is used for training.
# Take explicit copies so that adding columns to either frame later does not
# write into a slice of `df_daily_agg` (chained-assignment pitfall).
df_holdout = df_daily_agg.loc[df_daily_agg["year"] == 2016, :].copy()
df_train = df_daily_agg.loc[df_daily_agg["year"] < 2016, :].copy()

In [None]:
df_daily_agg.info()

In [None]:
df_train.tail()

In [None]:
# Cross-validation splitter that holds out one group per fold.
logo = LeaveOneGroupOut()
# One label per training row: its calendar year, so each fold holds out a whole year.
groups = df_train.loc[:,'year'].to_numpy()
groups

Note: `LeaveOneOut()` is equivalent to `KFold(n_splits=n)`

In [None]:
df_train.year.unique()

#### Linear regression

In [None]:
lin_reg = LinearRegression(fit_intercept=False)

LeaveOneGroupOut object's `split` method:
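- **X**: array-like of shape (n_samples, n_features); training data, where n_samples is the number of samples and n_features is the number of features.
- **y**: array-like of shape (n_samples,), optional; the target variable. It does not influence how the folds are formed.
- **groups**: array-like of shape (n_samples,); group labels of the samples. These **are** used: each unique label (here, each year) is held out once as the test set.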

In [None]:
y, X = dmatrices("QUANTITY ~ 1 + trend + month", df_train)
for train_index, test_index in logo.split(X, y, groups):
    print(train_index, '\n', test_index)
    print('--------')

In [None]:
def fit_cv_model_get_rmse(y, X, groups):
    """Return the mean out-of-fold RMSE of the linear model under leave-one-group-out CV.

    Relies on the notebook-level ``logo`` splitter and ``lin_reg`` estimator;
    ``lin_reg`` is refitted on every fold.

    Parameters
    ----------
    y : array-like, indexable by integer arrays
        Target values, one row per observation.
    X : array-like, indexable by integer arrays
        Design matrix aligned with ``y``.
    groups : array-like
        Group label of each row; each distinct label is held out once.

    Returns
    -------
    float
        Average of the per-fold RMSE values.
    """
    rmse_folds = []
    for train_index, test_index in logo.split(X, y, groups):
        lin_reg.fit(X[train_index], y[train_index])
        y_hat = lin_reg.predict(X[test_index])
        # Take the square root explicitly: the `squared=False` argument of
        # mean_squared_error is deprecated and removed in newer scikit-learn.
        rmse_folds.append(np.sqrt(mean_squared_error(y[test_index], y_hat)))

    return np.mean(rmse_folds)

**Model 1: linear trend + monthly seasonality**

In [None]:
%%time
y, X = dmatrices("QUANTITY ~ 1 + trend + month", df_train)

In [None]:
X

In [None]:
rmse_reg1 = fit_cv_model_get_rmse(y, X, groups)

In [None]:
rmse_reg1

**Model 2: linear trend + monthly seasonality + days of week seasonality**

In [None]:
y,X = dmatrices("QUANTITY ~ 1+ trend + month + dow",df_train)

rmse_reg2 = fit_cv_model_get_rmse(y, X, groups)
rmse_reg2

**Model 3: linear trend + monthly seasonality + days of week  seasonality + holidays**

In [None]:
y,X = dmatrices("QUANTITY ~ 1 + trend + month + dow + isHoliday",df_train)

rmse_reg3 = fit_cv_model_get_rmse(y, X, groups)
rmse_reg3

**Model 4: linear trend + monthly seasonality + days of week seasonality + holidays + school_off $\times$ dow interaction**

In [None]:
# In a patsy formula `school_off*dow` expands to school_off + dow + school_off:dow,
# i.e. both main effects plus their interaction.
y,X = dmatrices("QUANTITY ~ 1 + trend + month + dow + isHoliday + school_off*dow", df_train)

# Mean leave-one-year-out RMSE for this specification.
rmse_reg4 = fit_cv_model_get_rmse(y, X, groups)
rmse_reg4

**Model 5: linear trend + monthly seasonality + days of week  seasonality + holidays + interactions**

In [None]:
y, X = dmatrices(
    "QUANTITY ~ 1 + trend + month + dow + isHoliday + school_off*dow+ weekend*month",
    df_train,
)

rmse_reg5 = fit_cv_model_get_rmse(y, X, groups)
rmse_reg5

In [None]:
# Keep only training days with at least one ticket sold.
df_train2 = df_train[df_train.QUANTITY >= 1]
# Recompute the year labels for the filtered rows.
# Note: this overwrites the `groups` array that the earlier model cells used.
groups = df_train2.loc[:,'year'].to_numpy()

Note: we could have done:
```python
groups = df_train2.year.to_numpy()
```

**Model 6: trend + monthly seasonality + days of week seasonality + holidays + interactions**

Why is it different than Model 5?

In [None]:
y, X = dmatrices(
    "q_ln ~ 1 + trend + month + dow +school_off*dow", df_train2
)
# Year labels taken from the same frame as the design matrix, so they are
# guaranteed to line up with the rows of X.
groups_ln = df_train2["year"].to_numpy()

rmse_folds = []
for train_index, test_index in logo.split(X, y, groups_ln):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    lin_reg.fit(X_train, y_train)

    # Log-retransformation correction: mean squared residual of the log model,
    # estimated on the training folds only so the held-out year does not leak
    # into its own prediction.
    corrb = mean_squared_error(y_train, lin_reg.predict(X_train))

    # Back-transform to ticket counts and score the fold in levels.
    y_hat = np.exp(lin_reg.predict(X_test) + corrb / 2)
    # Explicit square root instead of `squared=False`, which newer scikit-learn no longer accepts.
    rmse_folds.append(np.sqrt(mean_squared_error(np.exp(y_test), y_hat)))

rmse_reg6 = np.mean(rmse_folds)
rmse_reg6

#### Prophet

Cross-validation for `Prophet` is done with Prophet's own diagnostics tools: https://facebook.github.io/prophet/docs/diagnostics.html. This is a *time-series-based (!!!)* cross-validation.

*Question*: why are we building an additive model?

In [None]:
# Additive Prophet model with a linear trend.
prophet_settings = dict(
    seasonality_mode="additive",
    yearly_seasonality="auto",
    weekly_seasonality="auto",
    growth="linear",
    daily_seasonality=True,
)
model_prophet = Prophet(**prophet_settings)

# Register the built-in US holiday calendar on the model.
model_prophet = model_prophet.add_country_holidays("US")

In [None]:
# Prophet expects the date column to be called `ds` and the target `y`.
prophet_train = df_train[["date", "QUANTITY"]].rename(columns={"date": "ds", "QUANTITY": "y"})
model_prophet = model_prophet.fit(df=prophet_train)

In [None]:
# Prophet's rolling-origin cross-validation:
#   initial - length of the first training window
#   period  - spacing between successive cutoff dates
#   horizon - how far ahead each fitted model forecasts
cv_pred = cross_validation(
    model_prophet, 
    initial="365 days", 
    period="365 days", 
    horizon="365 days"
)

In [None]:
cv_pred

In [None]:
performance_metrics(cv_pred,rolling_window = 1)

In [None]:
# With rolling_window=1 the metrics table has a single row; take its RMSE.
prophet_metrics = performance_metrics(cv_pred, rolling_window=1)
rmse_prophet_cv = prophet_metrics.loc[0, "rmse"]
rmse_prophet_cv

Note: M6 log model rmse is slightly different from the one found in the book

In [None]:
# Cross-validated RMSE of every candidate model, side by side.
model_labels = [f"M{i}" for i in range(1, 6)] + ["M6 (log)", "M7 (Prophet)"]
cv_rmse_values = [
    rmse_reg1,
    rmse_reg2,
    rmse_reg3,
    rmse_reg4,
    rmse_reg5,
    rmse_reg6,
    rmse_prophet_cv,
]
pd.DataFrame({"RMSE": cv_rmse_values}, index=model_labels).round(2)

### Evaluate best model on the holdout set

In [None]:
# Specification of the model evaluated on the holdout set (same formula as Model 5).
best_formula = "QUANTITY ~ 1 + trend + month + dow + isHoliday + school_off*dow+ weekend*month"

# The formula's `1` already puts an intercept column into X, so the estimator adds none.
lin_reg = LinearRegression(fit_intercept=False)

# Refit on the full training period.
y, X = dmatrices(best_formula, df_train)
lin_reg.fit(X, y)

# Build the holdout design matrix from the same formula; the outcome matrix is not needed.
_, X_holdout = dmatrices(best_formula, df_holdout)

*Question*: what is '`_`' in the previous code chunk?

In [None]:
df_holdout["y_hat_5"] = lin_reg.predict(X_holdout)

In [None]:
# Holdout RMSE of the best model. The square root is taken explicitly because the
# `squared=False` argument of mean_squared_error is deprecated and removed in
# newer scikit-learn.
rmse_holdout_best = np.sqrt(
    mean_squared_error(df_holdout.QUANTITY, df_holdout.y_hat_5)
)
rmse_holdout_best

#### Detour: interpreting regression coefficients. What are the actual seasonal effects in ticket sales?

In [None]:
# Same specification as the best model; patsy adds the intercept by default,
# so leaving out the explicit `1 +` does not change the design matrix.
y, X = dmatrices(
    "QUANTITY ~ trend + month + dow + isHoliday + school_off*dow+ weekend*month",
    df_train,
)

In [None]:
best_ols_results = sm.OLS(y, X).fit()

In [None]:
print(best_ols_results.summary2())

<br>

- What does it mean?

```
"The smallest eigenvalue is .... This might indicate that there are 
strong multicollinearity problems or that the design matrix is singular" 

```
.

In [None]:
best_ols_results.eigenvals.min()

In [None]:
np.max(best_ols_results.eigenvals) / np.min(best_ols_results.eigenvals)

In [None]:
plt.hist(best_ols_results.resid, bins = 51, rwidth = 0.9);

### Plot best predictions

Relative RMSE on the holdout set per month

In [None]:
group = df_holdout.sort_values(by=["month"]).groupby("month")

In [None]:
type(group)

In [None]:
# Holdout RMSE per month; explicit sqrt because `squared=False` is no longer accepted by newer scikit-learn.
group.apply(lambda x: np.sqrt(mean_squared_error(x.QUANTITY, x.y_hat_5)))

In [None]:
def _holdout_rmse(frame):
    """RMSE between actual sales and the best model's predictions in `frame`."""
    # Explicit sqrt: the `squared=False` argument is deprecated and removed in newer scikit-learn.
    return np.sqrt(mean_squared_error(frame.QUANTITY, frame.y_hat_5))


# One row per month: the RMSE and the RMSE relative to that month's mean sales.
rmse_monthly = pd.DataFrame(
    [
        group.apply(_holdout_rmse),
        group.apply(lambda x: _holdout_rmse(x) / np.mean(x.QUANTITY)),
    ],
    index=["RMSE", "RMSE_norm"],
).T.reset_index()

In [None]:
rmse_monthly

In [None]:
ax = sns.barplot(data = rmse_monthly, x = 'month', y = 'RMSE_norm')
ax.grid(visible = True, axis = 'y')
ax.set_ylabel('normalized RMSE')
ax.set_title('Normalized RMSE across months');

In [None]:
# August 2016 rows of the holdout set: dates, actual sales and predicted sales.
is_august_2016 = (df_holdout.year == 2016) & (df_holdout.month == 8)
august_2016 = df_holdout[is_august_2016]

x = august_2016.date
y1 = august_2016.QUANTITY
y2 = august_2016.y_hat_5

In [None]:
# Actual (solid) vs predicted (dashed) sales, with the gap between them shaded.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x, y1, color="k")
ax.plot(x, y2, color="k", linestyle="--")
ax.fill_between(x, y1, y2, color="lightblue")
ax.legend(["actual", "predicted"])
ax.set_ylabel("tickets sold")
ax.grid(True)
ax.set_title("Actual vs predicted ticket sales, August 2016")
# One tick every seventh day of the plotted window.
ax.set_xticks(x.tolist()[0::7]);