# TrainFinalModel
Installing scikit-learn (sklearn) fails on Apple Silicon Macs and is reliably a mess. This is what worked:

```
brew install openblas
export OPENBLAS=$(/opt/homebrew/bin/brew --prefix openblas)
export CFLAGS="-falign-functions=8 ${CFLAGS}"
# ^ no need to add to .zshrc, just doing this once.
pip install scikit-learn
```
[Source](https://github.com/scipy/scipy/issues/13409)

Other approaches that may work:
```
pip install cython pybind11 pythran numpy
OPENBLAS=$(brew --prefix openblas) CFLAGS="-falign-functions=8 ${CFLAGS}" pip install --no-use-pep517 scipy==1.3.2
```
or
```
pip3 install -U --no-use-pep517 scikit-learn
```


In [8]:
import math
from datetime import datetime
from datetime import timedelta
import pandas as pd
from pathlib import Path
from prophet import Prophet
from prophet.plot import add_changepoints_to_plot
from prophet.utilities import regressor_coefficients
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from matplotlib import pyplot as plt
from prophet.serialize import model_to_json, model_from_json
import math

In [3]:
# Start of the COVID-19 period that is modelled as a one-off "covid" holiday.
# https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Switzerland
COVID_START = datetime(2020, 3, 1).date()
# End of the COVID-19 period.
# https://www.admin.ch/gov/en/start/documentation/media-releases.msg-id-84127.html
# date taken as the second easing of restrictions during 2021
COVID_END = datetime(2021, 6, 1).date()

# Directory the serialized model is written to.
MODEL_PATH = Path("../data/model/")
# Parquet file holding the training data.
TRAIN_DATA_PATH = Path('../data/consumption/final_train.parquet')

# File name of the serialized model (JSON).
MODEL_NAME = "totalconsumption_rolling7day.json"
# Expression over the consumption columns that defines the forecast target.
CONSUMPTION_EXPR = "NE5Consumption + NE7Consumption" # e.g. "NE5Consumption + NE7Consumption", "NE5Consumption", "NE7Consumption"
# Window size (number of rows) of the rolling mean applied to the training data.
ROLLING_WINDOW = 7
# Passed to Prophet as `interval_width`.
CONFIDENCE_INTERVAL = 0.95
# Passed to Prophet as `weekly_seasonality`.
USE_WEEK_SEASONALITY = False
# When True, the Swiss ('CH') country holidays are added to the model.
USE_HOLIDAYS = False

In [None]:

def loadParquetToPandas(path, rolling_window=None, consumption_expr=None):
    """Load the consumption parquet file and build the smoothed training frame.

    Pandas port of the Spark pipeline this notebook originally used: compute
    the target ``y`` from the consumption columns, build the two
    temperature-weighted yearly regressors, keep only the model columns, sort
    by date, apply a rolling mean, drop the incomplete leading windows and
    finally drop the last ``rolling_window`` rows (as the Spark version did).

    Parameters
    ----------
    path : str or pathlib.Path
        Parquet file containing ``Date``, ``Temperature`` and every column
        referenced by the consumption expression.
    rolling_window : int, optional
        Number of consecutive rows averaged by the rolling mean. Defaults to
        the notebook-level ``ROLLING_WINDOW``.
    consumption_expr : str, optional
        Expression over the raw columns that defines ``y``; evaluated with
        ``DataFrame.eval``. Defaults to the notebook-level
        ``CONSUMPTION_EXPR``. Only pass trusted expressions.

    Returns
    -------
    pandas.DataFrame
        Columns ``ds``, ``y``, ``CosYearTemp``, ``SinYearTemp`` ordered by
        ``ds``; all value columns are rolling means.
    """
    # Local import so the function does not depend on the notebook's import cell.
    import numpy as np

    # Fall back to the notebook configuration only when no override is given.
    window = ROLLING_WINDOW if rolling_window is None else rolling_window
    expr = CONSUMPTION_EXPR if consumption_expr is None else consumption_expr

    raw = pd.read_parquet(path, engine='pyarrow')
    ds = pd.to_datetime(raw['Date'])

    # math.cos/math.sin only accept scalars; the NumPy ufuncs work element-wise
    # on the whole Series.
    year_angle = ds.dt.dayofyear * 2 * np.pi / 365

    # Keep only the model columns: a rolling mean over the raw frame would also
    # hit the datetime 'Date' column and any unrelated raw columns.
    features = pd.DataFrame({
        'ds': ds,
        'y': raw.eval(expr),
        'CosYearTemp': raw['Temperature'] * np.cos(year_angle),
        'SinYearTemp': raw['Temperature'] * np.sin(year_angle),
    }).sort_values('ds')

    # dropna() removes the first rows, whose window is still incomplete.
    smoothed = features.rolling(window, on='ds').mean().dropna()

    # Trailing trim kept from the Spark implementation; fix the column order explicitly.
    return smoothed[['ds', 'y', 'CosYearTemp', 'SinYearTemp']][:-window]

# Build the training frame from the parquet file configured in TRAIN_DATA_PATH.
train = loadParquetToPandas(TRAIN_DATA_PATH)

In [7]:
# Add COVID as a one-off holiday that starts on COVID_START and, through
# upper_window, extends up to COVID_END.
covid = pd.DataFrame([
    {'holiday': 'covid', 'ds': COVID_START, 'lower_window': 0, 'ds_upper': COVID_END}
])

# COVID_START / COVID_END are datetime.date objects, which pandas stores as
# object dtype. Convert both columns to datetime64 so their difference is a
# timedelta64 Series and the .dt accessor below works.
covid[['ds', 'ds_upper']] = covid[['ds', 'ds_upper']].apply(pd.to_datetime)

# Number of days after 'ds' that still belong to the holiday.
covid['upper_window'] = (covid['ds_upper'] - covid['ds']).dt.days

3.141592653589793

In [None]:
# Configure the Prophet model: COVID is passed in as a holiday, and the
# interval width and weekly seasonality come from the configuration cell.
m = Prophet(
    holidays=covid,
    weekly_seasonality=USE_WEEK_SEASONALITY,
    changepoint_prior_scale=0.005,
    changepoint_range=1,
    interval_width=CONFIDENCE_INTERVAL,
    uncertainty_samples=10000,
)

# Optionally include the Swiss country holidays.
if USE_HOLIDAYS:
    m.add_country_holidays(country_name='CH')

# Temperature-weighted yearly regressors built by the data-loading step.
m.add_regressor("CosYearTemp", prior_scale=0.05, standardize=True, mode="additive")
m.add_regressor("SinYearTemp", prior_scale=0.05, standardize=True, mode="additive")
m.fit(train)

In [None]:
# In-sample prediction: run the fitted model on the training frame itself.
forecastTrain = m.predict(train)

In [None]:
# Plot the individual components of the in-sample forecast.
fig = m.plot_components(forecastTrain)

In [None]:
# Plot the in-sample forecast and overlay the model's changepoints on the same axes.
fig = m.plot(forecastTrain)
a = add_changepoints_to_plot(fig.gca(), m, forecastTrain)

In [None]:
# Mean squared error of the in-sample forecast against the training target.
mse = mean_squared_error(
    y_true=train["y"].to_numpy(),
    y_pred=forecastTrain["yhat"].to_numpy(),
)
mse

In [None]:
# Root mean squared error, derived from the MSE computed above.
rmse = math.sqrt(mse)
rmse

In [None]:
# Mean absolute percentage error of the in-sample forecast.
mean_absolute_percentage_error(
    y_true=train["y"].to_numpy(),
    y_pred=forecastTrain["yhat"].to_numpy(),
)

In [None]:
# Show the fitted coefficients of the extra regressors added to the model.
regressor_coefficients(m)

In [None]:
# Put the actual values next to the predicted mean and interval bounds,
# indexed by date and ordered chronologically.
plotData = pd.DataFrame(
    {
        "Actual": train["y"].array,
        "PredictedMean": forecastTrain["yhat"].array,
        "PredictedHigh": forecastTrain["yhat_upper"].array,
        "PredictedLow": forecastTrain["yhat_lower"].array,
    },
    index=pd.to_datetime(train["ds"]),
).sort_index()

In [None]:
def plot(data):
    """Draw the actual series and shade the band between the predicted bounds.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose index is used as the x axis and which provides the
        columns ``Actual``, ``PredictedHigh`` and ``PredictedLow``.
    """
    figure, axes = plt.subplots(figsize=(12, 8))
    figure.patch.set_facecolor('white')

    x_values = data.index
    axes.plot(x_values, data["Actual"])
    axes.fill_between(
        x_values,
        data["PredictedHigh"],
        data["PredictedLow"],
        color='orange',
        alpha=.2,
    )

In [None]:
# Plot the complete period covered by plotData.
plot(plotData)

In [None]:
# Zoom in on 2019-01-01 (inclusive) up to 2019-03-01 (exclusive).
plot(plotData[(plotData.index >= datetime(2019, 1, 1)) & (plotData.index < datetime(2019, 3, 1))])

In [None]:
# Serialize the fitted model to JSON.
# MODEL_PATH is a pathlib.Path, so it has to be joined with "/"; Path + str
# raises TypeError.
MODEL_PATH.mkdir(parents=True, exist_ok=True)  # make sure the target directory exists
with open(MODEL_PATH / MODEL_NAME, 'w') as fout:
    fout.write(model_to_json(m))