# Forecasting

In [3]:
import boto3
import datetime
import io
import json
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.units as munits
import numpy as np
from unidecode import unidecode
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.experimental import enable_halving_search_cv
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.multioutput import RegressorChain
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import HalvingGridSearchCV

In [4]:
def make_steps(
    df : pd.DataFrame, 
    y : str, 
    steps : int
    ) -> pd.DataFrame:
    """
    Add the target feature shifted 1..`steps` rows ahead as new columns.

    Column ``{y}_step_k`` at row ``t`` holds the value of ``y`` at row
    ``t + k``. The last ``k`` rows of that column have no future value
    and are therefore NaN.

    Parameters
    ----------
    df : pandas data frame
        Data frame with target feature. It is left unmodified.
    
    y : str
        Target feature name.
    
    steps : int
        Steps to forecast.

    Returns
    -------
    df : pandas data frame
        Copy of the input with the columns ``{y}_step_1`` ...
        ``{y}_step_{steps}`` appended.
    """

    # Work on a copy: callers may pass a slice of a larger frame, and adding
    # columns to such a slice would write through a temporary object.
    out = df.copy()
    for step in range(1, steps + 1):
        # A negative shift pulls future rows up, so row t sees the value at
        # t + step. (`-i+1` would parse as `1 - i` and start one row in the past.)
        out[f"{y}_step_{step}"] = out[y].shift(-step)
    return out


# models
def _rolling_cv() -> TimeSeriesSplit:
    """Return a new 3-split time-series splitter with 2000-sample test folds."""
    return TimeSeriesSplit(n_splits=3, test_size=2000)


def _halving_chain(estimator, param_grid: dict) -> RegressorChain:
    """Tune `estimator` over `param_grid` with a halving grid search and wrap it in a chain."""
    search = HalvingGridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=_rolling_cv(),
        aggressive_elimination=True,
        scoring="neg_mean_squared_error",
        random_state=123,
        n_jobs=-1
    )
    return RegressorChain(base_estimator=search, random_state=123)


# Ridge regression: RidgeCV picks its own alpha, so no outer search is needed.
ridge_cv = RidgeCV(
    alphas=[1e-3, 1e-2, 1e-1, 1],
    cv=_rolling_cv(),
)
rid = RegressorChain(base_estimator=ridge_cv, random_state=123)

# Histogram gradient boosting: search over tree depth and learning rate.
gb_grid = {
    "max_depth": np.arange(10, 40, 10, dtype=int),
    "learning_rate": np.logspace(-3, -1, 3)
}
gb = _halving_chain(HistGradientBoostingRegressor(random_state=123), gb_grid)

# Multi-layer perceptron: search over layer layout and activation function.
mlp_grid = {
    "hidden_layer_sizes": [(100,), (100, 75, 25)],
    "activation": ["identity", "logistic", "tanh", "relu"]
}
mlp = _halving_chain(MLPRegressor(max_iter=1000, random_state=123), mlp_grid)

# Registry of the candidate models, looked up by short name.
models = dict(rid=rid, mlp=mlp, gb=gb)

In [5]:
# AWS credentials to read files on S3 bucket
# Read the JSON inside a context manager so the file handle is closed as soon
# as the content has been parsed.
with open('../credentials.json') as f:
    credentials = json.load(f)

# The client and the resource are both built from the same key pair.
aws_auth = {
    "aws_access_key_id": credentials["Access key ID"],
    "aws_secret_access_key": credentials["Secret access key"],
}

s3_client = boto3.client("s3", **aws_auth)

s3_resource = boto3.resource("s3", **aws_auth)

In [6]:
# getting preprocessed data
# List the key of every object stored under the "processed" prefix of the "cge" bucket.
cge_bucket = s3_resource.Bucket("cge")
prefix_objs = cge_bucket.objects.filter(Prefix="processed")
keys = list(map(lambda summary: summary.key, prefix_objs))
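# NOTE: the pipeline below is disabled. It is the only place visible in this
# notebook where `ts`, `X_test_ref` and `y_pred` are assigned, and the cells
# that follow read those names, so they fail on a fresh kernel unless this
# loop is re-enabled.
# for key in keys: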
#     obj = s3_client.get_object(Bucket="cge", Key=key)
#     df = pd.read_csv(io.BytesIO(obj["Body"].read()))
#     df = df.reset_index(drop=True)
#     station = df.station.unique()[0]
#     station_name = df.station_name.unique()[0]
#     ts = df[["timestamp"]]
#     df = df.drop(["station", "station_name", "timestamp"], axis=1)

#     df = df.dropna(axis=1, how="all")
#     df = df.dropna()
    
#     y = make_steps(df=df[["temperature"]], y="temperature", steps=6).drop("temperature", axis=1)
#     y = y.dropna()

#     X = df.drop(["temperature"], axis=1)
#     X = X.loc[y.index.min():y.index.max()]

#     scaler = StandardScaler()
#     scaler.fit(X)
#     X_standard = scaler.transform(X)

#     test_size = 0.30
#     X_train_ref, X_test_ref, _, _ = train_test_split(X, y, test_size=0.30, shuffle=False)
#     X_train, X_test, y_train, y_test = train_test_split(X_standard, y, test_size=test_size, shuffle=False)

#     models["rid"].fit(X_train, y_train)
#     y_pred = pd.DataFrame(models["rid"].predict(X_test), index=X_test_ref.index, columns=y.columns)
#     ts = ts.loc[y_pred.index.min():y_pred.index.max()]
#     y_pred["timestamp"] = ts.timestamp 
#     y_pred["station"] = station
#     y_pred["station_name"] = station_name

#     # writing predictions to S3 bucket
#     station_ = unidecode(key.lower().replace(" ", "_").replace("processed/", ""))
#     buffer = io.StringIO()
#     y_pred.to_csv(buffer)
#     s3_resource.Object("cge", f"output/{station_}").put(Body=buffer.getvalue())


In [None]:
# Observed temperature over the span covered by X_test_ref, indexed by timestamp.
# NOTE(review): `cge` is not assigned in any visible cell, and `X_test_ref` only
# in commented-out code — confirm both exist before running this cell.
# NOTE(review): with integer bounds, `cge[a:b]` presumably slices by position
# rather than by label — verify this selects the intended rows.
test_start = X_test_ref.index[0]
test_end = X_test_ref.index[-1]
observed = cge[test_start:test_end]
test = observed[["timestamp", "temperature"]].set_index("timestamp")
test.plot()

In [None]:
# Parse the index labels of `test` into datetimes, then display the frame.
parsed_index = pd.to_datetime(test.index)
test.index = parsed_index
test

In [None]:
# Parse the timestamps into a new frame rather than assigning through the
# attribute, since `ts` may itself be a slice of another frame.
ts = ts.assign(timestamp=pd.to_datetime(ts["timestamp"]))

# The bounds come from y_pred's index, i.e. they are row labels, so select with
# .loc (label-based, end inclusive). A plain `ts[a:b]` with integer bounds
# slices by position and can pick the wrong rows or none at all.
# NOTE(review): assumes ts and y_pred share the same row labels — confirm.
pred_timestamps = ts.loc[y_pred.index[0]:y_pred.index[-1], "timestamp"]

# The assignment aligns on the index; rows left without a timestamp (or with any
# other missing value) are dropped.
y_pred = y_pred.assign(timestamp=pred_timestamps).dropna()
y_pred

In [None]:
# Register matplotlib's concise date converter for every datetime-like type
# so the time axes get compact tick labels.
converter = mdates.ConciseDateConverter()
munits.registry[np.datetime64] = converter
munits.registry[datetime.date] = converter
munits.registry[datetime.datetime] = converter

forecast_steps = 6                 # hourly steps laid out for each y_pred row
past_data = 12*3                   # hours of observations drawn before each forecast
one_hour = pd.Timedelta(hours=1)   # passed as an offset object instead of the "H" string alias


def _forecast_frame(row_label) -> pd.DataFrame:
    """
    Reshape one row of ``y_pred`` into a single "temperature" column.

    Parameters
    ----------
    row_label
        Index label of the row of ``y_pred`` to reshape.

    Returns
    -------
    pandas data frame
        One "temperature" column with ``forecast_steps`` rows, indexed by
        hourly timestamps that start one hour after the row's own timestamp.
    """
    row = y_pred.loc[[row_label]]
    # The first laid-out step is placed one hour after the row's timestamp.
    start_date = row.timestamp.unique()[0] + np.timedelta64(1, 'h')
    frame = row.drop("timestamp", axis=1).T
    frame.index = pd.date_range(start=start_date, periods=forecast_steps, freq=one_hour)
    frame.columns = ["temperature"]
    return frame


# One figure per forecast origin: recent observations, the observed values
# over the forecast window, and the predicted values.
for i in y_pred.index[-200:-20]:
    pred = _forecast_frame(i)
    first_hour = pred.index[0]

    train_plot = test.loc[first_hour - past_data * one_hour:first_hour - one_hour]
    test_plot = test.loc[first_hour:pred.index[-1]]

    fig, ax = plt.subplots(figsize=(9,3))
    ax.plot(train_plot, label=f"Observed (last {past_data} hours)", marker=".", markersize=10, alpha=0.8)
    ax.plot(test_plot, label="Test", ls="", marker=".", markersize=10, color="forestgreen", alpha=0.8)
    ax.plot(pred, label="Predicted", ls="", marker="X", markersize=6, color="orangered", alpha=0.8)
    ax.legend()
    ax.grid(alpha=0.3)
    plt.show()
    # Release the figure; the loop would otherwise keep up to 180 of them open.
    plt.close(fig)

# Root-mean-squared error of every forecast window against the observations.
mse = []
skipped = 0
for i in y_pred.index:
    pred = _forecast_frame(i)
    test_plot = test.loc[pred.index[0]:pred.index[-1]]
    try:
        # Take the square root explicitly instead of passing `squared=False`,
        # which newer scikit-learn releases no longer accept.
        rmse = np.sqrt(mean_squared_error(test_plot, pred))
    except ValueError:
        # Presumably a window whose observations are incomplete (length
        # mismatch or NaN); skip it, but keep count instead of hiding it.
        skipped += 1
        continue
    mse.append(rmse)

if skipped:
    print(f"Skipped {skipped} forecast windows that could not be scored")

In [None]:
# `mse` holds one root-mean-squared error per scored forecast window, so the
# heading names the metric that is actually reported.
print("RMSE")
if len(mse) == 0:
    # np.mean / np.std of an empty list would only yield NaN and a warning.
    print("No forecast window could be scored.")
else:
    print(f"Mean: {np.round(np.mean(mse), 2)}")
    print(f"Standard deviation: +-{np.round(np.std(mse), 2)}")