# Forecasting

In [87]:
import boto3
import datetime
import io
import json
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.units as munits
import numpy as np
from sklearn.metrics import mean_squared_error

In [88]:
# AWS credentials to read files on S3 bucket.
# The JSON file is expected to provide the "Access key ID" and
# "Secret access key" entries used below. The context manager closes the file
# handle as soon as the credentials have been parsed.
with open('../credentials.json') as f:
    credentials = json.load(f)

# Low-level client, used for get_object downloads.
s3_client = boto3.client(
    "s3",
    aws_access_key_id=credentials["Access key ID"],
    aws_secret_access_key=credentials["Secret access key"]
    )

# Resource-level handle, used for listing bucket objects and uploading results.
s3_resource = boto3.resource(
    "s3",
    aws_access_key_id=credentials["Access key ID"],
    aws_secret_access_key=credentials["Secret access key"]
    )

In [100]:
# Maps the model abbreviation found in the output file names
# ("output/<station>_<model>.csv") to the label shown in plots and reports.
model_name = {
    "rid": "Ridge",
    "mlp": "Multi-layer Perceptron",
    "gb": "Histogram-based Gradient Boosting Regression Tree",
}

# Concise date tick labels for every time axis. The registry is global
# matplotlib state, so registering once is enough for all figures below.
converter = mdates.ConciseDateConverter()
munits.registry[np.datetime64] = converter
munits.registry[datetime.date] = converter
munits.registry[datetime.datetime] = converter

bucket = s3_resource.Bucket("cge")

# Number of observed hours drawn before each forecast window.
past_data = 24*2

# Observed temperature of the rural reference series
# ("processed/parelheiros.csv"). It is identical for every station/model
# pair, so it is downloaded and parsed once instead of once per output file.
obj_true_rural = s3_client.get_object(Bucket="cge", Key="processed/parelheiros.csv")
y_true_rural = pd.read_csv(io.BytesIO(obj_true_rural["Body"].read()))
y_true_rural = y_true_rural[["timestamp", "temperature"]].dropna()
y_true_rural["timestamp"] = pd.to_datetime(y_true_rural["timestamp"])
y_true_rural = y_true_rural.set_index("timestamp")

# getting predictions data
prefix_objs = bucket.objects.filter(Prefix="output")
keys = [obj.key for obj in prefix_objs]
# NOTE(review): the first listed key is skipped -- presumably the "output/"
# folder placeholder object. Confirm, otherwise one prediction file is lost.
for key in keys[1:]:
    # Keys look like "output/<station>_<model>.csv".
    name_parts = key.replace("output/", "").replace(".csv", "").split("_")
    station = name_parts[0]
    model = name_parts[1]

    obj_true = s3_client.get_object(Bucket="cge", Key=f"processed/{station}.csv")
    obj_pred = s3_client.get_object(Bucket="cge", Key=key)
    obj_pred_rural = s3_client.get_object(Bucket="cge", Key=f"output/parelheiros_{model}.csv")

    # Observed temperature of the station, indexed by timestamp.
    y_true = pd.read_csv(io.BytesIO(obj_true["Body"].read()))
    y_true = y_true[["timestamp", "temperature"]].dropna()
    y_true["timestamp"] = pd.to_datetime(y_true["timestamp"])
    y_true = y_true.set_index("timestamp")

    # Forecasts of the station: one row per forecast origin; the columns left
    # after dropping the metadata are the forecast horizons.
    y_pred = pd.read_csv(io.BytesIO(obj_pred["Body"].read()), index_col=0)
    station_name = y_pred.station_name.unique()[0]
    y_pred = y_pred.drop(["station", "station_name"], axis=1)
    y_pred["timestamp"] = pd.to_datetime(y_pred["timestamp"])

    # Forecasts of the rural reference produced by the same model.
    y_pred_rural = pd.read_csv(io.BytesIO(obj_pred_rural["Body"].read()), index_col=0)
    y_pred_rural = y_pred_rural.drop(["station", "station_name"], axis=1)
    y_pred_rural["timestamp"] = pd.to_datetime(y_pred_rural["timestamp"])

    mse = []
    for i in y_pred.index:
        # Turn the forecast row into a 6-step hourly series that starts one
        # hour after the row's timestamp.
        pred = y_pred.loc[[i]]
        start_date = pred.timestamp.unique()[0] + np.timedelta64(1, "h")
        pred = pred.drop("timestamp", axis=1).T
        pred.index = pd.date_range(start=start_date, periods=6, freq="H")
        pred.columns = ["temperature"]

        # NOTE(review): the rural row is selected by `start_date` (row
        # timestamp + 1h) while the station row above is shifted by one hour
        # from its own timestamp; the two series may be misaligned by one
        # hour -- confirm the intended timestamp convention.
        pred_rural = y_pred_rural[y_pred_rural.timestamp == start_date]
        if len(pred_rural) != 1:
            # Without exactly one matching rural forecast the transpose below
            # cannot be labelled as a single column and would abort the whole
            # run with an opaque "Length mismatch" error; skip this window.
            print(
                f"Skipping {station}/{model} at {start_date}: "
                f"expected 1 rural forecast row, found {len(pred_rural)}"
            )
            continue
        pred_rural = pred_rural.drop("timestamp", axis=1).T
        pred_rural.index = pd.date_range(start=start_date, periods=6, freq="H")
        pred_rural.columns = ["temperature_rural"]

        # UHII is computed as station temperature minus rural temperature.
        pred_uhii = pred.merge(pred_rural, left_index=True, right_index=True)
        pred_uhii["uhii"] = pred_uhii.temperature - pred_uhii.temperature_rural
        pred_uhii = pred_uhii.drop(["temperature", "temperature_rural"], axis=1)

        # Observed UHII over the `past_data` hours preceding the forecast.
        train_plot = y_true[pred.index[0]-np.timedelta64(past_data, "h"):pred.index[0]-np.timedelta64(1, "h")]
        train_plot_rural = y_true_rural[pred.index[0]-np.timedelta64(past_data, "h"):pred.index[0]-np.timedelta64(1, "h")]
        train_plot_rural.columns = ["temperature_rural"]
        train_plot_uhii = train_plot.merge(train_plot_rural, left_index=True, right_index=True)
        train_plot_uhii["uhii"] = train_plot_uhii.temperature - train_plot_uhii.temperature_rural
        train_plot_uhii = train_plot_uhii.drop(["temperature", "temperature_rural"], axis=1)

        # Observed UHII over the forecast window itself.
        test_plot = y_true[pred.index[0]:pred.index[-1]]
        test_plot_rural = y_true_rural[pred.index[0]:pred.index[-1]]
        test_plot_rural.columns = ["temperature_rural"]
        test_plot_uhii = test_plot.merge(test_plot_rural, left_index=True, right_index=True)
        test_plot_uhii["uhii"] = test_plot_uhii.temperature - test_plot_uhii.temperature_rural
        test_plot_uhii = test_plot_uhii.drop(["temperature", "temperature_rural"], axis=1)

        fig, ax = plt.subplots(figsize=(9,3))
        ax.plot(train_plot_uhii, label=f"Observed (last {past_data} hours)", marker=".", markersize=8, alpha=0.8)
        ax.plot(test_plot_uhii, label="Test", ls="", marker=".", markersize=8, color="forestgreen", alpha=0.8)
        ax.plot(pred_uhii, label="Predicted", ls="", marker="X", markersize=5, color="orangered", alpha=0.8)
        ax.set_ylabel("UHII (°C)")
        ax.legend()
        ax.grid(alpha=0.3)
        plt.title(f"Station: {station_name} / Model: {model_name[model]}")

        # uploading plots
        img_data = io.BytesIO()
        plt.savefig(img_data, format="png", dpi=300, bbox_inches="tight")
        img_data.seek(0)
        station_model_dt = station + "_" + model + "_" + str(pred_uhii.index[0]).replace("-", "").replace(" ", "").replace(":", "")
        bucket.put_object(Body=img_data, ContentType="image/png", Key=f"figures/{station}/{station_model_dt}")
        # Close this specific figure so figures do not pile up in memory.
        plt.close(fig)

        try:
            mse.append(mean_squared_error(test_plot_uhii, pred_uhii))
        except ValueError:
            # Raised by scikit-learn when the observed window has a different
            # length than the forecast (missing observations) or when the
            # inputs contain NaN; such windows are left out of the score.
            pass

    # Summary statistics over all scored windows. When no window could be
    # scored `mse` is empty and both statistics are NaN.
    model_error = pd.DataFrame({
        "Station": [station_name],
        "Model": [model_name[model]],
        "MSE": [np.round(np.mean(mse), 2)],
        "MSE standard deviation": [np.round(np.std(mse), 2)]
        })

    # writing evaluation metrics to S3 bucket
    station_model = station + "_" + model
    buffer = io.StringIO()
    model_error.to_csv(buffer, index=False)
    s3_resource.Object("cge", f"evaluation/{station_model}.csv").put(Body=buffer.getvalue())