In [38]:
# Analysis of the output of experiments

In [60]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [61]:
import datetime
import os
import pathlib

import pandas as pd
import altair as alt
import numpy as np
import pvlib

import plotly.express as px

alt.data_transformers.disable_max_rows()


def _(df, *args, **kwargs):
    print(len(df))
    display(df.head(*args, **kwargs))

In [62]:
# It's always annoying to set the working directory: we use an environment variable defined in the Makefile.
CWD = os.environ.get("CWD")
if CWD:
    os.chdir(CWD)

In [63]:
%pwd

'/home/zak/pv-site-prediction'

In [64]:
def round_to(x, to):
    return round(x / to) * to

In [65]:
EXP_NAMES = ["exc_t_nwpF_excF",
            "exc_t_nwpT_excF",
            "exc_t_nwpF_excT",
            "exc_t_nwpT_excT"]

In [66]:
COLORS = [
    "#086788",
    "#4c9a8e",
    "#ff9736",
    "#ffd053",
    #    "#63bcaf",
    #    "#e4e4e4",
    #    "#ffac5f",
    "#7bcdf3",
    "#14120e",
]


REMOVE_WEIRD_PV_IDS = False

# This rounding is only used in some charts.
ROUND_HORIZON_TO = 4

ROUND_PRED_TO = 60
NIGHT_THRESHOLD = 0.0

NORMALIZE_BY = "capacity"
METRIC = "mae"

In [67]:
def load_exp(names):
    dfs = []
    for name in names:
        df = None
        for ext in [".csv.gz", ".csv"]:
            try:
                df = pd.read_csv(f"exp_results/{name}/test_errors{ext}")
            except FileNotFoundError as e:
                # print(e)
                continue

        if df is None:
            print(f"Could not find data for model {name}")
            continue
        df["model"] = name

        print(name)
        _(df)

        df = df[~df["y"].isna()]

        # Join the mean "y" per pv_id.
        mean_y = df[["y", "pv_id"]].groupby("pv_id").mean()  # )#df["y"].mean()
        mean_y.columns = ["mean_y"]
        df = df.join(mean_y, on="pv_id")

        # If a capacity is not already provided, define quantile(99%) as the capacity, per pv_id.
        if "capacity" not in df.columns:
            q99 = df[["y", "pv_id"]].groupby("pv_id").quantile(0.99)
            q99.columns = ["capacity"]
            df = df.join(q99, on="pv_id")

        dfs.append(df)

    df = pd.concat(dfs)
    return df


def augment_data(df):
    df["ts"] = pd.to_datetime(df["ts"])
    df = df[df["metric"] == "mae"]

    df["pred_ts"] = df["ts"] + pd.to_timedelta(df["horizon"], unit="minute")

    # We have heuristics to remove weird test PVs that have data at night.
    # In the `uk_pv` dataset there are quite a few of those.
    # Note that this is pretty much hard-coded for that use-case.
    if REMOVE_WEIRD_PV_IDS:
        night_mask = (df["pred_ts"].dt.hour < 1) | (df["pred_ts"].dt.hour > 20)
        bad_pvs = df[night_mask & df["y"] > 0]
        bad_pvs = bad_pvs[["pv_id", "y"]].groupby("pv_id").count().reset_index()
        bad_pvs = bad_pvs[bad_pvs["y"] > 10]
        # _(bad_pvs)

        remove_pvs = bad_pvs["pv_id"].unique()

        print(f"REMOVING PVS WITH NIGHT DATA: {remove_pvs}")
        df = df[~df["pv_id"].isin(remove_pvs)]

    if METRIC == "mae":
        pass
    elif METRIC == "mbe":
        # For mean bias error, recompute the errors as we don't compute that one by default.
        df["error"] = df["y"] - df["pred"]

    # df = df.join(inferred_meta[["factor", "capacity"]], on="pv_id")
    # df = df.join(meta[['latitude', 'longitude']], on="pv_id")

    # Normalizing by the average `y` values for each PV.
    if NORMALIZE_BY == "mean":
        df["weighted_error"] = df["error"] / df["mean_y"] * 100.0
    elif NORMALIZE_BY == "capacity":
        df["weighted_error"] = df["error"] / df["capacity"] * 100.0
    elif NORMALIZE_BY == "nothing":
        df["weighted_error"] = df["error"]
    else:
        raise ValueError("unknown NORMALIZE_BY value")

    df["horizon"] = df["horizon"] / 60.0
    df = df[~df["error"].isnull()]

    # Round the prediction hour and the horizon for more concise charts.
    df["pred_hour"] = df["pred_ts"].dt.hour * 60 + round_to(df["pred_ts"].dt.minute, ROUND_PRED_TO)

    df = df[df["y"] > NIGHT_THRESHOLD]

    return df

In [68]:
error_name = METRIC.upper()
if NORMALIZE_BY == "mean":
    error_name += " / avg  (%)"
elif NORMALIZE_BY == "capacity":
    error_name += " / capacity (%)"
elif NORMALIZE_BY == "nothing":
    error_name += " (MW)"
else:
    raise ValueError("unknown NORMALIZE_BY value")

In [69]:
df = load_exp(EXP_NAMES)
_(df, 20)
# _(df.sample(20), 20)
df = augment_data(df)
_(df, 20)
# _(df.sample(20), 20)

exc_t_nwpF_excF
96000


Unnamed: 0,pv_id,ts,ts_start,ts_end,metric,error,horizon,y,pred,train_date,capacity,model
0,0,2020-05-20 00:00:00,2020-05-20 00:00:00,2020-05-20 01:00:00,mae,0.005753,0,0.005753,0.0,2019-12-31,172.6125,exc_t_nwpF_excF
1,0,2020-05-20 00:00:00,2020-05-20 01:00:00,2020-05-20 02:00:00,mae,0.019286,60,0.019286,0.0,2019-12-31,172.6125,exc_t_nwpF_excF
2,0,2020-05-20 00:00:00,2020-05-20 02:00:00,2020-05-20 03:00:00,mae,1.432559,120,1.432559,0.0,2019-12-31,172.6125,exc_t_nwpF_excF
3,0,2020-05-20 00:00:00,2020-05-20 03:00:00,2020-05-20 04:00:00,mae,6.914863,180,6.914863,0.0,2019-12-31,172.6125,exc_t_nwpF_excF
4,0,2020-05-20 00:00:00,2020-05-20 04:00:00,2020-05-20 05:00:00,mae,5.422275,240,17.81311,12.390835,2019-12-31,172.6125,exc_t_nwpF_excF


exc_t_nwpT_excF
96000


Unnamed: 0,pv_id,ts,ts_start,ts_end,metric,error,horizon,y,pred,train_date,capacity,model
0,0,2020-05-20 00:00:00,2020-05-20 00:00:00,2020-05-20 01:00:00,mae,0.005753,0,0.005753,0.0,2019-12-31,172.6125,exc_t_nwpT_excF
1,0,2020-05-20 00:00:00,2020-05-20 01:00:00,2020-05-20 02:00:00,mae,0.019286,60,0.019286,0.0,2019-12-31,172.6125,exc_t_nwpT_excF
2,0,2020-05-20 00:00:00,2020-05-20 02:00:00,2020-05-20 03:00:00,mae,1.432559,120,1.432559,0.0,2019-12-31,172.6125,exc_t_nwpT_excF
3,0,2020-05-20 00:00:00,2020-05-20 03:00:00,2020-05-20 04:00:00,mae,6.914863,180,6.914863,0.0,2019-12-31,172.6125,exc_t_nwpT_excF
4,0,2020-05-20 00:00:00,2020-05-20 04:00:00,2020-05-20 05:00:00,mae,9.139593,240,17.81311,8.673517,2019-12-31,172.6125,exc_t_nwpT_excF


exc_t_nwpF_excT
96000


Unnamed: 0,pv_id,ts,ts_start,ts_end,metric,error,horizon,y,pred,train_date,capacity,model
0,0,2020-05-20 00:00:00,2020-05-20 00:00:00,2020-05-20 01:00:00,mae,0.005753,0,0.005753,0.0,2019-12-31,172.6125,exc_t_nwpF_excT
1,0,2020-05-20 00:00:00,2020-05-20 01:00:00,2020-05-20 02:00:00,mae,0.019286,60,0.019286,0.0,2019-12-31,172.6125,exc_t_nwpF_excT
2,0,2020-05-20 00:00:00,2020-05-20 02:00:00,2020-05-20 03:00:00,mae,1.432559,120,1.432559,0.0,2019-12-31,172.6125,exc_t_nwpF_excT
3,0,2020-05-20 00:00:00,2020-05-20 03:00:00,2020-05-20 04:00:00,mae,6.914863,180,6.914863,0.0,2019-12-31,172.6125,exc_t_nwpF_excT
4,0,2020-05-20 00:00:00,2020-05-20 04:00:00,2020-05-20 05:00:00,mae,8.798513,240,17.81311,9.014597,2019-12-31,172.6125,exc_t_nwpF_excT


exc_t_nwpT_excT
842112


Unnamed: 0,pv_id,ts,ts_start,ts_end,metric,error,horizon,y,pred,train_date,capacity,model
0,0,2020-01-01 00:00:00,2020-01-01 00:00:00,2020-01-01 01:00:00,mae,0.003363,0,0.003363,0.0,2019-12-31,153.761,exc_t_nwpT_excT
1,0,2020-01-01 00:00:00,2020-01-01 01:00:00,2020-01-01 02:00:00,mae,0.003309,60,0.003309,0.0,2019-12-31,153.761,exc_t_nwpT_excT
2,0,2020-01-01 00:00:00,2020-01-01 02:00:00,2020-01-01 03:00:00,mae,0.002838,120,0.002838,0.0,2019-12-31,153.761,exc_t_nwpT_excT
3,0,2020-01-01 00:00:00,2020-01-01 03:00:00,2020-01-01 04:00:00,mae,0.003544,180,0.003544,0.0,2019-12-31,153.761,exc_t_nwpT_excT
4,0,2020-01-01 00:00:00,2020-01-01 04:00:00,2020-01-01 05:00:00,mae,0.003119,240,0.003119,0.0,2019-12-31,153.761,exc_t_nwpT_excT


1129866


Unnamed: 0,pv_id,ts,ts_start,ts_end,metric,error,horizon,y,pred,train_date,capacity,model,mean_y
0,0,2020-05-20 00:00:00,2020-05-20 00:00:00,2020-05-20 01:00:00,mae,0.005753,0,0.005753,0.0,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747
1,0,2020-05-20 00:00:00,2020-05-20 01:00:00,2020-05-20 02:00:00,mae,0.019286,60,0.019286,0.0,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747
2,0,2020-05-20 00:00:00,2020-05-20 02:00:00,2020-05-20 03:00:00,mae,1.432559,120,1.432559,0.0,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747
3,0,2020-05-20 00:00:00,2020-05-20 03:00:00,2020-05-20 04:00:00,mae,6.914863,180,6.914863,0.0,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747
4,0,2020-05-20 00:00:00,2020-05-20 04:00:00,2020-05-20 05:00:00,mae,5.422275,240,17.81311,12.390835,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747
5,0,2020-05-20 00:00:00,2020-05-20 05:00:00,2020-05-20 06:00:00,mae,1.604284,300,36.10038,34.496096,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747
6,0,2020-05-20 00:00:00,2020-05-20 06:00:00,2020-05-20 07:00:00,mae,13.92442,360,60.96847,47.04405,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747
7,0,2020-05-20 00:00:00,2020-05-20 07:00:00,2020-05-20 08:00:00,mae,12.411248,420,84.63024,72.218992,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747
8,0,2020-05-20 00:00:00,2020-05-20 08:00:00,2020-05-20 09:00:00,mae,9.929763,480,103.488,93.558237,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747
9,0,2020-05-20 00:00:00,2020-05-20 09:00:00,2020-05-20 10:00:00,mae,16.067537,540,113.6683,97.600763,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747


1129803


Unnamed: 0,pv_id,ts,ts_start,ts_end,metric,error,horizon,y,pred,train_date,capacity,model,mean_y,pred_ts,weighted_error,pred_hour
0,0,2020-05-20,2020-05-20 00:00:00,2020-05-20 01:00:00,mae,0.005753,0.0,0.005753,0.0,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747,2020-05-20 00:00:00,0.003333,0.0
1,0,2020-05-20,2020-05-20 01:00:00,2020-05-20 02:00:00,mae,0.019286,1.0,0.019286,0.0,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747,2020-05-20 01:00:00,0.011173,60.0
2,0,2020-05-20,2020-05-20 02:00:00,2020-05-20 03:00:00,mae,1.432559,2.0,1.432559,0.0,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747,2020-05-20 02:00:00,0.829928,120.0
3,0,2020-05-20,2020-05-20 03:00:00,2020-05-20 04:00:00,mae,6.914863,3.0,6.914863,0.0,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747,2020-05-20 03:00:00,4.006004,180.0
4,0,2020-05-20,2020-05-20 04:00:00,2020-05-20 05:00:00,mae,5.422275,4.0,17.81311,12.390835,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747,2020-05-20 04:00:00,3.141299,240.0
5,0,2020-05-20,2020-05-20 05:00:00,2020-05-20 06:00:00,mae,1.604284,5.0,36.10038,34.496096,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747,2020-05-20 05:00:00,0.929413,300.0
6,0,2020-05-20,2020-05-20 06:00:00,2020-05-20 07:00:00,mae,13.92442,6.0,60.96847,47.04405,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747,2020-05-20 06:00:00,8.066867,360.0
7,0,2020-05-20,2020-05-20 07:00:00,2020-05-20 08:00:00,mae,12.411248,7.0,84.63024,72.218992,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747,2020-05-20 07:00:00,7.190237,420.0
8,0,2020-05-20,2020-05-20 08:00:00,2020-05-20 09:00:00,mae,9.929763,8.0,103.488,93.558237,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747,2020-05-20 08:00:00,5.752632,480.0
9,0,2020-05-20,2020-05-20 09:00:00,2020-05-20 10:00:00,mae,16.067537,9.0,113.6683,97.600763,2019-12-31,172.6125,exc_t_nwpF_excF,27.416747,2020-05-20 09:00:00,9.308443,540.0


In [70]:
df["model"] = pd.Categorical(df["model"], reversed(EXP_NAMES))
df = df.sort_values("model")

In [71]:
def print_means(df):
    mean_ = df.groupby(["model"])["weighted_error"].mean().to_dict()
    err = (
        df.groupby(["model"])["weighted_error"].std()
        * 1.96
        / np.sqrt(df.groupby(["model"])["weighted_error"].count())
    ).to_dict()

    print("Mean: ", error_name)
    for name in mean_:
        print(f"{name}: {mean_[name]:.2f} ± {err[name]:.2f}")

In [72]:
print_means(df)

Mean:  MAE / capacity (%)
exc_t_nwpT_excT: 2.33 ± 0.01
exc_t_nwpF_excT: 2.40 ± 0.03
exc_t_nwpT_excF: 2.58 ± 0.03
exc_t_nwpF_excF: 3.13 ± 0.04


In [73]:
color_scale = alt.Scale(range=COLORS)

In [74]:
gt = df[["y", "pred_hour"]].copy()
gt = gt.groupby(["pred_hour"]).mean().reset_index()
gt["pred_hour"] = pd.to_timedelta(gt["pred_hour"], unit="minute")
gt["date"] = pd.Timestamp(2023, 1, 1)
gt["pred_hour"] = gt["date"] + gt["pred_hour"]
gt["y"] /= 10
del gt["date"]

# _(df)
data = df[["model", "horizon", "weighted_error", "pred_hour"]].copy()
# data = data[(data['horizon'] % 4).isin([0, 1, 3])]
data["horizon"] = (data["horizon"] // 4) * 4
# data['pred_hour'] = round(data['pred_hour'] / 60) * 60
# _(data)
data = data.groupby(["model", "horizon", "pred_hour"]).agg(["mean", "std", "count"]).reset_index()
data["error"] = data[("weighted_error", "mean")]
err = 1.96 * data[("weighted_error", "std")] / data[("weighted_error", "count")].pow(0.5)
data["low"] = data[("weighted_error", "mean")] - err
data["high"] = data[("weighted_error", "mean")] + err
del data["weighted_error"]
data.columns = data.columns.get_level_values(0)
# _(data)

data["pred_hour"] = pd.to_timedelta(data["pred_hour"], unit="minute")
data["date"] = pd.Timestamp(2023, 1, 1)
data["pred_hour"] = data["date"] + data["pred_hour"]

# del data['pred_hour']
del data["date"]
data = data.sort_values(["model", "pred_hour", "horizon"])

h = data["horizon"].astype(int)
data["h_label"] = "[ " + h.astype(str) + ", " + (h + 4).astype(str) + " ["
_(data)

c = alt.Chart()

line = c.mark_line(point=True).encode(
    x=alt.X("hoursminutes(pred_hour)", title="Prediction time of day"),
    y=alt.Y("error", title=error_name),
    color=alt.Color("model:N", title="Model", sort=EXP_NAMES, scale=color_scale),
    # facet=alt.Facet("horizon:O", title='Horizon', spacing=10, columns=5),
)

error = c.mark_errorband().encode(
    x=alt.X("hoursminutes(pred_hour)", title=""),
    y=alt.Y("high", title=""),
    y2=alt.Y2("low", title=""),
    color=alt.Color("model", sort=EXP_NAMES, scale=color_scale),
)

# gt_chart = (
#     alt.Chart(gt)
#     .mark_line(color='black')
#     .encode(
#         x='hoursminutes(pred_hour)',
#         y='y',
#     )
# )

c = (
    alt.layer(line, error, data=data)
    .properties(
        height=80,
        width=140,
    )
    .facet(alt.Facet("h_label", title="Horizon", sort=alt.EncodingSortField("horizon")), columns=4)
    .configure_point(size=0)
)
c

1152


Unnamed: 0,model,horizon,pred_hour,error,low,high,h_label
0,exc_t_nwpT_excT,0.0,2023-01-01,0.002603,0.002569,0.002638,"[ 0, 4 ["
24,exc_t_nwpT_excT,4.0,2023-01-01,0.002605,0.00257,0.002639,"[ 4, 8 ["
48,exc_t_nwpT_excT,8.0,2023-01-01,0.002605,0.00257,0.002639,"[ 8, 12 ["
72,exc_t_nwpT_excT,12.0,2023-01-01,0.002605,0.00257,0.002639,"[ 12, 16 ["
96,exc_t_nwpT_excT,16.0,2023-01-01,0.002605,0.00257,0.002639,"[ 16, 20 ["


In [75]:
data = df[df["model"] == EXP_NAMES[0]].copy()
data = data[["horizon", "weighted_error", "pred_hour"]]

data["horizon"] = (data["horizon"] // 4) * 4

data = data.groupby(["horizon", "pred_hour"]).agg(["mean", "std", "count"]).reset_index()

data["error"] = data[("weighted_error", "mean")]
err = 1.96 * data[("weighted_error", "std")] / data[("weighted_error", "count")].pow(0.5)
data["low"] = data[("weighted_error", "mean")] - err
data["high"] = data[("weighted_error", "mean")] + err
del data["weighted_error"]
data.columns = data.columns.get_level_values(0)

data["pred_hour"] = pd.to_timedelta(data["pred_hour"], unit="minute")
data["date"] = pd.Timestamp(2023, 1, 1)
data["pred_hour"] = data["date"] + data["pred_hour"]

del data["date"]

h = data["horizon"].astype(int)
data["h_label"] = "[ " + h.astype(str) + ", " + (h + 4).astype(str) + " ["
_(data)

c = alt.Chart()

line = c.mark_line(point=True).encode(
    x=alt.X("hoursminutes(pred_hour)", title="Prediction time of day"),
    y=alt.Y("error", title=error_name),
    color=alt.Color("horizon"),
)

error = c.mark_errorband().encode(
    x=alt.X("hoursminutes(pred_hour)", title=""),
    y=alt.Y("high", title=""),
    y2=alt.Y2("low", title=""),
    color=alt.Color("horizon"),
)

# gt_chart = (
#     alt.Chart(gt)
#     .mark_line(color='black')
#     .encode(
#         x='hoursminutes(pred_hour)',
#         y='y',
#     )
# )

c = (
    alt.layer(line, data=data)
    .properties(
        height=200,
        width=500,
    )
    .configure_point(size=0)
)
c

288


Unnamed: 0,horizon,pred_hour,error,low,high,h_label
0,0.0,2023-01-01 00:00:00,0.002635,0.002533,0.002737,"[ 0, 4 ["
1,0.0,2023-01-01 01:00:00,0.003805,0.003368,0.004242,"[ 0, 4 ["
2,0.0,2023-01-01 02:00:00,0.145329,0.112846,0.177813,"[ 0, 4 ["
3,0.0,2023-01-01 03:00:00,0.79305,0.648164,0.937936,"[ 0, 4 ["
4,0.0,2023-01-01 04:00:00,1.805851,1.544765,2.066937,"[ 0, 4 ["


In [76]:
data = df[["model", "horizon", "weighted_error"]].copy()
data["horizon"] = round_to(data["horizon"], 1)
data = data.groupby(["model", "horizon"]).agg(["mean", "std", "count"]).reset_index()

data["error"] = data[("weighted_error", "mean")]
err = 1.96 * data[("weighted_error", "std")] / data[("weighted_error", "count")].pow(0.5)
data["high"] = data["error"] + err
data["low"] = data["error"] - err

data.columns = data.columns.get_level_values(0)

_(data)


line = (
    alt.Chart(data[["model", "error", "horizon"]])
    .mark_line(interpolate="step-after", point=True)
    .encode(
        y=alt.Y("error", title=error_name, scale=alt.Scale(zero=False)),
        color=alt.Color("model", sort=EXP_NAMES, scale=color_scale),
        x=alt.X("horizon:O", title="Horizon"),
    )
)

error = (
    alt.Chart(data[["model", "horizon", "high", "low"]])
    .mark_errorband(interpolate="step-after", opacity=0.15)
    .encode(
        x="horizon:O",
        y2="low",
        y=alt.Y("high", title=""),
        color=alt.Color("model", sort=EXP_NAMES, scale=color_scale),
    )
)

c = (
    alt.layer(line, error).properties(
        height=250,
        width=700,
    )
    # hack to get the opacity=1 legend without the points!
    .configure_point(size=0)
)

c

192


Unnamed: 0,model,horizon,weighted_error,weighted_error.1,weighted_error.2,error,high,low
0,exc_t_nwpT_excT,0.0,2.20216,4.003531,17539,2.20216,2.261412,2.142909
1,exc_t_nwpT_excT,1.0,2.198525,4.006891,17539,2.198525,2.257826,2.139224
2,exc_t_nwpT_excT,2.0,2.208964,4.033839,17539,2.208964,2.268664,2.149264
3,exc_t_nwpT_excT,3.0,2.217144,4.041445,17539,2.217144,2.276956,2.157332
4,exc_t_nwpT_excT,4.0,2.221122,4.040279,17539,2.221122,2.280917,2.161327


In [77]:
data = df[["model", "pred_ts", "weighted_error"]].copy()

data["year_month"] = data["pred_ts"].dt.strftime("%Y-%m")
del data["pred_ts"]
data = data
data = data.groupby(["model", "year_month"]).agg(["mean", "std", "count"]).reset_index()

data["error"] = data[("weighted_error", "mean")]
err = 1.96 * data[("weighted_error", "std")] / data[("weighted_error", "count")].pow(0.5)
data["high"] = data["error"] + err
data["low"] = data["error"] - err

data.columns = data.columns.get_level_values(0)

del data["weighted_error"]

_(data)
data.to_csv("patate.csv")

line = (
    alt.Chart(data[["error", "year_month", "model"]])
    .mark_line(point=True)
    .encode(
        x=alt.X("year_month:N", title=None, axis=alt.Axis(labelAngle=-45, labelFontSize=14)),
        y=alt.Y("error", title=error_name, scale=alt.Scale(zero=True)),
        color=alt.Color("model", sort=EXP_NAMES, scale=color_scale),
    )
)

error = (
    alt.Chart(data[["model", "year_month", "low", "high"]])
    .mark_errorband(opacity=0.2)
    .encode(
        x="year_month",
        y=alt.Y("low", title=""),
        y2="high",
        color=alt.Color("model", sort=EXP_NAMES, scale=color_scale),
    )
)


(line + error).properties(height=200, width=700)

100


Unnamed: 0,model,year_month,error,high,low
0,exc_t_nwpT_excT,2020-01,1.999477,2.043185,1.95577
1,exc_t_nwpT_excT,2020-02,2.21656,2.263288,2.169832
2,exc_t_nwpT_excT,2020-03,3.027054,3.084449,2.969659
3,exc_t_nwpT_excT,2020-04,2.950936,3.001966,2.899905
4,exc_t_nwpT_excT,2020-05,2.922507,2.9733,2.871715


In [78]:
# Same but per PV id.

data = df[["model", "pred_ts", "weighted_error", "pv_id"]].copy()

data["year_month"] = data["pred_ts"].dt.strftime("%Y-%m")
del data["pred_ts"]
data = data
data = data.groupby(["model", "year_month", "pv_id"]).agg(["mean", "std", "count"]).reset_index()

data["error"] = data[("weighted_error", "mean")]
err = 1.96 * data[("weighted_error", "std")] / data[("weighted_error", "count")].pow(0.5)
data["high"] = data["error"] + err
data["low"] = data["error"] - err

data.columns = data.columns.get_level_values(0)

del data["weighted_error"]

_(data)
# data.to_csv("patate.csv")

line = (
    alt.Chart()
    .mark_line(point=True)
    .encode(
        x=alt.X("year_month:N", title=None, axis=alt.Axis(labelAngle=-45, labelFontSize=14)),
        y=alt.Y("error", title=error_name, scale=alt.Scale(zero=True)),
        color=alt.Color("model", sort=EXP_NAMES, scale=color_scale),
    )
)

error = (
    alt.Chart()
    .mark_errorband(opacity=0.2)
    .encode(
        x="year_month",
        y=alt.Y("low", title=""),
        y2="high",
        color=alt.Color("model", sort=EXP_NAMES, scale=color_scale),
    )
)

(
    alt.layer(line, error, data=data)
    .properties(height=100, width=300)
    .facet(alt.Facet("pv_id"), columns=2)
)

100


Unnamed: 0,model,year_month,pv_id,error,high,low
0,exc_t_nwpT_excT,2020-01,0,1.999477,2.043185,1.95577
1,exc_t_nwpT_excT,2020-02,0,2.21656,2.263288,2.169832
2,exc_t_nwpT_excT,2020-03,0,3.027054,3.084449,2.969659
3,exc_t_nwpT_excT,2020-04,0,2.950936,3.001966,2.899905
4,exc_t_nwpT_excT,2020-05,0,2.922507,2.9733,2.871715


In [79]:
_(df)

scale = alt.Scale()

max_ = df["y"].quantile(0.99)

data = df.sample(10000)

points = (
    alt.Chart()
    .mark_circle(opacity=0.8)
    .encode(
        x=alt.X("y", scale=scale, title="ground truth"),
        y=alt.Y("pred", scale=scale, title="Prediction"),
        color=alt.Color("model", scale=color_scale, sort=EXP_NAMES),
    )
    # .properties(width=400, height=400)
)

line = (
    alt.Chart(pd.DataFrame(dict(x=[0, max_], y=[0, max_])))
    .mark_line(color="black", size=1)
    .encode(x="x", y="y")
)

(
    alt.layer(points, line, data=data)
    .properties(width=200, height=200)
    .facet("model", columns=np.ceil(np.sqrt(len(EXP_NAMES))))
)

1129803


Unnamed: 0,pv_id,ts,ts_start,ts_end,metric,error,horizon,y,pred,train_date,capacity,model,mean_y,pred_ts,weighted_error,pred_hour
277018,0,2020-08-28 11:00:00,2020-08-28 21:00:00,2020-08-28 22:00:00,mae,0.005334,10.0,0.005334,0.0,2019-12-31,177.1648,exc_t_nwpT_excT,27.659856,2020-08-28 21:00:00,0.003011,1260.0
465406,0,2021-02-07 23:00:00,2021-02-09 21:00:00,2021-02-09 22:00:00,mae,0.010631,46.0,0.010631,0.0,2019-12-31,186.7357,exc_t_nwpT_excT,27.659856,2021-02-09 21:00:00,0.005693,1260.0
465407,0,2021-02-07 23:00:00,2021-02-09 22:00:00,2021-02-09 23:00:00,mae,0.021043,47.0,0.021043,0.0,2019-12-31,186.7357,exc_t_nwpT_excT,27.659856,2021-02-09 22:00:00,0.011269,1320.0
465408,0,2021-02-08 00:00:00,2021-02-08 00:00:00,2021-02-08 01:00:00,mae,0.004174,0.0,0.004174,0.0,2019-12-31,186.7357,exc_t_nwpT_excT,27.659856,2021-02-08 00:00:00,0.002235,0.0
465409,0,2021-02-08 00:00:00,2021-02-08 01:00:00,2021-02-08 02:00:00,mae,0.005193,1.0,0.005193,0.0,2019-12-31,186.7357,exc_t_nwpT_excT,27.659856,2021-02-08 01:00:00,0.002781,60.0


In [80]:
# Error distribution

data = df[["y", "pred", "mean_y", "model"]].copy()
data["error"] = (df["pred"] - df["y"]) / df["mean_y"]
_(data)

scale = alt.Scale()  # type='log')
# df = df[df['y'] > 0]
# df = df[df['pred'] > 0]

chart = (
    alt.Chart(data.sample(10000))
    .encode(
        x=alt.X("error", scale=scale, title="Error", bin=alt.Bin(maxbins=100)),
        y=alt.Y("count()", stack=False),
        color=alt.Color("model", scale=color_scale, sort=EXP_NAMES),
    )
    .properties(width=700, height=400)
)


line = alt.Chart(pd.DataFrame(dict(x=[0]))).mark_rule().encode(x="x")

(chart.mark_line(point=True) + chart.mark_area(opacity=0.1) + line).configure_point(size=0)

1129803


Unnamed: 0,y,pred,mean_y,model,error
277018,0.005334,0.0,27.659856,exc_t_nwpT_excT,-0.000193
465406,0.010631,0.0,27.659856,exc_t_nwpT_excT,-0.000384
465407,0.021043,0.0,27.659856,exc_t_nwpT_excT,-0.000761
465408,0.004174,0.0,27.659856,exc_t_nwpT_excT,-0.000151
465409,0.005193,0.0,27.659856,exc_t_nwpT_excT,-0.000188
