# Forecast Analysis
## Imports

In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as sts

from io import StringIO

import os
import datetime

## Load data

In [2]:
prediction_data = pd.DataFrame()

for file in os.listdir("prediction-data"):
    if "024" in file: # ONLY RUN ON 24 HOURS
        with open(f"prediction-data/{file}") as f:
            data = f.read()
            curr_data = pd.read_csv(StringIO(data), low_memory=False, sep=" ")
            prediction_data = pd.concat([prediction_data, curr_data])
    
prediction_data.columns = prediction_data.columns.str.strip()
prediction_data.set_index("date", inplace=True)
prediction_data.index = pd.to_datetime(prediction_data.index)
prediction_data: pd.DataFrame = prediction_data.apply(pd.to_numeric, errors='coerce')
prediction_data.loc[:, "loc_nr"] = prediction_data["loc_nr"].astype(str).str.slice(1).astype(int)
prediction_data.set_index("loc_nr", append=True, inplace=True)
prediction_data.loc[:, "mean_pred"] = prediction_data[[f"E{i + 1}" for i in range(50)]].mean(axis=1)
prediction_data.to_hdf("prediction_data.hdf5", "prediction_data")
prediction_data

Unnamed: 0_level_0,Unnamed: 1_level_0,det_run,E1,E2,E3,E4,E5,E6,E7,E8,E9,...,E43,E44,E45,E46,E47,E48,E49,E50,E51,mean_pred
date,loc_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2016-04-01,260,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0.16
2016-04-02,260,1,1,5,2,3,3,1,1,2,1,...,1,2,1,0,3,0,2,0,3,2.66
2016-04-03,260,34,20,8,22,19,26,19,28,12,25,...,12,10,37,9,17,38,11,19,13,25.90
2016-04-04,260,17,44,54,29,29,44,54,9,61,22,...,42,88,16,38,46,19,60,7,66,33.40
2016-04-05,260,30,18,8,24,58,36,41,30,43,25,...,40,8,51,52,18,30,30,15,33,31.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-04-26,240,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.00
2021-04-27,240,0,0,0,0,2,0,0,1,0,1,...,0,0,0,0,0,0,0,0,1,0.36
2021-04-28,240,41,27,10,38,29,5,31,14,32,20,...,48,33,11,19,30,13,24,3,10,22.82
2021-04-29,240,95,84,51,121,47,59,103,97,41,85,...,97,50,118,115,101,123,85,131,72,85.94


In [21]:
true_weather = pd.read_hdf("weather_data.hdf5", "measured_data")
rainfall: pd.DataFrame = true_weather["RH-fix"]
rainfall = rainfall.groupby([pd.Grouper(freq="D", level=0), rainfall.index.get_level_values(1)]).sum()
daily_rainfall = rainfall.rolling(5).sum()
daily_rainfall.index.rename(["date", "loc_nr"], inplace=True)
daily_rainfall.index = pd.MultiIndex.from_arrays([daily_rainfall.index.get_level_values(0) - datetime.timedelta(days=5), daily_rainfall.index.get_level_values(1)])
daily_rainfall = daily_rainfall * 10
daily_rainfall

date        loc_nr
1950-12-27  240         NaN
            260         NaN
            310         NaN
1950-12-28  240         NaN
            260        48.0
                      ...  
2022-05-31  260       546.0
            310       755.0
2022-06-01  240       799.0
            260       848.0
            310       735.0
Name: RH-fix, Length: 78267, dtype: float64

## Compare data

In [22]:
combined_data = prediction_data.join(daily_rainfall)
combined_data.loc[:, "det-difference"] = combined_data["RH-fix"] - combined_data["det_run"]
combined_data.loc[:, "pred-difference"] = combined_data["RH-fix"] - combined_data["mean_pred"]
combined_data

Unnamed: 0_level_0,Unnamed: 1_level_0,det_run,E1,E2,E3,E4,E5,E6,E7,E8,E9,...,E46,E47,E48,E49,E50,E51,mean_pred,RH-fix,det-difference,pred-difference
date,loc_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2016-04-01,260,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.16,129.0,129.0,128.84
2016-04-02,260,1,1,5,2,3,3,1,1,2,1,...,0,3,0,2,0,3,2.66,185.0,184.0,182.34
2016-04-03,260,34,20,8,22,19,26,19,28,12,25,...,9,17,38,11,19,13,25.90,74.0,40.0,48.10
2016-04-04,260,17,44,54,29,29,44,54,9,61,22,...,38,46,19,60,7,66,33.40,22.0,5.0,-11.40
2016-04-05,260,30,18,8,24,58,36,41,30,43,25,...,52,18,30,30,15,33,31.30,87.0,57.0,55.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-04-26,240,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.00,268.0,268.0,268.00
2021-04-27,240,0,0,0,0,2,0,0,1,0,1,...,0,0,0,0,0,1,0.36,3.0,3.0,2.64
2021-04-28,240,41,27,10,38,29,5,31,14,32,20,...,19,30,13,24,3,10,22.82,7.0,-34.0,-15.82
2021-04-29,240,95,84,51,121,47,59,103,97,41,85,...,115,101,123,85,131,72,85.94,116.0,21.0,30.06


In [23]:
smse_det = np.sqrt((combined_data["det-difference"] ** 2).mean())
smse_pred = np.sqrt((combined_data["pred-difference"] ** 2).mean())
f"{smse_det=}, {smse_pred=}"

'smse_det=176.44352501640756, smse_pred=175.75423280301402'

In [27]:
mae_det = np.abs(combined_data["det-difference"]).mean()
mae_pred = np.abs(combined_data["pred-difference"]).mean()
f"{mae_det=}, {mae_pred=}"

'mae_det=101.39748427672959, mae_pred=100.71249236298304'

In [45]:
smae_det = (np.abs(combined_data["det-difference"])/((np.abs(combined_data["det_run"]) + np.abs(combined_data["RH-fix"])/2))).mean()
smae_pred = (np.abs(combined_data["pred-difference"])/((np.abs(combined_data["mean_pred"]) + np.abs(combined_data["RH-fix"])/2))).mean()
f"{smae_det=}, {smae_pred=}"

'smae_det=1.3610628693789328, smae_pred=1.2740010620746915'

Symmetric Mean Absolute (geen percentage) Error SMAE