# Forecast Analysis
## Imports

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as sts

from io import StringIO

import os
import datetime

## Load data

In [6]:
prediction_data = pd.DataFrame()

for file in os.listdir("prediction-data"):
    if "120" in file: # ONLY RUN ON 24 HOURS
        with open(f"prediction-data/{file}") as f:
            data = f.read()
            curr_data = pd.read_csv(StringIO(data), low_memory=False, sep=" ")
            prediction_data = pd.concat([prediction_data, curr_data])
    
prediction_data.columns = prediction_data.columns.str.strip()
prediction_data.set_index("date", inplace=True)
prediction_data.index = pd.to_datetime(prediction_data.index)
prediction_data: pd.DataFrame = prediction_data.apply(pd.to_numeric, errors='coerce')
prediction_data.loc[:, "loc_nr"] = prediction_data["loc_nr"].astype(str).str.slice(1).astype(int)
prediction_data.set_index("loc_nr", append=True, inplace=True)
prediction_data.loc[:, "mean_pred"] = prediction_data[[f"E{i + 1}" for i in range(50)]].mean(axis=1)
prediction_data.loc[:, "median_pred"] = prediction_data[[f"E{i + 1}" for i in range(50)]].median(axis=1)
prediction_data.loc[:, "mode_pred"] = prediction_data[[f"E{i + 1}" for i in range(50)]].mode(axis=1, numeric_only=True).mean(axis=1)
prediction_data.loc[:, "mode2_pred"] = prediction_data[[f"E{i + 1}" for i in range(50)]].mode(axis=1, numeric_only=True).median(axis=1)
prediction_data.loc[:, "mean_det_pred"] = prediction_data[["mean_pred", "det_run"]].mean(axis=1)
prediction_data.to_hdf("prediction_data.hdf5", "prediction_data_120h")
prediction_data

Unnamed: 0_level_0,Unnamed: 1_level_0,det_run,E1,E2,E3,E4,E5,E6,E7,E8,E9,...,E47,E48,E49,E50,E51,mean_pred,median_pred,mode_pred,mode2_pred,mean_det_pred
date,loc_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2016-04-05,260,1,0,5,0,0,14,132,2,7,22,...,0,23,16,1,1,23.90,13.0,0.0,0.0,12.45
2016-04-06,260,15,18,2,35,12,4,1,12,3,27,...,1,11,15,1,23,13.42,11.0,1.0,1.0,14.21
2016-04-07,260,70,29,35,57,5,31,46,29,76,7,...,4,1,34,40,24,32.38,25.0,7.0,7.0,51.19
2016-04-08,260,8,5,25,2,0,26,1,7,34,48,...,7,2,54,12,1,13.04,6.5,2.0,2.0,10.52
2016-04-09,260,36,21,2,6,42,20,5,0,4,79,...,59,1,7,27,8,21.46,17.0,8.0,5.0,28.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-04-30,240,11,26,74,3,8,25,26,44,30,7,...,18,40,61,1,64,17.30,11.0,7.0,7.0,14.15
2021-05-01,240,0,0,9,1,1,1,1,0,12,6,...,8,11,0,0,8,6.76,3.0,0.0,0.0,3.38
2021-05-02,240,7,18,9,7,3,10,3,0,27,5,...,12,0,0,8,17,8.14,6.0,1.5,1.5,7.57
2021-05-03,240,17,18,17,16,25,20,33,11,12,33,...,26,40,47,22,1,27.94,20.0,11.0,11.0,22.47


In [7]:
prediction_data[[f"E{i + 1}" for i in range(50)]].mode(axis=1).mean(axis=1)

date        loc_nr
2016-04-05  260         0.0
2016-04-06  260         1.0
2016-04-07  260         7.0
2016-04-08  260         2.0
2016-04-09  260         8.0
                      ...  
2021-04-30  240         7.0
2021-05-01  240         0.0
2021-05-02  240         1.5
2021-05-03  240        11.0
2021-05-04  240       117.4
Length: 5565, dtype: float64

In [8]:
true_weather = pd.read_hdf("weather_data.hdf5", "measured_data")
rainfall: pd.DataFrame = true_weather["RH-fix"]
rainfall = rainfall.groupby([pd.Grouper(freq="D", level=0), rainfall.index.get_level_values(1)]).sum()
daily_rainfall = rainfall.rolling(5).sum()
daily_rainfall.index.rename(["date", "loc_nr"], inplace=True)
daily_rainfall.index = pd.MultiIndex.from_arrays([daily_rainfall.index.get_level_values(0) - datetime.timedelta(days=5), daily_rainfall.index.get_level_values(1)])
daily_rainfall = daily_rainfall * 10
daily_rainfall

date        loc_nr
1950-12-27  240         NaN
            260         NaN
            310         NaN
1950-12-28  240         NaN
            260        48.0
                      ...  
2022-05-31  260       546.0
            310       755.0
2022-06-01  240       799.0
            260       848.0
            310       735.0
Name: RH-fix, Length: 78267, dtype: float64

## Compare data

In [9]:
combined_data = prediction_data.join(daily_rainfall)
combined_data.loc[:, "det-difference"] = combined_data["RH-fix"] - combined_data["det_run"]
combined_data.loc[:, "pred-difference"] = combined_data["RH-fix"] - combined_data["mean_pred"]
combined_data.loc[:, "med-difference"] = combined_data["RH-fix"] - combined_data["median_pred"]
combined_data.loc[:, "mode-difference"] = combined_data["RH-fix"] - combined_data["mode_pred"]
combined_data.loc[:, "mode2-difference"] = combined_data["RH-fix"] - combined_data["mode2_pred"]
combined_data.loc[:, "dp-difference"] = combined_data["RH-fix"] - combined_data["mean_det_pred"]
combined_data

Unnamed: 0_level_0,Unnamed: 1_level_0,det_run,E1,E2,E3,E4,E5,E6,E7,E8,E9,...,mode_pred,mode2_pred,mean_det_pred,RH-fix,det-difference,pred-difference,med-difference,mode-difference,mode2-difference,dp-difference
date,loc_nr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2016-04-05,260,1,0,5,0,0,14,132,2,7,22,...,0.0,0.0,12.45,87.0,86.0,63.10,74.0,87.0,87.0,74.55
2016-04-06,260,15,18,2,35,12,4,1,12,3,27,...,1.0,1.0,14.21,45.0,30.0,31.58,34.0,44.0,44.0,30.79
2016-04-07,260,70,29,35,57,5,31,46,29,76,7,...,7.0,7.0,51.19,41.0,-29.0,8.62,16.0,34.0,34.0,-10.19
2016-04-08,260,8,5,25,2,0,26,1,7,34,48,...,2.0,2.0,10.52,67.0,59.0,53.96,60.5,65.0,65.0,56.48
2016-04-09,260,36,21,2,6,42,20,5,0,4,79,...,8.0,5.0,28.73,3.0,-33.0,-18.46,-14.0,-5.0,-2.0,-25.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-04-30,240,11,26,74,3,8,25,26,44,30,7,...,7.0,7.0,14.15,233.0,222.0,215.70,222.0,226.0,226.0,218.85
2021-05-01,240,0,0,9,1,1,1,1,0,12,6,...,0.0,0.0,3.38,248.0,248.0,241.24,245.0,248.0,248.0,244.62
2021-05-02,240,7,18,9,7,3,10,3,0,27,5,...,1.5,1.5,7.57,196.0,189.0,187.86,190.0,194.5,194.5,188.43
2021-05-03,240,17,18,17,16,25,20,33,11,12,33,...,11.0,11.0,22.47,66.0,49.0,38.06,46.0,55.0,55.0,43.53


In [10]:
smse_det = np.sqrt((combined_data["det-difference"] ** 2).mean())
smse_pred = np.sqrt((combined_data["pred-difference"] ** 2).mean())
smse_med_pred = np.sqrt((combined_data["med-difference"] ** 2).mean())
smse_mode_pred = np.sqrt((combined_data["mode-difference"] ** 2).mean())
smse_mode2_pred = np.sqrt((combined_data["mode2-difference"] ** 2).mean())
smse_dp = np.sqrt((combined_data["dp-difference"] ** 2).mean())
f"{smse_det=}, {smse_pred=}, {smse_med_pred=}, {smse_mode_pred=}, {smse_mode2_pred=}, {smse_dp=}"

'smse_det=176.64536632684124, smse_pred=174.78554859800698, smse_med_pred=178.0343971984182, smse_mode_pred=181.248266292736, smse_mode2_pred=181.3982419767985, smse_dp=175.33069927024997'

In [11]:
mae_det = np.abs(combined_data["det-difference"]).mean()
mae_pred = np.abs(combined_data["pred-difference"]).mean()
mae_med_pred = np.abs(combined_data["med-difference"]).mean()
mae_mode_pred = np.abs(combined_data["mode-difference"]).mean()
mae_mode2_pred = np.abs(combined_data["mode-difference"]).mean()
mae_dp = np.abs(combined_data["dp-difference"]).mean()
f"{mae_det=}, {mae_pred=}, {mae_med_pred=}, {mae_mode_pred=}, {mae_mode2_pred=}, {mae_dp=}"

'mae_det=102.10116801437559, mae_pred=99.86578975741259, mae_med_pred=101.02920035938907, mae_mode_pred=102.8147433259564, mae_mode2_pred=102.8147433259564, mae_dp=100.54199820305476'

In [12]:
smae_det = (np.abs(combined_data["det-difference"])/((np.abs(combined_data["det_run"]) + np.abs(combined_data["RH-fix"])/2))).mean()
smae_pred = (np.abs(combined_data["pred-difference"])/((np.abs(combined_data["mean_pred"]) + np.abs(combined_data["RH-fix"])/2))).mean()
smae_med_pred = (np.abs(combined_data["med-difference"])/((np.abs(combined_data["median_pred"]) + np.abs(combined_data["RH-fix"])/2))).mean()
smae_mode_pred = (np.abs(combined_data["mode-difference"])/((np.abs(combined_data["mode_pred"]) + np.abs(combined_data["RH-fix"])/2))).mean()
smae_mode2_pred = (np.abs(combined_data["mode2-difference"])/((np.abs(combined_data["mode2_pred"]) + np.abs(combined_data["RH-fix"])/2))).mean()
smae_dp = (np.abs(combined_data["dp-difference"])/((np.abs(combined_data["mean_det_pred"]) + np.abs(combined_data["RH-fix"])/2))).mean()
f"{smae_det=}, {smae_pred=}, {smae_med_pred=}, {smae_mode_pred=}, {smae_mode_pred=}, {smae_dp=}"

'smae_det=1.3821680160678238, smae_pred=1.1578375640457983, smae_med_pred=1.3688147324048927, smae_mode_pred=1.6299604087030877, smae_mode_pred=1.6299604087030877, smae_dp=1.1934465043976903'

## Correlation deterministic and predicted

In [13]:
prediction_data[["mean_pred", "det_run", "median_pred", "mode_pred", "mode2_pred"]].corr()

Unnamed: 0,mean_pred,det_run,median_pred,mode_pred,mode2_pred
mean_pred,1.0,0.768792,0.966763,0.801431,0.792919
det_run,0.768792,1.0,0.772988,0.63434,0.630052
median_pred,0.966763,0.772988,1.0,0.865709,0.859313
mode_pred,0.801431,0.63434,0.865709,1.0,0.993714
mode2_pred,0.792919,0.630052,0.859313,0.993714,1.0
