# Investigate Modeling Errors

**Methodology:** manually investigate FMC data at stations and times where both the ODE and the RNN show substantial modeling errors. Because the ODE is a non-statistical method and does not use scaled data, a substantial error in both models is more likely to be caused by erroneous data than by a weak model.

Steps:
* Run the forecast analysis to produce the MSE for the ODE and the RNN
* Extract cases where the MSE lies outside 1 standard deviation (1 SD) for both the ODE and the RNN errors
* Manually label stretches of data for filtering
* Rerun the forecast analysis with the data labeled as erroneous removed

**Future Goal:** a couple of options for a better data filtering algorithm:

* use the labeled dataset to train an ML classifier that automatically filters suspect data based on manual checking, rather than on a series of ad hoc rules.
* automatically reject inputs where there are substantial errors for both the ODE and the RNN, then rerun and see whether it converges

In [None]:
import matplotlib.pyplot as plt
from datetime import datetime
import sys
import os.path as osp
import pandas as pd
import numpy as np
from scipy import stats
import itertools
import os.path as osp
sys.path.append("../src")
from utils import Dict, read_yml, str2time, print_dict_summary, read_pkl, retrieve_url

In [None]:
# Load the saved forecast-error tables and the pickled ML dataset from the
# report-materials output directory.
data_dir = "../outputs/report_materials"
errs_csv = osp.join(data_dir, "forecast_errs.csv")
rnn_csv = osp.join(data_dir, "rnn_loc_errors.csv")
ml_pkl = osp.join(data_dir, "ml_data.pkl")

# The first CSV column is used as the row index of each table.
df = pd.read_csv(errs_csv, index_col=0)
rnn = pd.read_csv(rnn_csv, index_col=0)
ml_data = read_pkl(ml_pkl)

In [None]:
# Rank the rows of the error table by the "RNN" column, largest first,
# and display the result.
df2 = df.sort_values(by="RNN", ascending=False)
df2

In [None]:
# Index labels of the two forecast periods with the largest RNN error
# (df2 is sorted by "RNN" in descending order, so they are the first two rows).
t0 = df2.index[0]
t1 = df2.index[1]

# Read the values positionally so each message shows a plain number rather
# than the repr of a one-row Series (index label, name and dtype included).
print(f"Forecast Period {t0}, Resulting MSE: {df2['RNN'].iloc[0]}")
print(f"Forecast Period {t1}, Resulting MSE: {df2['RNN'].iloc[1]}")

In [None]:
# Top-level keys of the loaded dataset (indexed by station ID `st` later on).
ml_data.keys()

In [None]:
# Row position(s) of the worst forecast period t0 in the unsorted table df.
np.where(df.index == t0)

In [None]:
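# Files to inspect (presumably the per-forecast-period error outputs); the
# numbers match the task_id values 55 and 57 used in the sections below.
# fperiod_errs_55.pkl
# fperiod_errs_57.pkl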

In [None]:
# Show the index label of the forecast period with the largest RNN error.
t0

## First bad error

In [None]:
# Rebuild the train/validation/test split for the first case under study.
import data_funcs
import reproducibility

TRAIN_HOURS = 8760  # 365 * 24
FORECAST_HOURS = 48
task_id = 57
ft = str2time("2024-04-22T00:00:00Z")

# Seed with the task id before building the split.
reproducibility.set_seed(task_id)
train, val, test = data_funcs.cv_data_wrap(
    ml_data,
    ft,
    train_hours=TRAIN_HOURS,
    forecast_hours=FORECAST_HOURS,
)

In [None]:
# Bring in the ODE model class and read the model parameter file; the 'ode'
# entry of params_models is what gets passed to ODE_FMC below.
from models.moisture_ode import ODE_FMC
params_models = read_yml('../etc/params_models.yaml')

In [None]:
    # Run the ODE model for the stations and times of the current test split
    print('~'*75)
    te_sts = list(test.keys())
    # The time axis is read from the first test station only.
    test_times = test[te_sts[0]]["times"]
    ode_data = data_funcs.get_ode_data(ml_data, te_sts, test_times)
    params = params_models['ode']
    ode = ODE_FMC(params=params)
    m, errs_ode = ode.run_model(ode_data, hours=72, h2=24)
    print(f"ODE Test MSE: {errs_ode}")

In [None]:
# Largest entry of the ODE 'loc_mse' error array.
errs_ode['loc_mse'].max()

In [None]:
# Position of the largest entry within 'loc_mse'.
errs_ode['loc_mse'].argmax()

In [None]:
# Number of entries in 'loc_mse'.
len(errs_ode['loc_mse'])

In [None]:
# Test station at the position of the largest ODE error
# (assumes 'loc_mse' is ordered like te_sts — TODO confirm).
st = te_sts[errs_ode['loc_mse'].argmax()]
st

In [None]:
# Position(s) of the worst forecast period t0 on this station's time axis.
np.where(ml_data[st]["times"] == pd.Timestamp(t0))

In [None]:
# Plot the "fm" (FMC) and "Ed" series of the selected station over a fixed
# index window of its record.
plot_window = slice(7000, 8500)
st_record = ml_data[st]
plot_times = st_record["times"][plot_window]
for col, lbl in (("fm", "FMC"), ("Ed", "Ed")):
    plt.plot(plot_times, st_record["data"][col][plot_window], 'o--', ms=4, markerfacecolor='none', label=lbl)
plt.xticks(rotation=90)
plt.legend()

In [None]:
import synoptic

In [None]:
# Station ID currently under investigation.
st

In [None]:
# Query the synoptic package for this station's "fuel_moisture" variable
# between 2024-03-15 and 2024-06-01, requesting metric units.
ts = synoptic.TimeSeries(
    stid = st,
    start = str2time("2024-03-15T00:00:00Z"),
    end = str2time("2024-06-01T00:00:00Z"),
    vars = ["fuel_moisture"],
    units="metric"
)

In [None]:
# Get the query result in tabular form.
# NOTE(review): this rebinds `df`, which earlier held the table read from
# forecast_errs.csv; earlier cells that use `df` will see the station data
# instead if they are re-run after this one.
df = ts.df()

In [None]:
# Inspect the retrieved station observations.
df

In [None]:
# Plot the retrieved 'value' column against 'date_time'.
plt.plot(df['date_time'], df['value'])

## Second Bad Error

Same Station

In [None]:
# Rebuild the train/validation/test split for the second case under study.
import data_funcs
import reproducibility

TRAIN_HOURS = 8760  # 365 * 24
FORECAST_HOURS = 48
task_id = 55
ft = str2time("2024-04-18T00:00:00Z")

# Seed with the task id before building the split.
reproducibility.set_seed(task_id)
train, val, test = data_funcs.cv_data_wrap(
    ml_data,
    ft,
    train_hours=TRAIN_HOURS,
    forecast_hours=FORECAST_HOURS,
)

In [None]:
    # Run the ODE model for the stations and times of the current test split
    print('~'*75)
    te_sts = list(test.keys())
    # The time axis is read from the first test station only.
    test_times = test[te_sts[0]]["times"]
    ode_data = data_funcs.get_ode_data(ml_data, te_sts, test_times)
    params = params_models['ode']
    ode = ODE_FMC(params=params)
    m, errs_ode = ode.run_model(ode_data, hours=72, h2=24)
    print(f"ODE Test MSE: {errs_ode}")

In [None]:
# Report the largest ODE 'loc_mse' value and the test station at that position.
loc_mse = errs_ode['loc_mse']
print(loc_mse.max())
st = te_sts[loc_mse.argmax()]
print(st)

In [None]:
# Station ID with the largest ODE error for this case.
st

In [None]:
# Mean of 16 hard-coded error values, including the 1090.3702355 outlier.
# NOTE(review): values look pasted from printed output — confirm their source.
np.mean(np.array([  19.09864272,   57.10380271,   38.49150197,   15.23259169,
          4.94547573,   15.07289635,   12.06879773,   23.4615982 ,
         28.03123544, 1090.3702355 ,   38.98030805,   40.270528  ,
         15.62551401,   19.10600825,   29.41379745,   73.03861655]))

In [None]:
# Mean of the same hard-coded values with the 1090.3702355 entry removed
# (15 values remain).
np.mean(np.array([  19.09864272,   57.10380271,   38.49150197,   15.23259169,
          4.94547573,   15.07289635,   12.06879773,   23.4615982 ,
         28.03123544 ,   38.98030805,   40.270528  ,
         15.62551401,   19.10600825,   29.41379745,   73.03861655]))

In [None]:
# Square root of 28.7 — presumably the rounded outlier-free mean, taken to
# express the MSE as an RMSE; confirm.
np.sqrt(28.7)

## Another bad station

In [None]:
# Switch to another station chosen for manual inspection.
st = "C3ELK"

In [None]:
# Plot the station's entire "fm" series against its time axis.
plt.plot(ml_data[st]["times"], ml_data[st]["data"]["fm"], 'o--', ms=4, markerfacecolor='none')
plt.xticks(rotation=90)