# Investigate Modeling Errors

Manually investigate FMC data at all stations for the study area and time period. Identify stretches of data as valid or invalid, and remove the invalid stretches directly.

**Methodology:** 

* Split FMC data into 72-hour periods
* Plot FMC with corresponding equilibria calculated from HRRR for 5 periods at a time (for readability)
* Manually identify periods of data that are *clearly* invalid
* Build dataset with STID, time periods, and labels for valid/invalid
* Manually remove invalid data stretches from data for this analysis

**Future Work:** Use labeled dataset to train a classifier for automatic flagging of suspect data.

## Setup

In [None]:
import matplotlib.pyplot as plt
from datetime import datetime
import sys
import os.path as osp
import pandas as pd
import numpy as np
from scipy import stats
import itertools
import os.path as osp
import re
import copy
sys.path.append("../src")
from data_funcs import remove_invalid_data
from utils import Dict, read_yml, str2time, print_dict_summary, read_pkl, retrieve_url
from viz import plot_one

In [None]:
# Directory that ml_data.pkl is read from.
data_dir = "../outputs/report_materials"
# Manual validity labels; later cells read its stid, periods and valid columns.
df = pd.read_csv("../data/fmc_valid_checks_rocky24.csv")
# Dictionary keyed by station id; ml_data[stid]["data"] is used below as a
# table with st_period and date_time columns.
ml_data = read_pkl(osp.join(data_dir, "ml_data.pkl"))

## Plot FMC

In [None]:
# NOTE: plotting loop kept for reference but disabled (everything below is
# commented out). For each station it walks the periods in batches of
# n_periods, calls plot_one over the batch's time span, marks each period's
# first timestamp with a dotted vertical line labeled with the period number,
# and saves one PNG per batch under outpath.
# outpath = "../outputs/ts_plots"
# n_periods = 5
# for st in ml_data:
#     d = ml_data[st]['data']
#     max_period = d.st_period.max()
#     for start in range(0, max_period + 1, n_periods):
#         batch = list(range(start, min(start + n_periods, max_period + 1)))
#         ts = [d[d['st_period'].isin([bi])].date_time.min() for bi in batch]
#         t0 = d[d['st_period'].isin(batch)].date_time.min()
#         t1 = d[d['st_period'].isin(batch)].date_time.max()
#         print("~"*50)
#         print(f"Running batch for station {st}")
#         print(f"Start time: {t0}")
#         print(f"End time: {t1}")
#         out_file = osp.join(outpath, f"{st}_{batch[0]}_{batch[-1]}.png")
#         plot_one(ml_data, st, start_time = t0, end_time = t1, title2 = f"Periods {batch}", 
#                          save_path = None, show=True)
#         plot_periods = [(b, t) for b, t in zip(batch, ts) if not pd.isna(t)]
#         for b, t in plot_periods:
#             plt.axvline(x=t, color='black', linestyle='dotted')
#             plt.text(t, plt.ylim()[1], str(b), verticalalignment='top', horizontalalignment="right", color='black')  # Annotate
        
#         plt.savefig(out_file)  
#         plt.close()

## Build Labeled Dataset

Read in the file created during the manual check of the time series and convert it into a format usable for filtering the data.

* For periods labeled as valid/invalid, extract time start and time end for those periods from ml_data
* Build dataframe with columns `stid, start, end, valid`

*TODO:* make missing data appear as gaps in the plot, rather than being connected with a straight line as it is now

In [None]:
# Display the manual label table read from fmc_valid_checks_rocky24.csv.
df

In [None]:
# Restructure the manual labels in `df` (one row per station and period or
# period range) into `df_valid`: one row per labeled time interval with
# columns stid / start / end / valid. Interval bounds are the earliest and
# latest date_time found in ml_data[stid]["data"] for the listed periods,
# formatted as ISO-8601 strings.

# Matches a single period ("12") or an inclusive range ("0, 243").
pattern = r"^(\d+)(?:\s*,\s*(\d+))?$" # Use to extract period integers start_period, end_period e.g. (0, 243)
time_fmt = "%Y-%m-%dT%H:%M:%SZ"

# Collect plain records and build the frame once at the end; calling pd.concat
# inside the loop re-copies every accumulated row on each iteration.
rows = []
for i, (st, s, vi) in enumerate(zip(df.stid, df.periods, df.valid)):
    # str() keeps this working if the periods column was read as integers
    # (possible when no row contains a range); strip() tolerates padding.
    match = re.match(pattern, str(s).strip())
    if match is None:
        raise ValueError(
            f"Row {i} (station {st}): cannot parse periods {s!r}; "
            "expected 'N' or 'N, M'"
        )
    pstart, pend = match.groups()
    # Handle whether single period or range (the range end is inclusive)
    if pend is None:
        periods = [int(pstart)]
    else:
        periods = list(range(int(pstart), int(pend) + 1))

    d = ml_data[st]["data"]
    times = d[d.st_period.isin(periods)].date_time
    t0 = times.min() # start time of period range
    t1 = times.max() # end time for period range
    if pd.isna(t0) or pd.isna(t1):
        # min()/max() of an empty selection is missing and cannot be formatted;
        # fail with the offending row instead of an opaque strftime error.
        raise ValueError(
            f"Row {i} (station {st}): no observations found for periods {s!r}"
        )

    rows.append({
        'stid': st,
        'start': t0.strftime(time_fmt),
        'end': t1.strftime(time_fmt),
        'valid': vi,
    })

df_valid = pd.DataFrame(rows, columns=['stid', 'start', 'end', 'valid'])

In [None]:
# Display the restructured interval table (columns stid, start, end, valid).
df_valid

In [None]:
# Sanity check: the number of distinct station ids in df_valid must equal the
# number of stations in ml_data. Only the counts are compared, not the ids.
n_labeled_stations = df_valid.stid.unique().shape[0]
# Report the count itself; interpolating `.shape` would print a tuple like (12,).
assert n_labeled_stations == len(ml_data), f"Mismatch number of unique stations, {n_labeled_stations} in processed dataframe but {len(ml_data)} in input ml_data"

In [None]:
# Write output
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed first column — confirm readers of this file
# expect that.
df_valid.to_csv("../data/fmc_valid_rocky24.csv")

## Remove Invalid Data

Based on manual determination, modify input data dictionary to remove bad data.

In [None]:
# Apply the manual labels: remove_invalid_data (imported from data_funcs)
# receives the station dictionary and the interval table built above.
# NOTE(review): whether ml_data is modified in place or copied is not visible
# here; the cells below keep using ml_data, not ml_data2 — confirm intended.
ml_data2 = remove_invalid_data(ml_data, df_valid)

## Double-Check Big Errors

In [None]:
# Sort rows by the "RNN" column, largest first, so the worst row is on top.
# NOTE(review): `df` at this point is the table read from
# fmc_valid_checks_rocky24.csv; this cell requires it to have an "RNN" column
# (presumably an RNN error metric) — confirm the right table is loaded.
df2 = df.sort_values(by="RNN", ascending=False)
df2

In [None]:
# Index label of the top row of df2 (the largest "RNN" value after sorting).
t0 = df2.index[0]

# Take the scalar from the same top row; interpolating a boolean-masked Series
# would print its whole repr (index, name and dtype) inside the message.
print(f"Forecast Period {t0}, Resulting MSE: {df2.RNN.iloc[0]}")

In [None]:
# List the keys of ml_data (station ids; each entry is indexed with "data"
# and "times" elsewhere in this notebook).
ml_data.keys()

In [None]:
# Positional location(s) in df whose index label equals t0.
np.where(df.index == t0)

In [None]:
# fperiod_errs_55.pkl
# fperiod_errs_57.pkl

In [None]:
# Display t0 (set from df2.index[0], the index label of the largest-RNN row).
t0

## First bad error

In [None]:
# Configuration for the first case under investigation.
task_id = 57  # used below as the random seed
ft = str2time("2024-04-22T00:00:00Z")  # second argument of cv_data_wrap; presumably the forecast start time — confirm
TRAIN_HOURS = 8760  # 365 * 24
FORECAST_HOURS = 48

import data_funcs
import reproducibility

# Seed before splitting so the split can be repeated with the same task_id.
reproducibility.set_seed(task_id)
# cv_data_wrap returns three objects, bound here as train / val / test.
train, val, test = data_funcs.cv_data_wrap(ml_data, ft, train_hours=TRAIN_HOURS,forecast_hours=FORECAST_HOURS)

In [None]:
# ODE model class, plus the model parameter file whose 'ode' entry is used
# when the model is constructed below.
from models.moisture_ode import ODE_FMC
params_models = read_yml('../etc/params_models.yaml')

In [None]:
    # Run Models
    # ODE
    # NOTE(review): this whole cell is indented one level, which is not valid
    # at module level in plain Python; it depends on the notebook front end
    # accepting a uniformly indented cell.
    print('~'*75)
    params = params_models['ode']
    # Keys of the test split; each is later used as a key into ml_data.
    te_sts = [*test.keys()]
    # The first test station's times are used for every test station.
    test_times = test[te_sts[0]]["times"]
    ode_data = data_funcs.get_ode_data(ml_data, te_sts, test_times)
    ode = ODE_FMC(params=params)
    # NOTE(review): hours=72 and h2=24 are hard-coded and independent of
    # FORECAST_HOURS = 48 set in the setup cell — confirm this is intended.
    m, errs_ode = ode.run_model(ode_data, hours=72, h2=24)
    # errs_ode is indexed by 'loc_mse' in the following cells.
    print(f"ODE Test MSE: {errs_ode}")

In [None]:
# Largest value in the 'loc_mse' entry of the ODE error output.
errs_ode['loc_mse'].max()

In [None]:
# Position of the largest 'loc_mse' value.
errs_ode['loc_mse'].argmax()

In [None]:
# Number of 'loc_mse' entries (presumably one per test station — confirm).
len(errs_ode['loc_mse'])

In [None]:
# Map the argmax position back to a station id through te_sts.
# Assumes 'loc_mse' is ordered like te_sts — TODO confirm.
st = te_sts[errs_ode['loc_mse'].argmax()]
st

In [None]:
# Find where this station's time axis equals t0 interpreted as a timestamp.
# NOTE(review): t0 was set from df2.index[0]; if that index holds integer row
# labels, pd.Timestamp(t0) is read as nanoseconds since the epoch and nothing
# will match — confirm t0 is really a time.
np.where(ml_data[st]["times"] == pd.Timestamp(t0))

In [None]:
# Zoom in on a fixed positional window of the selected station and overlay the
# "fm" series (labeled FMC) with the "Ed" column on a shared time axis.
zoom = slice(7000, 8500)
zoom_times = ml_data[st]["times"][zoom]
zoom_table = ml_data[st]["data"]
hollow_markers = dict(ms=4, markerfacecolor='none')

for column, legend_name in (("fm", "FMC"), ("Ed", "Ed")):
    plt.plot(zoom_times, zoom_table[column][zoom], 'o--', label=legend_name, **hollow_markers)

plt.xticks(rotation=90)
plt.legend()

In [None]:
import synoptic

In [None]:
# Display the station id selected above.
st

In [None]:
# Build a `synoptic` TimeSeries request for this station's fuel_moisture
# variable between 2024-03-15 and 2024-06-01, in metric units.
ts = synoptic.TimeSeries(
    stid = st,
    start = str2time("2024-03-15T00:00:00Z"),
    end = str2time("2024-06-01T00:00:00Z"),
    vars = ["fuel_moisture"],
    units="metric"
)

In [None]:
# NOTE(review): this rebinds the notebook-global `df`, which earlier cells use
# for the manual label table; re-running those cells after this one will
# operate on the table returned by ts.df() instead.
df = ts.df()

In [None]:
# Display the table returned by ts.df().
df

In [None]:
# Plot the 'value' column against 'date_time' for the table from ts.df().
plt.plot(df['date_time'], df['value'])

## Second Bad Error

Same Station

In [None]:
# Configuration for the second case; same layout as the first case, with a
# different seed and a date four days earlier.
task_id = 55  # used below as the random seed
ft = str2time("2024-04-18T00:00:00Z")  # second argument of cv_data_wrap; presumably the forecast start time — confirm
TRAIN_HOURS = 8760  # 365 * 24
FORECAST_HOURS = 48

import data_funcs
import reproducibility

# Seed before splitting so the split can be repeated with the same task_id.
reproducibility.set_seed(task_id)
# cv_data_wrap returns three objects, bound here as train / val / test.
train, val, test = data_funcs.cv_data_wrap(ml_data, ft, train_hours=TRAIN_HOURS,forecast_hours=FORECAST_HOURS)

In [None]:
    # Run Models
    # ODE
    # NOTE(review): this whole cell is indented one level, which is not valid
    # at module level in plain Python; it depends on the notebook front end
    # accepting a uniformly indented cell. It also repeats the first-case ODE
    # cell line for line, and overwrites te_sts and errs_ode from that run.
    print('~'*75)
    params = params_models['ode']
    # Keys of the test split; each is later used as a key into ml_data.
    te_sts = [*test.keys()]
    # The first test station's times are used for every test station.
    test_times = test[te_sts[0]]["times"]
    ode_data = data_funcs.get_ode_data(ml_data, te_sts, test_times)
    ode = ODE_FMC(params=params)
    # NOTE(review): hours=72 and h2=24 are hard-coded and independent of
    # FORECAST_HOURS = 48 set in the setup cell — confirm this is intended.
    m, errs_ode = ode.run_model(ode_data, hours=72, h2=24)
    print(f"ODE Test MSE: {errs_ode}")

In [None]:
# Report the largest 'loc_mse' value and the entry of te_sts at the same
# position (assumes 'loc_mse' is ordered like te_sts — TODO confirm).
loc_mse = errs_ode['loc_mse']
worst_pos = loc_mse.argmax()
print(loc_mse.max())
st = te_sts[worst_pos]
print(st)

In [None]:
# Display the station id selected in the previous cell.
st

In [None]:
# Mean of 16 hard-coded values, including the single large value 1090.37.
# Presumably these were pasted from an errs_ode['loc_mse'] printout — confirm.
np.mean(np.array([  19.09864272,   57.10380271,   38.49150197,   15.23259169,
          4.94547573,   15.07289635,   12.06879773,   23.4615982 ,
         28.03123544, 1090.3702355 ,   38.98030805,   40.270528  ,
         15.62551401,   19.10600825,   29.41379745,   73.03861655]))

In [None]:
# Same hard-coded values as the previous cell with 1090.3702355 removed
# (15 values), to see the mean without that one entry.
np.mean(np.array([  19.09864272,   57.10380271,   38.49150197,   15.23259169,
          4.94547573,   15.07289635,   12.06879773,   23.4615982 ,
         28.03123544 ,   38.98030805,   40.270528  ,
         15.62551401,   19.10600825,   29.41379745,   73.03861655]))

In [None]:
# Square root of 28.7, which is the mean of the 15 values in the previous cell
# rounded to one decimal (i.e. an RMSE if those values are MSEs).
np.sqrt(28.7)

## Another bad station

In [None]:
# Select another station by id for visual inspection.
st = "C3ELK"

In [None]:
# Plot this station's full "fm" series against its time axis, using hollow
# markers joined by dashed lines.
plt.plot(ml_data[st]["times"], ml_data[st]["data"]["fm"], 'o--', ms=4, markerfacecolor='none')
plt.xticks(rotation=90)