# Build Machine Learning Dataset


## Setup

In [None]:
import os.path as osp
from datetime import datetime, timezone
from dateutil.relativedelta import relativedelta
import synoptic
import json
import sys
import numpy as np
import polars as pl
import pandas as pd
sys.path.append('../src')
from utils import Dict, read_yml, read_pkl, str2time, print_dict_summary, time_range, rename_dict
import ingest.retrieve_raws_api as rr
import ingest.retrieve_raws_stash as rrs
import ingest.retrieve_hrrr_api as ih
from data_funcs import retrieve_fmda_data, combine_fmda_files

In [None]:
# Read the training-data configuration and wrap it in the project's Dict
# (accessed attribute-style later in the notebook, e.g. config.start_time).
with open("../etc/training_data_config.json", "r") as json_file:
    config = Dict(json.load(json_file))
print_dict_summary(config)

In [None]:
# Load the data hyperparameters from the YAML file and print a summary.
# The "Filter Data" section below describes this file as holding the filtering hyperparameters.
params_data = read_yml("../etc/params_data.yaml")
print_dict_summary(params_data)

In [None]:
# Path of the RAWS stash, read from the metadata dict of the stash-retrieval module
raws_stash_path = rrs.raws_meta["raws_stash_path"]
print(raws_stash_path)
# Last expression of the cell: displays whether that path exists on this machine
osp.exists(raws_stash_path)

## Retrieve Data

The retrieved data are stored as a nested dictionary: each top-level key corresponds to a RAWS station, with subkeys for the RAWS observations, atmospheric data (HRRR), geographic info, etc.

This format is used because the different FMC models in this project require differently structured input data. The ODE+KF physics-based model is run pointwise and does not incorporate information from other locations. The static ML models have the least restrictive input structure: all observations can be combined into a single tabular dataset. The RNN models require input data structured as (batch_size, timesteps, features). Thus, it is simpler to keep the data for each location separate and recombine it in various ways at the modeling step. Data filters for suspect RAWS sensors are also applied in the next step rather than here: raw data retrieval should not depend on hyperparameter choices related to data filters, so it is easier to collect everything first and apply filters later.

In [None]:
# config.update({"end_time": "2023-06-03T23:00:00Z"})
# dat = retrieve_fmda_data(config, save_filename="test.pkl")

In [None]:
# config.update({
#     "start_time" : "2023-08-01T00:00:00Z",
#     "end_time" : "2023-08-01T04:00:00Z",
#     "bbox": [38, -107, 47, -98]
# })

# dat = retrieve_fmda_data(config, save_filename="test2.pkl")

In [None]:
# Pickle files to combine; the names match the save_filename values used in the
# commented-out retrieve_fmda_data calls above (presumably written under ../data — confirm)
paths = ["../data/test.pkl", "../data/test2.pkl"]

In [None]:
# Combine the retrieved files into a single dictionary; later cells index it by
# station ID (e.g. raws_dict["BRLW4"]) and read its "RAWS"/"HRRR" entries.
raws_dict = combine_fmda_files(paths)

## Filter Data

The file `etc/params_data.yaml` contains the hyperparameters related to filtering data.

## Setup CV

In [None]:
# Helper function to filter dataframe on time
def filter_df(df, filter_col, ts):
    """
    Return the rows of `df` whose `filter_col` value is one of `ts`.

    Parameters
    ----------
    df : dataframe
        Table to filter.
    filter_col : str
        Name of the column holding the values to test.
    ts : iterable
        Values to keep.
    """
    keep_mask = df[filter_col].isin(ts)
    return df[keep_mask]

In [None]:
import random
import reproducibility
import copy

def train_test_split_spacetime(d0, start, end,
                               space_fracs = [1.0, 0.0, 0.0],
                               test_time_steps = 2,
                               val_time_steps = 2,
                               verbose=False,random_state = 42):
    """
    Train/validation/test split, accounting for spatial and temporal dependence.

    Locations are randomly partitioned into three disjoint groups, and the
    time axis is cut into three consecutive periods (train, then validation,
    then test). Each group of locations keeps only the rows of its own
    period: training locations keep training times, validation locations
    keep validation times, and test locations keep test times.

    Parameters
    ----------
    d0 : dict
        Dictionary keyed by location ID. Each value must contain a "RAWS"
        dataframe with a "date_time" column and an "HRRR" dataframe with a
        "valid_time" column. The input is deep-copied and not modified.
    start, end :
        Bounds of the full period, passed directly to `time_range`.
    space_fracs : sequence of 3 floats
        Fractions of locations for (train, validation, test). Must sum to 1.
    test_time_steps : int
        Number of time steps at the end of the period reserved for testing.
    val_time_steps : int
        Number of time steps immediately before the test period reserved
        for validation.
    verbose : bool
        If True, print the periods and location counts.
    random_state : int
        Seed passed to `reproducibility.set_seed` before shuffling locations.

    Returns
    -------
    train_dict, val_dict, test_dict : dict
        Sub-dictionaries of the copied input, each with an added "times"
        entry and time-filtered "RAWS" and "HRRR" dataframes.

    Raises
    ------
    ValueError
        If `space_fracs` is not of length 3 or does not sum to 1, if a
        holdout size is negative, or if the holdout periods are longer than
        the available time range.
    """
    # Validate inputs before doing any expensive work
    if len(space_fracs) != 3:
        raise ValueError("Cross-validation params `space_fracs` must be list of length 3, representing (train/validation/test)")
    # Tolerant comparison: fractions such as 0.7 + 0.2 + 0.1 need not sum to exactly 1.0 in floating point
    if not np.isclose(np.sum(space_fracs), 1.0):
        raise ValueError("Provided cross validation param space_fracs don't sum to 1")
    if test_time_steps < 0 or val_time_steps < 0:
        raise ValueError("`test_time_steps` and `val_time_steps` must be non-negative")

    # Set up
    d = copy.deepcopy(d0)
    # NOTE(review): the shuffle below relies on set_seed seeding Python's `random` module — confirm
    reproducibility.set_seed(random_state)

    # Temporal setup
    times = time_range(start, end)
    ntimes = len(times)
    n_holdout = test_time_steps + val_time_steps
    if n_holdout > ntimes:
        # Without this check the negative slice bounds wrap around and the
        # training and test periods silently overlap.
        raise ValueError(
            f"Holdout periods ({n_holdout} time steps) exceed the {ntimes} "
            "time steps available between `start` and `end`"
        )
    val_start = ntimes - n_holdout
    test_start = ntimes - test_time_steps
    train_times = times[:val_start]
    val_times = times[val_start:test_start]
    test_times = times[test_start:]
    if verbose:
        if len(train_times) > 0:
            print(f"Training period: ({train_times[0]}) to ({train_times[-1]})")
        if len(val_times) > 0:
            print(f"Validation period: ({val_times[0]}) to ({val_times[-1]})")
        if len(test_times) > 0:
            print(f"Test period: ({test_times[0]}) to ({test_times[-1]})")

    # Spatial setup
    stids = [*d.keys()]
    n_locs = len(stids)
    train_size = int(n_locs * space_fracs[0])
    val_size = int(np.ceil(n_locs * space_fracs[1]))
    test_size = n_locs - train_size - val_size  # test set gets whatever remains
    if verbose:
        print(f"Number of unique locations: {n_locs}")
        print(f"Number of training locs: {train_size}")
        print(f"Number of val locs: {val_size}")
        print(f"Number of test locs: {test_size}")

    # Spatial holdout: shuffle, then cut into three consecutive groups
    random.shuffle(stids)
    train_locs = stids[:train_size]
    val_locs = stids[train_size:(train_size + val_size)]
    test_locs = stids[(train_size + val_size):]

    # Temporal holdout: each group keeps only the rows of its own period
    splits = []
    for locs, period in ((train_locs, train_times),
                         (val_locs, val_times),
                         (test_locs, test_times)):
        subset = {k: d[k] for k in locs}
        for st in subset:
            subset[st]["times"] = period
            subset[st]["RAWS"] = filter_df(subset[st]["RAWS"], "date_time", period)
            subset[st]["HRRR"] = filter_df(subset[st]["HRRR"], "valid_time", period)
        splits.append(subset)

    train_dict, val_dict, test_dict = splits
    return train_dict, val_dict, test_dict

In [None]:
# Split over the configured period: 80/10/10 of the stations go to train/validation/test.
# The time holdouts use the function defaults (2 validation and 2 test time steps).
train, val, test = train_test_split_spacetime(raws_dict, config.start_time, config.end_time,
                                     space_fracs = [.8, .1, .1],
                                     verbose=True)

In [None]:
train.keys()

In [None]:
train["WPKS2"]["RAWS"]

In [None]:
train["WPKS2"]["HRRR"]

In [None]:
val.keys()

In [None]:
val["TS897"]["RAWS"]

In [None]:
val["TS897"]["HRRR"]

In [None]:
test.keys()

In [None]:
test["SAWW4"]["RAWS"]

In [None]:
test["SAWW4"]["HRRR"]

## Run ODE+KF

The physics-based ODE+KF model does not require any restructuring of the fmda dictionary built above. 

Intended use: run directly on stations identified as test cases

In [None]:
import importlib
import models.moisture_models
importlib.reload(models.moisture_models)
import models.moisture_models as mm

In [None]:
ode = mm.ODE_FMC()

In [None]:
m, errs = ode.run_model(raws_dict, hours=6, h2=2)

In [None]:
m.shape

In [None]:
# Model parameters for the ODE+KF model, read from the model-parameter file
# (subkey="ode" presumably selects that model's section — confirm against read_yml)
params = Dict(read_yml("../etc/params_models.yaml", subkey="ode"))
print_dict_summary(params)

In [None]:
config

In [None]:
class ODE_FMC:
    """
    ODE + Kalman filter fuel moisture content (FMC) model.

    Wraps the extended Kalman filter (`mm.ext_kf`) and the augmented moisture
    model (`mm.model_augmented`) so that they can be run on a single location
    or on a whole dictionary of locations, and evaluated against observed FMC.
    """

    def __init__(self, params):
        """
        Parameters
        ----------
        params : dict-like
            Must contain the keys 'spinup_hours', 'process_variance',
            'data_variance', 'r0', 'rs', 'Tr', 'S' and 'T'.

        Raises
        ------
        ValueError
            If any required key is missing.
        """
        # List of required keys
        required_keys = ['spinup_hours',
                         'process_variance',
                         'data_variance',
                         'r0',
                         'rs',
                         'Tr',
                         'S',
                         'T']

        # Validate that all required keys are in params
        missing_keys = [key for key in required_keys if key not in params]
        if missing_keys:
            raise ValueError(f"Missing required keys in params: {missing_keys}")

        # Define params
        self.params = params
        # np.float64 replaces np.float_, which was removed in NumPy 2.0. The
        # explicit conversion also accepts values given as strings (e.g. "1e-3").
        process_variance = np.float64(params['process_variance'])
        self.Q = np.array([[process_variance, 0.],
                           [0., process_variance]])  # process noise covariance
        self.H = np.array([[1., 0.]]) # observation matrix
        self.R = np.array([np.float64(params['data_variance'])]) # data variance
        # Stored on the instance; not passed to the model functions within this class
        self.r0 = params["r0"]
        self.rs = params["rs"]
        self.Tr = params["Tr"]
        self.S = params["S"]
        self.T = params["T"]

    def run_model_single(self, dat, hours, h2, atm_source = "HRRR"):
        """
        Run ODE fuel moisture model on a single location.

        Parameters
        ----------
        dat : dict
            Data for one location. Must contain "RAWS" with an "fm" column
            and `atm_source` with "Ed", "Ew" and "rain" columns.
        hours : int
            Total hours to run model
        h2 : int
            Hour to turn off data assimilation and run in forecast mode.
            Must satisfy 1 <= h2 <= hours.
        atm_source: str
            Typically HRRR. Should be able to do RAWS as QC

        Returns
        -------
        u : ndarray
            State array of shape (2, hours).

        Raises
        ------
        ValueError
            If `h2` is outside [1, hours] or the input data cover fewer than
            `hours` time steps.
        """
        # h2 < 1 would make the forecast loop start at t=0 and read u[:, -1]
        if not 1 <= h2 <= hours:
            raise ValueError(f"h2 must satisfy 1 <= h2 <= hours, got h2={h2}, hours={hours}")

        Q = self.Q
        R = self.R
        H = self.H

        fm = dat["RAWS"]["fm"].to_numpy().astype(np.float64)
        Ed = dat[atm_source]["Ed"].to_numpy().astype(np.float64)
        Ew = dat[atm_source]["Ew"].to_numpy().astype(np.float64)
        rain = dat[atm_source]["rain"].to_numpy().astype(np.float64)

        n_atm = min(len(Ed), len(Ew), len(rain))
        if n_atm < hours or len(fm) < h2:
            raise ValueError(
                f"Not enough data to run {hours} hours with h2={h2}: "
                f"{n_atm} atmospheric and {len(fm)} FMC time steps available"
            )

        u = np.zeros((2,hours))
        u[:,0]=[0.1,0.0]       # initialize,background state
        P = np.zeros((2,2,hours))
        # Background state covariance: same values as Q, so reuse the
        # already-converted float matrix rather than the raw parameter.
        P[:,:,0] = Q

        # Run in spinup mode: assimilate the observed FMC at every step
        for t in range(1,h2):
            # use lambda construction to pass additional arguments to the model
            u[:,t],P[:,:,t] = mm.ext_kf(u[:,t-1],P[:,:,t-1],
                                    lambda uu: mm.model_augmented(uu,Ed[t],Ew[t],rain[t],t),
                                    Q,d=fm[t],H=H,R=R)

        # Run in forecast mode: no observations passed, process noise set to zero
        Q_forecast = Q*0.0
        for t in range(h2,hours):
            u[:,t],P[:,:,t] = mm.ext_kf(u[:,t-1],P[:,:,t-1],
                                      lambda uu: mm.model_augmented(uu,Ed[t],Ew[t],rain[t],t),
                                      Q_forecast)

        return u

    def run_dict(self, dict0, hours, h2, atm_source="HRRR"):
        """
        Run model defined in run_model_single on a dictionary and return 3d array

        Returns
        --------
        u : ndarray
            state vector 3d array of dims (n_locations, timesteps, 2), where 2 dim response is FMC, Ec
        """
        u = [
            # transpose to get dimension (timesteps, response_dim)
            self.run_model_single(dict0[st], hours=hours, h2=h2, atm_source=atm_source).T
            for st in dict0
        ]

        return np.stack(u, axis=0)

    def slice_fm_forecasts(self, u, h2):
        """
        Given output of run_dict, slice array to get only FMC at forecast hours
        """

        return u[:, h2:, 0:1] # Using 0:1 keeps the dimensions, if just 0 it will drop

    def eval(self, u, fm, h2):
        """
        Return RMSE of forecast u versus observed FMC

        Parameters
        ----------
        u : ndarray
            Forecast values.
        fm : ndarray
            Observed values, same shape as `u`.
        h2 : int
            Unused; kept for interface compatibility.

        Raises
        ------
        ValueError
            If the two arrays do not have the same shape.
        """
        if u.shape != fm.shape:
            raise ValueError("Arrays must have the same shape.")
        # Mean over every element; for equally sized outputs this equals the
        # uniformly averaged multi-output MSE, without needing sklearn in scope.
        rmse = np.sqrt(np.mean(np.square(u - fm)))

        return rmse

    def run_model(self, dict0, hours, h2, atm_source="HRRR"):
        """
        Put it all together: run the model on every location in `dict0` and
        score the forecast-period FMC against the observations.

        Returns
        -------
        m : ndarray
            FMC forecasts of shape (n_locations, hours - h2, 1).
        rmse : float
            RMSE of `m` versus the observed FMC over the same hours.
        """

        # Get array of response from the dictionary that was passed in,
        # restricted to the forecast window [h2, hours) so it lines up with m
        fm_arrays = [
            dict0[loc]["RAWS"]["fm"].to_numpy().astype(np.float64)[h2:hours, np.newaxis]
            for loc in dict0
        ]
        fm = np.stack(fm_arrays, axis=0)

        # Get forecasts
        preds = self.run_dict(dict0, hours=hours, h2=h2, atm_source=atm_source)
        m = self.slice_fm_forecasts(preds, h2 = h2)

        rmse = self.eval(m, fm, h2)

        return m, rmse

In [None]:
mod = ODE_FMC(params)

In [None]:
import models.moisture_models as mm
d = raws_dict["BRLW4"]
# run_model_single returns one (2, hours) state array; unpacking it splits the
# array into its two rows (FMC and Ec, per the run_dict docstring).
m, Ec = mod.run_model_single(d, hours = 6, h2 = 2)

In [None]:
m.shape

In [None]:
Ec.shape

In [None]:
us = mod.run_dict(raws_dict, hours=6, h2=2)

In [None]:
u = mod.slice_fm_forecasts(us, h2 = 2)

In [None]:
# Build the observed-FMC array by hand: for each station take the "fm" values from
# index h2 onward as a column, then stack the stations along a new leading axis.
h2 = 2
fm_arrays = [raws_dict[loc]["RAWS"]["fm"].values[h2:, np.newaxis] for loc in raws_dict]
x = np.stack(fm_arrays, axis=0)

In [None]:
x.shape

In [None]:
from sklearn.metrics import mean_squared_error



In [None]:
rmse

In [None]:
m, errs = mod.run_model(raws_dict, hours=6, h2=2)

In [None]:
m.shape

In [None]:
errs