# Build Machine Learning Dataset


## Setup

In [None]:
import os.path as osp
from datetime import datetime, timezone
from dateutil.relativedelta import relativedelta
import synoptic
import json
import sys
import numpy as np
import polars as pl
import pandas as pd
sys.path.append('../src')
from utils import Dict, read_yml, read_pkl, str2time, print_dict_summary, time_range, rename_dict
import ingest.retrieve_raws_api as rr
import ingest.retrieve_raws_stash as rrs
import ingest.retrieve_hrrr_api as ih

In [None]:
# Load the training-data configuration from JSON and wrap it in the project's
# Dict type; the cells below read fields from it with attribute access
# (e.g. config.start_time).
with open("../etc/training_data_config.json", "r") as json_file:
    config = Dict(json.load(json_file))
print_dict_summary(config)

In [None]:
# Read the data-parameter YAML file and print a summary of its contents.
# NOTE(review): presumably these are the data-filter hyperparameters described
# in the "Filter Data" section below -- confirm.
params_data = read_yml("../etc/params_data.yaml")
print_dict_summary(params_data)

In [None]:
# Path of the RAWS stash as recorded in the stash-retrieval module's metadata
raws_stash_path = rrs.raws_meta["raws_stash_path"]
print(raws_stash_path)
# Displays whether that path exists on the machine running the notebook
osp.exists(raws_stash_path)

## Retrieve Data

The data are stored in a nested dictionary: each top-level key corresponds to a RAWS station, with subkeys for the RAWS observations, atmospheric data (HRRR), geographic info, etc.

This format is used because the different FMC models used in this project require differently structured inputs. The physics-based ODE+KF model is run pointwise and does not incorporate information from other locations. The static ML models have the least restrictive input structure: all observations can be combined into a single set of tabular data. The RNN models require input data structured as (batch_size, timesteps, features). It is therefore simpler to keep the data for each location separate and recombine it in various ways at the modeling step. Data filters for suspect RAWS sensors are also applied in a later step, because raw data retrieval should not depend on hyperparameter choices related to data filtering; it is easier to collect everything first and apply filters afterwards.

In [None]:
raws_dict = rrs.build_raws_dict(config)

In [None]:
print_dict_summary(raws_dict)

In [None]:
hrrr_ds = ih.retrieve_hrrr(config)

In [None]:
# Reduce the HRRR dataset using the stations in raws_dict, then rename via the
# ingest helper. NOTE(review): presumably this extracts one HRRR point per RAWS
# station and renames variables to project conventions -- confirm against
# ingest.retrieve_hrrr_api.
hrrr_pts = ih.subset_hrrr2raws(hrrr_ds, raws_dict)
hrrr_pts = ih.rename_ds(hrrr_pts)

In [None]:
hrrr_pts

In [None]:
# Check that the HRRR points and the RAWS dictionary list the same STIDs in the
# same order. np.array_equal compares shapes first, so a differing number of
# stations yields False rather than a failed elementwise comparison.
np.array_equal(hrrr_pts.point_stid.to_numpy(), np.array([*raws_dict.keys()]))

In [None]:
# Attach the HRRR predictors for each station to that station's entry in raws_dict
for st in raws_dict:

    # Confirm times match. For HRRR data it should be the valid_time which accounts for forecast hour
    # raws_timesi = np.array([dt.replace(tzinfo=None) for dt in raws_dict[st]["times"]], dtype="datetime64")
    raws_timesi = raws_dict[st]["times"]
    assert np.all(raws_timesi == hrrr_pts.valid_time.to_numpy()), "Times in RAWS dict don't match HRRR data valid_time"

    # Extract dataframe of predictors, save in HRRR subdictionary
    df = hrrr_pts.where(hrrr_pts.point_stid == st, drop=True).to_dataframe()
    # Remove the 'point' index level in place; its values are discarded (drop=True)
    df.reset_index('point', drop=True, inplace=True)
    raws_dict[st]["HRRR"] = df

In [None]:
raws_dict["BRLW4"]["HRRR"]

In [None]:
print_dict_summary(raws_dict)

In [None]:
config

In [None]:
osp.join("data", config.training_data_filename)

In [None]:
config

## Filter Data

The file `etc/params_data.yaml` has hyperparameters related to filtering data....

## Setup CV

In [None]:
# Row filter used to restrict a dataframe to a set of time stamps
def filter_df(df, filter_col, ts):
    """Return the rows of `df` whose `filter_col` value appears in `ts`.

    Args:
        df: pandas DataFrame to filter.
        filter_col: Name of the column whose values are tested.
        ts: Collection of values to keep (passed to `Series.isin`).

    Returns:
        The matching rows of `df`, with their original index labels.
    """
    keep_mask = df[filter_col].isin(ts)
    return df[keep_mask]

In [None]:
import random
import reproducibility
import copy

def _restrict_to_times(subset, times):
    """Trim every station entry of `subset` to the time steps in `times`, in place."""
    for st in subset:
        subset[st]["times"] = times
        subset[st]["RAWS"] = filter_df(subset[st]["RAWS"], "date_time", times)
        subset[st]["HRRR"] = filter_df(subset[st]["HRRR"], "valid_time", times)


def train_test_split_spacetime(d0, start, end,
                               space_fracs=(1.0, 0.0, 0.0),
                               test_time_steps=2,
                               val_time_steps=2,
                               verbose=False, random_state=42):
    """
    Train/validation/test split, accounting for spatial and temporal dependence.

    Stations (top-level keys of `d0`) are shuffled and partitioned into three
    disjoint groups according to `space_fracs`. The time steps between `start`
    and `end` are partitioned chronologically: the last `test_time_steps` are
    the test period, the `val_time_steps` before those are the validation
    period, and everything earlier is the training period. Each station group
    is then restricted to its own period.

    Args:
        d0: Dictionary keyed by station ID. Each value must have a "RAWS"
            dataframe with a "date_time" column and an "HRRR" dataframe with a
            "valid_time" column. `d0` itself is not modified; the split is
            performed on a deep copy.
        start, end: Bounds of the full period, forwarded to `time_range`.
        space_fracs: Three fractions (train, validation, test) of stations,
            summing to 1.
        test_time_steps: Number of trailing time steps held out for testing.
        val_time_steps: Number of time steps, immediately before the test
            period, held out for validation.
        verbose: If True, print the periods and the number of stations per set.
        random_state: Seed passed to `reproducibility.set_seed`.

    Returns:
        Tuple `(train_dict, val_dict, test_dict)` of station dictionaries. In
        each, the "times" entry is replaced by that set's period and the "RAWS"
        and "HRRR" dataframes are filtered to it.

    Raises:
        ValueError: If `space_fracs` does not have length 3 or does not sum to
            1, if a hold-out step count is negative, or if the hold-out steps
            exceed the number of available time steps.
    """
    # Validate inputs before doing any expensive work
    if len(space_fracs) != 3:
        raise ValueError("Cross-validation params `space_fracs` must be list of length 3, representing (train/validation/test)")
    # Tolerant comparison: e.g. 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999
    if not np.isclose(np.sum(space_fracs), 1.0):
        raise ValueError(f"Provided cross validation param space_fracs don't sum to 1: {space_fracs}")
    if test_time_steps < 0 or val_time_steps < 0:
        raise ValueError("`test_time_steps` and `val_time_steps` must be non-negative")

    # Temporal setup
    times = time_range(start, end)
    ntimes = len(times)
    n_holdout = test_time_steps + val_time_steps
    if n_holdout > ntimes:
        # Negative slice bounds would otherwise make the periods overlap silently
        raise ValueError(
            f"Validation + test time steps ({n_holdout}) exceed the {ntimes} available time steps"
        )
    n_train_times = ntimes - n_holdout
    train_times = times[:n_train_times]
    val_times = times[n_train_times:(ntimes - test_time_steps)]
    test_times = times[(ntimes - test_time_steps):]
    if verbose:
        # len() rather than truthiness: `times` may be an array-like
        if len(train_times) > 0:
            print(f"Training period: ({train_times[0]}) to ({train_times[-1]})")
        if len(val_times) > 0:
            print(f"Validation period: ({val_times[0]}) to ({val_times[-1]})")
        if len(test_times) > 0:
            print(f"Test period: ({test_times[0]}) to ({test_times[-1]})")

    # Work on a copy so the caller's dictionary and dataframes stay untouched
    d = copy.deepcopy(d0)
    # NOTE(review): presumably seeds Python's `random`, which drives the shuffle
    # below -- confirm in the reproducibility module
    reproducibility.set_seed(random_state)

    # Spatial setup: train size rounds down, validation size rounds up, and the
    # test set receives whatever stations remain
    stids = [*d.keys()]
    n_locs = len(stids)
    train_size = int(n_locs * space_fracs[0])
    val_size = int(np.ceil(n_locs * space_fracs[1]))
    test_size = n_locs - train_size - val_size
    if verbose:
        print(f"Number of unique locations: {n_locs}")
        print(f"Number of training locs: {train_size}")
        print(f"Number of val locs: {val_size}")
        print(f"Number of test locs: {test_size}")

    # Spatial holdout
    random.shuffle(stids)
    train_locs = stids[:train_size]
    val_locs = stids[train_size:(train_size + val_size)]
    test_locs = stids[(train_size + val_size):]
    train_dict = {k: d[k] for k in train_locs}
    val_dict = {k: d[k] for k in val_locs}
    test_dict = {k: d[k] for k in test_locs}

    # Temporal holdout
    _restrict_to_times(train_dict, train_times)
    _restrict_to_times(val_dict, val_times)
    _restrict_to_times(test_dict, test_times)

    return train_dict, val_dict, test_dict

In [None]:
a, b, c = train_test_split_spacetime(raws_dict, config.start_time, config.end_time,
                                     space_fracs = [.8, .1, .1],
                                     verbose=True)

In [None]:
a.keys()

In [None]:
a["WPKS2"]["RAWS"]

In [None]:
a["WPKS2"]["HRRR"]

In [None]:
b.keys()

In [None]:
b["NMOS2"]["RAWS"]

In [None]:
b["NMOS2"]["HRRR"]

In [None]:
c.keys()

In [None]:
c["BRLW4"]["RAWS"]

In [None]:
c["BRLW4"]["HRRR"]

## Run ODE+KF

The physics-based ODE+KF model does not require any restructuring of the fmda dictionary built above. 

Intended use: run directly on stations identified as test cases

In [None]:
params = Dict(read_yml("../etc/params_models.yaml", subkey="ode"))
print_dict_summary(params)

In [None]:
config

In [None]:
class ODE_FMC:
    """Container for the covariance and observation matrices of the ODE+KF model.

    Attributes:
        Q: 2x2 diagonal array with `process_variance` on the diagonal.
        H: 1x2 observation matrix `[[1., 0.]]`.
        R: Length-1 array holding `data_variance`.
    """

    def __init__(self, params):
        """Validate `params` and build the Q, H and R arrays.

        Args:
            params: Mapping that must contain the keys 'spinup_hours',
                'process_variance', 'data_variance', 'r0', 'rs', 'Tr', 'S'
                and 'T'. Only the two variances are used here; they may be
                numbers or numeric strings (e.g. '1e-3').

        Raises:
            ValueError: If any required key is missing from `params`.
        """
        # List of required keys
        required_keys = ['spinup_hours',
                         'process_variance',
                         'data_variance',
                         'r0',
                         'rs',
                         'Tr',
                         'S',
                         'T']

        # Validate that all required keys are in params
        missing_keys = [key for key in required_keys if key not in params]
        if missing_keys:
            raise ValueError(f"Missing required keys in params: {missing_keys}")

        # Define params.
        # np.float64 instead of np.float_: the latter alias was removed in
        # NumPy 2.0. Both convert numeric strings as well as numbers.
        process_variance = np.float64(params['process_variance'])
        self.Q = np.array([[process_variance, 0.],
                           [0., process_variance]])
        self.H = np.array([[1., 0.]])  # observation matrix
        self.R = np.array([np.float64(params['data_variance'])])  # data variance

In [None]:
mod = ODE_FMC(params)

In [None]:
hours = 10
P = np.zeros((2,2,hours))
P.shape

In [None]:
P[:,:,0] = np.array([[1e-3, 0.],
                      [0.,  1e-3]])

In [None]:
P

In [None]:
P.shape

In [None]:
import importlib
import models.moisture_models
importlib.reload(models.moisture_models)
import models.moisture_models as mm

In [None]:
outputs_kf = {}
for st in raws_dict:
    print("~"*50)
    print(st)
    # # Run Augmented KF
    # print('Running Augmented KF')
    # train[case]['h2'] = test_ind
    # train[case]['hours'] =len(train[case]['y'])
    # train[case]['scale_fm'] = 1
    # m, Ec = run_augmented_kf(train[case])
    # y = train[case]['y']        
    # train[case]['m_kf'] = m
    # print(f"KF RMSE: {rmse(m[test_ind:],y[test_ind:])}")
    # outputs_kf[case] = {'case':case, 'errs': rmse(m[test_ind:],y[test_ind:])}    