# v2.1 exploration: building the training dictionary from raw FMDA pickles and testing the moisture models (XGB, RF, LM, RNN, LSTM)

In [1]:
# Environment
import os
import os.path as osp
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import sys
# Local modules
sys.path.append('..')
import reproducibility
import pandas as pd
from utils import print_dict_summary
from data_funcs import rmse, process_train_dict
from moisture_rnn import RNNParams, RNNData, RNN, RNN_LSTM
from moisture_rnn_pkl import pkl2train
from tensorflow.keras.callbacks import Callback
from utils import hash2
import copy
import logging
import pickle
from utils import logging_setup, read_yml, read_pkl, hash_ndarray, hash_weights
import yaml
import copy

In [2]:
logging_setup()

## Test Data

In [3]:
# Fetch the test pickle into data/ under the same file name it has on the server.
# NOTE(review): the recorded output suggests retrieve_url skips the download when
# the destination file already exists — confirm in utils.retrieve_url.
filename="fmda_rocky_202403-05_f05.pkl"
from utils import retrieve_url
retrieve_url(
    url = f"https://demo.openwfm.org/web/data/fmda/dicts/{filename}", 
    dest_path = f"data/{filename}")

Target data already exists at data/fmda_rocky_202403-05_f05.pkl


In [4]:
input_file_path = f"data/{filename}"

In [5]:
import importlib
import data_funcs
importlib.reload(data_funcs)
from data_funcs import build_train_dict

In [None]:
train = build_train_dict('data/test_CA_202401.pkl', 
                         features_all=['Ed', 'Ew', 'solar', 'wind', 'elev', 'lon', 'lat', 'doy', 'hod', 'rain'])

In [None]:
train = build_train_dict('data/test_CA_202401.pkl', 
                         features_all=['Ed', 'Ew', 'solar', 'wind', 'elev', 'lon', 'lat', 'doy', 'hod', 'rain'])

In [6]:
dat = read_pkl('data/test_CA_202401.pkl')

loading file data/test_CA_202401.pkl


In [7]:
feature_types = {
    # Static features are based on physical location, e.g. location of RAWS site
    'static': ['elev', 'lon', 'lat'],
    # Atmospheric weather features come from either RAWS subdict or HRRR
    'atm': ['temp', 'rh', 'wind', 'solar', 'soilm', 'canopyw', 'groundflux', 'Ed', 'Ew'],
    # Features that require calculation. NOTE: rain only calculated in HRRR, not RAWS
    'engineered': ['doy', 'hod', 'rain']
}

In [None]:
subdict = dat['CNFC1_202401']

In [None]:
subdict['HRRR']['time']

In [15]:
from utils import str2time, check_increment

In [16]:
dat = read_pkl('data/test_CA_202401.pkl')

loading file data/test_CA_202401.pkl


In [17]:
key = 'CRVC1_202401'
subdict = dat[key]
subdict.keys()

dict_keys(['loc', 'RAWS', 'HRRR'])

In [18]:
def check_feat(feat, d):
    """Verify that ``feat`` is contained in ``d``.

    Parameters
    ----------
    feat : hashable
        Key (feature name) to look for.
    d : container
        Any object supporting the ``in`` operator, typically a dict.

    Raises
    ------
    ValueError
        If ``feat`` is not in ``d``.
    """
    if feat in d:
        return
    raise ValueError(f"Feature {feat} not found")

In [19]:
def get_time(d, atm="HRRR"):
    """Return the time vector stored under ``d[atm]['time']``, passed through ``str2time``.

    Parameters
    ----------
    d : dict
        Case dictionary containing the ``atm`` sub-dictionary.
    atm : str
        Key of the sub-dictionary to read the time from (default ``"HRRR"``).

    Raises
    ------
    ValueError
        If the sub-dictionary has no ``'time'`` entry (raised by ``check_feat``).
    """
    check_feat('time', d[atm])
    raw_time = d[atm]['time']
    return str2time(raw_time)

In [20]:
time = get_time(subdict)
hours = len(time)

In [21]:
# Check the spacing of the HRRR time vector; anything below 1 hour is rejected.
hrrr_increment = check_increment(time,id=key+f' {"HRRR"}.time')
if hrrr_increment < 1:
    # Raise with an explicit message so the failure is diagnosable from the traceback
    raise ValueError(f"HRRR increment is {hrrr_increment} h, must be at least 1 h")

2024-10-02 14:47:22,924 - INFO - CRVC1_202401 HRRR.time time array increments are 1.0 hours


In [22]:
feature_types['static']

['elev', 'lon', 'lat']

In [23]:
def get_static(d, hours):
    """Build constant-in-time columns for every static feature.

    Parameters
    ----------
    d : dict
        Case dictionary; static values are read from ``d['loc']``.
    hours : int
        Length of each returned column.

    Returns
    -------
    cols : list of numpy.ndarray
        One array per static feature, each of length ``hours`` and filled
        with that feature's value from ``d['loc']``.
    names : list of str
        Feature names, in the same order as ``cols``.

    Raises
    ------
    ValueError
        If any static feature is missing from ``d['loc']`` (via ``check_feat``).
    """
    # Copy so that callers extending the returned list cannot mutate the
    # shared feature_types configuration (get_hrrr_atm copies as well).
    names = list(feature_types['static'])
    cols = []
    # Use all static vars, don't allow for missing
    for feat in names:
        check_feat(feat, d['loc'])
        cols.append(np.full(hours, d['loc'][feat]))
    return cols, names

In [33]:
static_vars, static_names = get_static(subdict, hours)

In [34]:
len(static_vars)

3

In [35]:
cols = []
names = []

In [36]:
cols.append(time)
names.append('time')

In [37]:
cols.extend(static_vars)
names.extend(static_names)

In [38]:
len(cols)

4

In [39]:
len(names)

4

In [40]:
def get_hrrr_atm(d, fstep):
    """Collect every atmospheric feature from one HRRR forecast step.

    Parameters
    ----------
    d : dict
        Case dictionary; values are read from ``d["HRRR"][fstep]``.
    fstep : str
        Key of the forecast step inside ``d["HRRR"]``.

    Returns
    -------
    cols : list
        The stored value of each feature, unmodified, in ``names`` order.
    names : list of str
        A copy of ``feature_types['atm']``.

    Raises
    ------
    ValueError
        If a feature is absent from the forecast step (via ``check_feat``).
    """
    def _fetch(feat):
        # All names are required: fail loudly rather than skip a missing one
        check_feat(feat, d["HRRR"][fstep])
        return d["HRRR"][fstep][feat]

    names = feature_types['atm'].copy()
    cols = [_fetch(feat) for feat in names]
    return cols, names

In [41]:
forecast_step = 1

In [42]:
# Build the HRRR dictionary keys for the requested forecast step (fstep) and the
# step before it (fprev). Only whole numbers from 1 to 99 are accepted.
if forecast_step > 0 and forecast_step < 100 and forecast_step == int(forecast_step):
    # Convert to int before formatting: an integral float such as 1.0 passes the
    # check above but str(1.0).zfill(2) would yield the invalid key 'f1.0'.
    fstep='f'+str(int(forecast_step)).zfill(2)
    fprev='f'+str(int(forecast_step)-1).zfill(2)
    # logging.info('Using data from step %s',fstep)
    # logging.info('Using rain as the difference of accumulated precipitation between %s and %s',fstep,fprev)
else:
    # logging.critical('forecast_step must be integer between 1 and 99')
    raise ValueError('bad forecast_step')

In [45]:
atm_cols, atm_names = get_hrrr_atm(subdict, fstep)

In [46]:
len(atm_cols)

9

In [48]:
cols.extend(atm_cols)
names.extend(atm_names)

In [49]:
len(cols)

13

In [50]:
len(names)

13

In [51]:
def calc_time_features(time):
    """Derive day-of-year and hour-of-day columns from a time vector.

    Parameters
    ----------
    time : numpy.ndarray
        Array whose elements provide ``timetuple()`` (e.g. ``datetime``
        objects) and which can be cast to ``datetime64[h]``.

    Returns
    -------
    cols : list of numpy.ndarray
        ``[doy, hod]`` where ``doy`` is the zero-based day of the year and
        ``hod`` is the hour of the day (whole hours since epoch modulo 24).
    names : list of str
        ``['doy', 'hod']``.
    """
    # tm_yday is 1-based; shift so that January 1st maps to 0
    day_of_year = np.array([stamp.timetuple().tm_yday - 1 for stamp in time])
    # Truncate to whole hours, view as integer hours since epoch, wrap at 24
    hours_since_epoch = time.astype('datetime64[h]').astype(int)
    hour_of_day = hours_since_epoch % 24
    return [day_of_year, hour_of_day], ['doy', 'hod']

In [58]:
tvars, tnames = calc_time_features(time)

In [59]:
len(tvars)

2

In [60]:
len(tnames)

2

In [62]:
cols.extend(tvars)
names.extend(tnames)

In [63]:
len(cols)

15

In [64]:
len(names)

15

In [65]:
def calc_hrrr_rain(d, fstep):
    """Compute rain as the change in accumulated precipitation between forecast steps.

    The previous step key is derived from ``fstep`` itself (``'f03'`` ->
    ``'f02'``) rather than read from a notebook-level variable, so the result
    always matches the requested step.

    Parameters
    ----------
    d : dict
        Case dictionary; ``d["HRRR"][step]['precip_accum']`` must exist for
        both ``fstep`` and the step before it.
    fstep : str
        Forecast step key of the form ``'f'`` followed by a number of at
        least 1, e.g. ``'f01'``.

    Returns
    -------
    Difference ``precip_accum[fstep] - precip_accum[previous step]``.

    Raises
    ------
    ValueError
        If ``fstep`` has no numeric part or denotes step 0 (no previous step).
    KeyError
        If either step or ``'precip_accum'`` is missing from ``d["HRRR"]``.
    """
    step = int(fstep[1:])
    if step < 1:
        raise ValueError(f"fstep {fstep} has no previous forecast step")
    # Same zero-padded two-digit format used to build fstep
    fprev = 'f' + str(step - 1).zfill(2)
    hrrr = d["HRRR"]
    rain = hrrr[fstep]['precip_accum'] - hrrr[fprev]['precip_accum']
    return rain

In [66]:
rain = calc_hrrr_rain(subdict, fstep)

In [67]:
cols.append(rain)
names.append("rain")

In [68]:
len(cols)

16

In [69]:
len(names)

16

In [70]:
names

['time',
 'elev',
 'lon',
 'lat',
 'temp',
 'rh',
 'wind',
 'solar',
 'soilm',
 'canopyw',
 'groundflux',
 'Ed',
 'Ew',
 'doy',
 'hod',
 'rain']

In [72]:
X = np.column_stack(cols)
X.shape

(168, 16)

In [73]:
from utils import time_intp

In [74]:
feature_types['atm']

['temp', 'rh', 'wind', 'solar', 'soilm', 'canopyw', 'groundflux', 'Ed', 'Ew']

In [75]:
def get_raws_atm(d, time):
    """Collect the atmospheric features available in RAWS, interpolated onto ``time``.

    Unlike the HRRR path, features that are absent from ``d['RAWS']`` are
    skipped instead of raising. ``'rain'`` is looked up alongside the
    ``feature_types['atm']`` names.

    Parameters
    ----------
    d : dict
        Case dictionary with a ``'RAWS'`` sub-dictionary that holds
        ``'time_raws'`` and any subset of the features.
    time : sequence
        Target time vector passed to ``time_intp``.

    Returns
    -------
    cols : list
        Interpolated series, one per feature found.
    names : list of str
        Names of the features found, in the same order as ``cols``.
    """
    raws = d['RAWS']
    # RAWS has its own time axis, which may differ from the requested one
    time_raws = str2time(raws['time_raws'])

    cols, names = [], []
    wanted = feature_types['atm'] + ['rain']
    for feat in wanted:
        if feat not in raws:
            continue
        v = time_intp(time_raws, raws[feat], time)
        assert len(v)==len(time), f"RAWS feature {feat} not equal length to input time: {len(v)} vs {len(time)}"
        names.append(feat)
        cols.append(v)
    return cols, names

In [76]:
feature_types['atm']

['temp', 'rh', 'wind', 'solar', 'soilm', 'canopyw', 'groundflux', 'Ed', 'Ew']

In [77]:
feature_types['atm']+['rain']

['temp',
 'rh',
 'wind',
 'solar',
 'soilm',
 'canopyw',
 'groundflux',
 'Ed',
 'Ew',
 'rain']

In [86]:
cols2, names2 = get_raws_atm(subdict, time)

In [87]:
len(cols2)

7

In [88]:
names2

['temp', 'rh', 'wind', 'solar', 'Ed', 'Ew', 'rain']

In [90]:
def build_single_case(subdict, atm ="HRRR", forecast_step=1):
    """Assemble the feature matrix for one case.

    Columns are ordered as: time, derived time features, static features,
    atmospheric features (with rain last for HRRR).

    Parameters
    ----------
    subdict : dict
        Case dictionary with ``'loc'``, ``'RAWS'`` and ``'HRRR'`` entries.
    atm : str
        Source of atmospheric features, ``"HRRR"`` or ``"RAWS"``. The time
        vector is always taken from ``get_time`` with its default source;
        RAWS features are interpolated onto it.
    forecast_step : int
        HRRR forecast step to use, 1 to 99 (default 1). Only used when
        ``atm == "HRRR"``. Previously the step came from a notebook-level
        ``fstep`` variable, which made the function fail on a fresh kernel.

    Returns
    -------
    X : numpy.ndarray
        ``np.column_stack`` of all columns.
    names : list of str
        Column names matching the columns of ``X``.

    Raises
    ------
    ValueError
        If ``atm`` is not recognized, ``forecast_step`` is not a whole number
        in 1..99, or a required feature is missing.
    """
    # Get Time variable
    time = get_time(subdict)
    # Calculate derived time variables
    tvars, tnames = calc_time_features(time)
    # Get Static Features, extends to hours
    static_vars, static_names = get_static(subdict, hours = len(time))
    # Get atmospheric variables based on data source. HRRR requires rain calculation
    if atm == "HRRR":
        if not (0 < forecast_step < 100 and forecast_step == int(forecast_step)):
            raise ValueError('forecast_step must be integer between 1 and 99')
        step = int(forecast_step)
        fstep = 'f' + str(step).zfill(2)
        fprev = 'f' + str(step - 1).zfill(2)
        atm_vars, atm_names = get_hrrr_atm(subdict, fstep)
        # Rain is the increase in accumulated precipitation since the previous
        # forecast step; computed here from the keys derived above so the two
        # steps are guaranteed to be consecutive.
        rain = subdict["HRRR"][fstep]['precip_accum'] - subdict["HRRR"][fprev]['precip_accum']
        atm_vars.append(rain)
        atm_names.append("rain")
    elif atm == "RAWS":
        atm_vars, atm_names = get_raws_atm(subdict, time)
    else:
        raise ValueError(f"Unrecognized atmospheric data source: {atm}")
    # Put everything together and stack
    cols = [time] + tvars + static_vars + atm_vars
    X = np.column_stack(cols)
    names = ['time'] + tnames + static_names + atm_names

    return X, names

In [91]:
subdict.keys()

dict_keys(['loc', 'RAWS', 'HRRR'])

In [110]:
X, ns = build_single_case(subdict, atm="RAWS")

In [111]:
X.shape

(168, 13)

In [112]:
len(ns)

13

In [113]:
ns[0:20]

['time',
 'doy',
 'hod',
 'elev',
 'lon',
 'lat',
 'temp',
 'rh',
 'wind',
 'solar',
 'Ed',
 'Ew',
 'rain']

In [114]:
time.shape

(168,)

In [115]:
len(static_vars)

3

In [116]:
len(atm_cols)

9

In [117]:
len([time] + static_vars + atm_cols)

13

In [118]:
def get_fm(d, time):
    """Return the RAWS ``'fm'`` series interpolated onto ``time``.

    Parameters
    ----------
    d : dict
        Case dictionary; reads ``d['RAWS']['fm']`` and ``d['RAWS']['time_raws']``.
    time : sequence
        Target time vector passed to ``time_intp``.
    """
    raws = d['RAWS']
    observed = raws['fm']
    # RAWS observations carry their own time axis
    observed_time = str2time(raws['time_raws'])
    return time_intp(observed_time, observed, time)

In [119]:
get_fm(subdict, time)

array([12.5       , 12.48833333, 12.42333333, 12.62333333, 12.84666667,
       13.24666667, 13.69333333, 14.48166667, 15.18166667, 15.88166667,
       16.54666667, 16.92333333, 17.17      , 17.74666667, 18.135     ,
       18.42916667, 18.67916667, 18.97      , 19.68666667, 20.52833333,
       15.99      , 14.02666667, 11.235     , 11.45333333, 11.00666667,
       10.23      ,  9.72333333,  9.99333333, 10.72333333, 10.95833333,
       11.47      , 12.035     , 12.35833333, 12.85833333, 13.37      ,
       13.94666667, 14.34666667, 14.68833333, 14.72833333, 15.72333333,
       15.935     , 16.38666667, 17.60166667, 15.90166667, 14.225     ,
       12.74833333, 11.53      , 10.94166667, 10.465     , 10.14166667,
        9.77      , 10.3       , 10.32333333, 10.535     , 10.8       ,
       10.82333333, 11.05833333, 11.59333333, 12.41666667, 13.41666667,
       14.405     , 15.27      , 15.84666667, 16.165     , 16.04      ,
       17.15833333, 17.53      , 16.81333333, 15.24833333, 14.06

In [120]:
from data_funcs import combine_nested
from utils import Dict

In [121]:
def build_train_dict(input_file_paths, spatial=True, atm="HRRR", forecast_step=1, verbose=True):
    """Build a training dictionary from one or more pickled case files.

    Parameters
    ----------
    input_file_paths : str, os.PathLike, or iterable of these
        File(s) read with ``read_pkl``. A single path is accepted and treated
        as a one-element list.
    spatial : bool
        If True, the per-case dictionaries are merged with ``combine_nested``.
    atm : str
        Atmospheric data source forwarded to ``build_single_case``.
    forecast_step : int
        NOTE(review): accepted but not forwarded to ``build_single_case``
        here; the forecast step actually used is whatever that function
        resolves on its own. Wire it through once its signature allows.
    verbose : bool
        Print a banner per processed case.

    Returns
    -------
    Dict
        ``Dict`` wrapping either the per-case mapping or its combined form.

    Raises
    ------
    ValueError
        If a case's HRRR time increment is below 1 hour.
    """
    # A bare string would otherwise be iterated character by character
    if isinstance(input_file_paths, (str, os.PathLike)):
        input_file_paths = [input_file_paths]

    new_dict = {}
    for input_file_path in input_file_paths:
        dict0 = read_pkl(input_file_path)
        for key in dict0:
            if verbose:
                print("~"*50)
                print(f"Processing case {key}")
            case = dict0[key]
            # Validate the time axis before doing the work of building features
            time = str2time(case['HRRR']['time'])
            hrrr_increment = check_increment(time,id=key+f' {"HRRR"}.time')
            if hrrr_increment < 1:
                raise ValueError(
                    f"{key}: HRRR increment is {hrrr_increment} h, must be at least 1 h"
                )
            X, names = build_single_case(case, atm=atm)
            new_dict[key] = {
                'id': key,
                'case': key,
                'filename': input_file_path,
                'loc': case['loc'],
                'time': time,
                'X': X,
                'y': get_fm(case, time),
                'features_list': names
            }
    if spatial:
        new_dict = combine_nested(new_dict)

    return Dict(new_dict)


In [122]:
train_test = build_train_dict(["data/test_CA_202401.pkl", "data/test_NW_202401.pkl"])

loading file data/test_CA_202401.pkl
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Processing case CNFC1_202401
2024-10-02 14:50:31,030 - INFO - CNFC1_202401 HRRR.time time array increments are 1.0 hours
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Processing case CRVC1_202401
2024-10-02 14:50:31,038 - INFO - CRVC1_202401 HRRR.time time array increments are 1.0 hours
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Processing case FCHC1_202401
2024-10-02 14:50:31,046 - INFO - FCHC1_202401 HRRR.time time array increments are 1.0 hours
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Processing case FTNC1_202401
2024-10-02 14:50:31,054 - INFO - FTNC1_202401 HRRR.time time array increments are 1.0 hours
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Processing case HTRC1_202401
2024-10-02 14:50:31,061 - INFO - HTRC1_202401 HRRR.time time array increments are 1.0 hours
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Processing case KRNC1_202401
2024-10-02 14:50:31,069

In [123]:
train_test.keys()

dict_keys(['id', 'case', 'filename', 'time', 'X', 'y', 'loc', 'features_list'])

In [125]:
train_test.X[1].shape

(168, 16)

## Test Other ML

In [None]:
params = read_yml("params.yaml", subkey='xgb')
params

In [None]:
dat = read_pkl("data/train.pkl")

In [None]:
cases = [*dat.keys()]

In [None]:
rnn_dat = RNNData(dat[cases[10]], params['scaler'], params['features_list'])
rnn_dat.train_test_split(
    time_fracs = [.8, .1, .1]
)
rnn_dat.scale_data()

In [None]:
from moisture_models import XGB, RF, LM

In [None]:
mod = XGB(params)

In [None]:
mod.params

In [None]:
mod.fit(rnn_dat.X_train, rnn_dat.y_train)

In [None]:
preds = mod.predict(rnn_dat.X_test)

In [None]:
rmse(preds, rnn_dat.y_test)

In [None]:
plt.plot(rnn_dat.y_test)
plt.plot(preds)

In [None]:
params = read_yml("params.yaml", subkey='rf')
rnn_dat = RNNData(dat[cases[10]], features_list = ['Ed', 'Ew', 'solar', 'wind', 'rain'])
rnn_dat.train_test_split(
    time_fracs = [.8, .1, .1]
)

In [None]:
import importlib
import moisture_models
importlib.reload(moisture_models)

In [None]:
params

In [None]:
mod2 = RF(params)
mod2.fit(rnn_dat.X_train, rnn_dat.y_train.flatten())
preds2 = mod2.predict(rnn_dat.X_test)
print(rmse(preds2, rnn_dat.y_test.flatten()))
plt.plot(rnn_dat.y_test)
plt.plot(preds2)

In [None]:
from moisture_models import RF
mod2 = RF(params)

In [None]:
params = read_yml("params.yaml", subkey='lm')
rnn_dat = RNNData(dat[cases[10]], features_list = ['Ed', 'Ew', 'solar', 'wind', 'rain'])
rnn_dat.train_test_split(
    time_fracs = [.8, .1, .1]
)
mod = LM(params)

In [None]:
# Fit the linear model and report its test RMSE.
mod.fit(rnn_dat.X_train, rnn_dat.y_train)
preds = mod.predict(rnn_dat.X_test)
# Score the linear model's own predictions (the cell previously scored preds2,
# the random-forest output). Both sides are flattened so the shapes agree
# regardless of whether predict returns a 1-D or column array.
print(rmse(np.ravel(preds), rnn_dat.y_test.flatten()))

## RNN

## Phys Initialized

In [None]:
# Baseline RNN settings without physics initialization.
# NOTE(review): `params` is not reloaded in this section, so on a top-to-bottom
# run this updates the dictionary read with subkey='lm' in the previous section,
# and keys such as 'timesteps', 'batch_size', 'features_list' are read from it
# below — confirm that an RNN parameter load is not missing here.
params.update({
    'epochs':100,
    'dense_layers': 0,
    'activation': ['relu', 'relu'],
    'phys_initialize': False,
    'dropout': [0,0]
})

In [None]:
reproducibility.set_seed()
rnn = RNN(params)
m, errs = rnn.run_model(rnn_dat)

In [None]:
rnn.model_train.summary()

In [None]:
params.update({
    'phys_initialize': True,
    'scaler': None, # TODO
    'dense_layers': 0, # NOT including single Dense output layer which is hard-coded
    'activation': ['linear', 'linear'], # TODO tanh, relu the same
    'batch_schedule_type': None # Hopefully this isn't a necessity like before, but maybe it will help
})

In [None]:
rnn_dat2 = RNNData(dat[cases[10]], params['scaler'], params['features_list'])
rnn_dat2.train_test_split(
    time_fracs = [.8, .1, .1]
)
rnn_dat2.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])

In [None]:
import importlib
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNN, RNNData

In [None]:
reproducibility.set_seed()

rnn = RNN(params)

In [None]:
m, errs = rnn.run_model(rnn_dat2)

In [None]:
rnn.model_predict.get_weights()

In [None]:
params['rnn_units']

In [None]:
params.update({
    'phys_initialize': True,
    'scaler': None, # TODO
    'dense_layers': 0, # NOT including single Dense output layer which is hard-coded
    'activation': ['relu', 'relu'], # TODO tanh, relu the same
    'batch_schedule_type': None # Hopefully this isn't a necessity like before, but maybe it will help
})

In [None]:
rnn_dat2 = RNNData(dat[cases[10]], params['scaler'], params['features_list'])
rnn_dat2.train_test_split(
    time_fracs = [.8, .1, .1]
)
rnn_dat2.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])

In [None]:
reproducibility.set_seed()

rnn = RNN(params)
m, errs = rnn.run_model(rnn_dat2)

## LSTM

TODO: FIX BELOW

In [None]:
import importlib 
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNN_LSTM

In [None]:
params = read_yml("params.yaml", subkey="lstm")
params = RNNParams(params)

In [None]:
rnn_dat = RNNData(dat[cases[10]], params['scaler'], params['features_list'])
rnn_dat.train_test_split(
    time_fracs = [.8, .1, .1]
)
rnn_dat.scale_data()
rnn_dat.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])

In [None]:
from moisture_rnn import ResetStatesCallback, EarlyStoppingCallback
params.update({'epochs': 50, 'learning_rate': 0.001, 'verbose_fit': True, 'rnn_layers': 2, 'rnn_units': 20, 'dense_layers': 1, 'dense_units': 10,
              'activation': ['tanh', 'tanh'], 'features_list': rnn_dat.features_list,
              'batch_schedule_type':'exp', 'bmin': 10, 'bmax':rnn_dat.hours})
reproducibility.set_seed(123)
lstm = RNN_LSTM(params)

history = lstm.model_train.fit(rnn_dat.X_train, rnn_dat.y_train, 
                    batch_size = params['batch_size'], epochs=params['epochs'], 
                    callbacks = [ResetStatesCallback(params),
                                EarlyStoppingCallback(patience = 15)],
                   validation_data = (rnn_dat.X_val, rnn_dat.y_val))
              

In [None]:
params.update({'epochs': 50, 'learning_rate': 0.001, 'verbose_fit': True, 'rnn_layers': 2, 'rnn_units': 20, 'dense_layers': 1, 'dense_units': 10,
              'activation': ['tanh', 'tanh'], 'features_list': rnn_dat.features_list,
              'batch_schedule_type':'exp', 'bmin': 10, 'bmax':rnn_dat.hours,
              'early_stopping_patience': 25})
reproducibility.set_seed(123)
lstm = RNN_LSTM(params)
m, errs = lstm.run_model(rnn_dat)

In [None]:
rnn_dat.spatial

In [None]:
params = RNNParams(read_yml("params.yaml", subkey='lstm'))
params

In [None]:
train = read_pkl("data/train.pkl")

In [None]:
# Keep only the first 100 cases, in the dictionary's iteration order.
# NOTE(review): presumably done to keep the spatial run small — this rebinds
# `train`, so the full dictionary must be re-read to recover the other cases.
from itertools import islice
train = {k: train[k] for k in islice(train, 100)}

In [None]:
from data_funcs import combine_nested
rnn_dat_sp = RNNData(
    combine_nested(train), # input dictionary
    scaler="standard",  # data scaling type
    features_list = params['features_list'] # features for predicting outcome
)


rnn_dat_sp.train_test_split(   
    time_fracs = [.8, .1, .1], # Percent of total time steps used for train/val/test
    space_fracs = [.8, .1, .1] # Percent of total timeseries used for train/val/test
)
rnn_dat_sp.scale_data()

rnn_dat_sp.batch_reshape(
    timesteps = params['timesteps'], # Timesteps aka sequence length for RNN input data. 
    batch_size = params['batch_size'] # Number of samples of length timesteps for a single round of grad. descent
)

In [None]:
params.update({
    'loc_batch_reset': rnn_dat_sp.n_seqs # Used to reset hidden state when location changes for a given batch
})

In [None]:
rnn_sp = RNN_LSTM(params)
m_sp, errs = rnn_sp.run_model(rnn_dat_sp)

In [None]:
errs.mean()