# v2.1 exploration trying to make it work better

In [1]:
# Environment
import os
import os.path as osp
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import sys
# Local modules
sys.path.append('..')
import reproducibility
import pandas as pd
from utils import print_dict_summary
from data_funcs import rmse
from moisture_rnn import RNNParams, RNNData, RNN, RNN_LSTM
from moisture_rnn_pkl import pkl2train
from tensorflow.keras.callbacks import Callback
from utils import hash2
import copy
import logging
import pickle
from utils import logging_setup, read_yml, read_pkl, hash_ndarray, hash_weights
import yaml
import copy

In [2]:
logging_setup()

## Test Learning Schedule

In [None]:
train = read_pkl('train.pkl')
train.keys()

In [None]:
import importlib
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNN, RNNData

In [None]:
params = read_yml("params.yaml", subkey="rnn")
params = RNNParams(params)

In [None]:
rnn_dat = RNNData(train['PLFI1_202401'], scaler=params['scaler'], features_list = params['features_list'])

In [None]:
# Split along the time axis with fractions 0.9 / 0.05 / 0.05
# (presumably train / validation / test, in that order — TODO confirm against RNNData.train_test_split).
rnn_dat.train_test_split(
    time_fracs = [.9, .05, .05]
)
# Scale the data (presumably with the scaler passed to the RNNData constructor — TODO confirm).
rnn_dat.scale_data()
# Reshape into batches using the timesteps and batch size stored in params.
rnn_dat.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])

In [None]:
rnn_dat.print_hashes()

In [None]:
# Seed via the project helper before building the model so the run can be repeated.
reproducibility.set_seed()
# Constant batch schedule for this run ('bmin' presumably is the batch size it uses — TODO confirm).
params.update({'batch_schedule_type': 'constant', 'bmin': 20})
params.update({'verbose_fit': True})
# Short 10-epoch run; train_frac / val_frac match the first two time_fracs passed to
# train_test_split when rnn_dat was prepared above.
params.update({
    'train_frac': .9,
    'val_frac': .05,
    'activation': ['relu', 'relu'],
    'epochs': 10,
    'dense_units': 10,
    'rnn_layers': 2       
})
rnn = RNN(params)
# With return_epochs=True, run_model returns three values here; the other calls in this
# notebook, which omit that flag, unpack only (m, errs).
m, errs, best = rnn.run_model(rnn_dat, plot_period="predict", return_epochs=True)

In [None]:
from moisture_models import run_augmented_kf

# Baseline comparison: run the augmented Kalman filter on the same station that the
# RNN above was trained on.
print('Running Augmented KF')
# 'h2' is set to the start index of the test period and 'scale_fm' to 1 before the call
# (presumably h2 marks where the KF switches from training to forecasting — TODO confirm).
train["PLFI1_202401"]['h2'] = rnn_dat.test_ind
train["PLFI1_202401"]['scale_fm'] = 1
m_kf, Ec = run_augmented_kf(train["PLFI1_202401"])
y = rnn_dat['y']
# Store and score the KF output (m_kf). The previous code used `m`, which still holds
# the RNN predictions from the training cell, so the printed "KF RMSE" was not the KF's.
train["PLFI1_202401"]['m'] = m_kf
print(f"KF RMSE: {rmse(m_kf, y)}")

In [None]:
# Load the data-processing parameters and override 'hours' to 720
# (presumably the length in hours of each time series — TODO confirm).
data_params = read_yml("params_data.yaml")
data_params.update({
    'hours': 720
})
from data_funcs import process_train_dict
# Build the training dictionary from the pickle file. atm_dict="RAWS" is passed explicitly here;
# the later process_train_dict calls in this notebook omit it.
train2 = process_train_dict("data/fmda_nw_202401-05_f05.pkl", atm_dict="RAWS", data_params=data_params, verbose=True)

In [None]:
train2.keys()

In [None]:
import importlib
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNN, RNNData

In [None]:
rnn_dat2 = RNNData(train2['PLFI1_202401'], scaler=params['scaler'], features_list = params['features_list'])
# Split along the time axis with fractions 0.9 / 0.05 / 0.05
# (presumably train / validation / test — TODO confirm).
rnn_dat2.train_test_split(
    time_fracs = [.9, .05, .05]
)
rnn_dat2.scale_data()
# Unlike the earlier batch_reshape call for rnn_dat, this one passes start_times="zero"
# and silences the output with verbose=False.
rnn_dat2.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'],
                      start_times="zero", verbose=False)

In [None]:
rnn_dat2.X_val.shape

In [None]:
# Seed via the project helper before building the model.
reproducibility.set_seed()
# Switch to the 'exp' batch schedule with bmin 20 and bmax set to rnn_dat2.hours, and allow
# up to 500 epochs with an early-stopping patience of 20.
# NOTE(review): params is the same object updated by the earlier training cell, so keys set
# there and not overridden here (e.g. 'verbose_fit', 'train_frac', 'val_frac') carry over.
params.update({'batch_schedule_type': 'exp', 'bmin': 20, 'bmax': rnn_dat2.hours, 'epochs': 500,
              'early_stopping_patience': 20, 'rnn_layers':2, 'dense_layers': 2, 'dense_units': 50, 'activation': ['relu', 'relu']})
rnn = RNN(params)
# Rebinds m and errs, replacing the results of the earlier training run.
m, errs = rnn.run_model(rnn_dat2)

In [None]:
rnn.model_predict.summary()

In [None]:
# Rebuild rnn_dat2 for a different case (ESEPN_202401); this replaces the PLFI1_202401
# object that was previously bound to the same name.
rnn_dat2 = RNNData(train2['ESEPN_202401'], scaler=params['scaler'], features_list = params['features_list'])
# Same time split, scaling and batch reshaping steps as for the PLFI1 case
# (this batch_reshape call does not pass start_times or verbose).
rnn_dat2.train_test_split(
    time_fracs = [.9, .05, .05]
)
rnn_dat2.scale_data()
rnn_dat2.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])


In [None]:
# Same configuration as the first constant-schedule training cell in this notebook,
# applied to rnn_dat2 (the ESEPN_202401 case).
reproducibility.set_seed()
# Reset the batch schedule to 'constant'; this overrides the 'exp' schedule set by the previous run.
# NOTE(review): keys set by the previous run and not overridden here (e.g. 'bmax', 'dense_layers',
# 'early_stopping_patience') remain in params — confirm that is intended.
params.update({'batch_schedule_type': 'constant', 'bmin': 20})
params.update({'verbose_fit': True})
params.update({
    'train_frac': .9,
    'val_frac': .05,
    'activation': ['relu', 'relu'],
    'epochs': 10,
    'dense_units': 10,
    'rnn_layers': 2       
})
rnn2 = RNN(params)
# Results go to m2 / errs2, but `best` is rebound and overwrites the value from the first training cell.
m2, errs2, best = rnn2.run_model(rnn_dat2, plot_period="predict", return_epochs=True)

## Test Spatial Data

In [None]:
train = read_pkl('data/train.pkl')

In [None]:
params = read_yml("params.yaml", subkey="rnn")
params = RNNParams(params)

In [None]:
len(train.keys())

In [None]:
from itertools import islice
train2_cases = ['PLFI1_202401', 'SRFI1_202401', 'BGFO3_202401', 'CAFO3_202401', 'TPEO3_202401', 'CGFO3_202401', 'CWFO3_202401', 'HYFO3_202401', 'BBFO3_202401', 'KMRO3_202401', 'LPOW1_202401', 'TWRW1_202401', 'BHRO3_202401', 'EACPN_202401', 'NPJPN_202401', 'NPPPN_202401', 'FLZPN_202401', 'SPRPN_202401', 'HDRPN_202401', 'SKYPN_202401', 'ESCPN_202401', 'SSMPN_202401', 'BDRPN_202401', 'LARPN_202401', 'ESBPN_202401', 'SCHPN_202401', 'ESEPN_202401', 'MRTPN_202401', 'KELPN_202401', 'HOWPN_202401', 'FWRPN_202401', 'MTSPN_202401', 'GDNPN_202401', 'MDWPN_202401', 'KNRPN_202401', 'QRDPN_202401', 'PTVPN_202401', 'GOSPN_202401', 'ANDPN_202401', 'THMPN_202401', 'OXDPN_202401', 'GRUPN_202401', 'HILPN_202401', 'BLUPN_202401', 'LAMPN_202401', 'NWFPN_202401', 'SHDPN_202401', 'TNCPN_202401', 'RCKPN_202401', 'ZENPN_202401', 'SMVPN_202401', 'SWBPN_202401', 'BPLPN_202401', 'TT803_202401', 'SRRPN_202401', 'PCLPN_202401', 'BLFPN_202401', 'PWLPN_202401', 'PVRPN_202401']
# Keep only the listed cases that are present in `train`; names missing from `train`
# are skipped silently, so `dat` may hold fewer entries than train2_cases.
dat = {key: train[key] for key in train2_cases if key in train}

In [None]:
dat.keys()

In [None]:
from data_funcs import combine_nested
dd = combine_nested(dat)

In [None]:
import importlib
import utils
importlib.reload(utils)
from utils import Dict

In [None]:
dd = Dict(dd)

In [None]:
import importlib
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNNData

In [None]:
rnn_dat = RNNData(dd, scaler="standard", 
                  features_list = ['Ed', 'Ew', 'rain', 'elev', 'lon', 'lat'])

In [None]:
# Previous variant of the split, kept for reference (space_fracs of .9 / .05 / .05):
# rnn_dat.train_test_split(   
#     time_fracs = [.9, .05, .05],
#     space_fracs = [.9, .05, .05]
# )
# Current split: same time fractions, with space_fracs changed to .8 / .1 / .1
# (presumably train / validation / test shares of the locations — TODO confirm).
rnn_dat.train_test_split(   
    time_fracs = [.9, .05, .05],
    space_fracs = [.8, .1, .1]
)

In [None]:
rnn_dat.scale_data()

In [None]:
print(params['batch_size'])

In [None]:
rnn_dat.batch_reshape(
    timesteps = params['timesteps'], 
    batch_size = params['batch_size'],
    verbose=True
)

In [None]:
import importlib
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNN

In [None]:
# NOTE(review): ResetStatesCallback and EarlyStoppingCallback are imported but not referenced in this cell.
from moisture_rnn import ResetStatesCallback, EarlyStoppingCallback
# Configure the spatial run: 25 epochs, learning rate 1e-4, two recurrent layers of 20 units and
# one dense layer of 10 units. features_list is taken from rnn_dat so the model matches the data.
params.update({'epochs': 25, 'learning_rate': 0.0001, 'verbose_fit': False, 'rnn_layers': 2, 'rnn_units': 20, 'dense_layers': 1, 'dense_units': 10,
              'activation': ['relu', 'relu'],
              'features_list': rnn_dat.features_list})
# 'exp' batch schedule with bmin 20 and bmax set to rnn_dat.hours.
params.update({'batch_schedule_type': 'exp', 'bmin': 20, 'bmax': rnn_dat.hours})
# Explicit seed 123 (the single-station cells above call set_seed() with no argument).
reproducibility.set_seed(123)
rnn = RNN(params)
m, errs = rnn.run_model(rnn_dat)

In [None]:
errs.mean()

In [3]:
import importlib
import data_funcs
importlib.reload(data_funcs)
from data_funcs import process_train_dict

In [6]:
from data_funcs import process_train_dict
data_params = read_yml("params_data.yaml")
data_params.update({
    'hours': 168
})
train2 = process_train_dict("data/test_CA_202401.pkl", data_params=data_params, verbose=True)

2024-09-18 10:16:06,741 - INFO - CNFC1_202401 HRRR.time time array increments are 1.0 hours
2024-09-18 10:16:06,745 - INFO - CNFC1_202401 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-09-18 10:16:06,747 - INFO - CRVC1_202401 HRRR.time time array increments are 1.0 hours
2024-09-18 10:16:06,747 - INFO - CRVC1_202401 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-09-18 10:16:06,747 - INFO - FCHC1_202401 HRRR.time time array increments are 1.0 hours
2024-09-18 10:16:06,757 - INFO - FCHC1_202401 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-09-18 10:16:06,761 - INFO - FTNC1_202401 HRRR.time time array increments are 1.0 hours
2024-09-18 10:16:06,762 - INFO - FTNC1_202401 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-09-18 10:16:06,762 - INFO - HTRC1_202401 HRRR.time time array increments are 1.0 hours
2024-09-18 10:16:06,762 - INFO - HTRC1_202401 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-09-18 10:16:06,773 -

In [8]:
len(train2.keys())

97

In [9]:

data_params = read_yml("params_data.yaml")
data_params.update({
    'hours': 168
})
train3 = process_train_dict("data/test_NW_202401.pkl", data_params=data_params, verbose=True)

2024-09-18 10:18:22,784 - INFO - LGFO3_202401 HRRR.time time array increments are 1.0 hours
2024-09-18 10:18:22,784 - INFO - LGFO3_202401 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-09-18 10:18:22,797 - INFO - PYFO3_202401 HRRR.time time array increments are 1.0 hours
2024-09-18 10:18:22,798 - INFO - PYFO3_202401 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-09-18 10:18:22,802 - INFO - RXFO3_202401 HRRR.time time array increments are 1.0 hours
2024-09-18 10:18:22,802 - INFO - RXFO3_202401 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-09-18 10:18:22,809 - INFO - WMFO3_202401 HRRR.time time array increments are 1.0 hours
2024-09-18 10:18:22,812 - INFO - WMFO3_202401 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-09-18 10:18:22,815 - INFO - CMFW1_202401 HRRR.time time array increments are 1.0 hours
2024-09-18 10:18:22,817 - INFO - CMFW1_202401 RAWS.time_raws time array increments are 1.0 hours
2024-09-18 10:18:22,820 - INFO 

In [12]:
train_combined = {}

In [13]:
train_combined.update(train2)
train_combined.update(train3)

In [21]:
train_combined.keys()

dict_keys(['CNFC1_202401', 'CRVC1_202401', 'FCHC1_202401', 'FTNC1_202401', 'HTRC1_202401', 'KRNC1_202401', 'MPOC1_202401', 'MKEC1_202401', 'MCFC1_202401', 'PRGC1_202401', 'SHQC1_202401', 'WVTC1_202401', 'WWRC1_202401', 'KYCN2_202401', 'MTSN2_202401', 'BKRC1_202401', 'JSNC1_202401', 'MTQC1_202401', 'NFRC1_202401', 'OORC1_202401', 'UHLC1_202401', 'PHRC1_202401', 'WALC1_202401', 'BSNC1_202401', 'CGVC1_202401', 'DKYC1_202401', 'FNWC1_202401', 'PRHC1_202401', 'MMTC1_202401', 'TR172_202401', 'OVRC1_202401', 'RCEC1_202401', 'SHVC1_202401', 'TRMC1_202401', 'MIAC1_202401', 'HSQC1_202401', 'MOLC1_202401', 'DEMC1_202401', 'DWRN2_202401', 'JSDC1_202401', 'BPOC1_202401', 'GALN2_202401', 'YCGN2_202401', 'PIVC1_202401', 'TS566_202401', 'TSHC1_202401', 'PEPC1_202401', 'EPWC1_202401', 'WWNC1_202401', 'TMNC1_202401', 'DEXC1_202401', 'TWMC1_202401', 'WOCC1_202401', 'KNXN2_202401', 'SKYN2_202401', 'TT336_202401', 'MNMC1_202401', 'LIB03_202401', 'LIB05_202401', 'LIB06_202401', 'TT625_202401', 'NV001_202401

In [26]:
import importlib
import data_funcs
importlib.reload(data_funcs)
from data_funcs import process_train_dict

In [27]:
d = process_train_dict(["data/test_CA_202401.pkl", "data/test_NW_202401.pkl"], data_params=data_params)

2024-09-18 10:24:25,849 - INFO - CNFC1_202401 HRRR.time time array increments are 1.0 hours
2024-09-18 10:24:25,849 - INFO - CNFC1_202401 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-09-18 10:24:25,857 - INFO - CRVC1_202401 HRRR.time time array increments are 1.0 hours
2024-09-18 10:24:25,859 - INFO - CRVC1_202401 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-09-18 10:24:25,865 - INFO - FCHC1_202401 HRRR.time time array increments are 1.0 hours
2024-09-18 10:24:25,865 - INFO - FCHC1_202401 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-09-18 10:24:25,872 - INFO - FTNC1_202401 HRRR.time time array increments are 1.0 hours
2024-09-18 10:24:25,874 - INFO - FTNC1_202401 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-09-18 10:24:25,878 - INFO - HTRC1_202401 HRRR.time time array increments are 1.0 hours
2024-09-18 10:24:25,880 - INFO - HTRC1_202401 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-09-18 10:24:25,884 -

In [29]:
len(d.keys())

174

In [30]:
from moisture_rnn_pkl import pkl2train

In [31]:
d2 = pkl2train(["data/test_CA_202401.pkl", "data/test_NW_202401.pkl"])

2024-09-18 10:25:25,817 - INFO - Using data from step f01
2024-09-18 10:25:25,818 - INFO - Using rain as the difference of accumulated precipitation between f01 and f00
2024-09-18 10:25:25,819 - INFO - loading file data/test_CA_202401.pkl
2024-09-18 10:25:25,848 - INFO - Processing subdictionary CNFC1_202401
2024-09-18 10:25:25,851 - INFO - CNFC1_202401 HRRR.time time array increments are 1.0 hours
2024-09-18 10:25:25,852 - INFO - HRRR increment is 1.0 h
2024-09-18 10:25:25,855 - INFO - CNFC1_202401 rain as difference f01 minus f00: min 0.0 max 6.232508517866318
2024-09-18 10:25:25,856 - INFO - Created feature matrix train[CNFC1_202401]['X'] shape (168, 8)
2024-09-18 10:25:25,857 - INFO - CNFC1_202401 RAWS.time_raws length is 168
2024-09-18 10:25:25,864 - INFO - CNFC1_202401 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-09-18 10:25:25,865 - INFO - CNFC1_202401 RAWS.fm length is 168
2024-09-18 10:25:25,866 - INFO - Created target matrix train[CNFC1_202401]['y'] shape (16

In [32]:
len(d2.keys())

219

In [None]:
import importlib
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNNData

In [None]:
from itertools import islice
from data_funcs import combine_nested
from utils import Dict

params = read_yml("params.yaml", subkey="rnn")
params = RNNParams(params)

In [None]:
# Take at most the first 100 cases of train2 (in dictionary order), merge them into one
# nested structure and wrap the result in the project's Dict class.
dat = {case_key: train2[case_key] for case_key in islice(train2, 100)}
dd = Dict(combine_nested(dat))

# Build the RNN data object and split it in both time and space.
rnn_dat2 = RNNData(
    dd,
    scaler="standard",
    features_list=['Ed', 'Ew', 'rain', 'elev', 'lon', 'lat'],
)
rnn_dat2.train_test_split(time_fracs=[.9, .05, .05], space_fracs=[.8, .1, .1])
# NOTE(review): unlike the earlier data-preparation cells, scale_data() is not called
# between train_test_split and batch_reshape here — confirm whether that is intended.

# Fix the batch size at 32 and start every case at time index 0.
params.update({'batch_size': 32})
rnn_dat2.batch_reshape(
    timesteps=params['timesteps'],
    batch_size=params['batch_size'],
    start_times=np.zeros(len(rnn_dat2.case), dtype=int),
    verbose=False,
)

In [None]:
rnn_dat2.X_train.shape

In [None]:
rnn_dat2.X_val.shape

In [None]:
rnn_dat2.X_test[0].shape

In [None]:
# Configure the spatial run on rnn_dat2. The dict previously listed 'epochs' twice (25, then 100);
# Python keeps the last value, so 100 was the effective setting and the stale 25 has been removed.
params.update({'epochs': 100, 'learning_rate': 0.0001, 'verbose_fit': False, 'rnn_layers': 2, 'rnn_units': 20, 'dense_layers': 1, 'dense_units': 10,
              'activation': ['relu', 'relu'], 'early_stopping_patience': 20,
              'features_list': rnn_dat2.features_list})
# 'exp' batch schedule with bmin 20 and bmax set to rnn_dat2.hours.
params.update({'batch_schedule_type': 'exp', 'bmin': 20, 'bmax': rnn_dat2.hours})
reproducibility.set_seed(123)
rnn = RNN(params)
m, errs = rnn.run_model(rnn_dat2, plot_period='predict')

In [None]:
errs.mean()

In [None]:
from data_funcs import process_train_dict
data_params = read_yml("params_data.yaml")
data_params.update({
    'hours': 720
})
train = process_train_dict("data/fmda_nw_202401-05_f05.pkl", data_params=data_params, verbose=True)

In [None]:
import importlib
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNNData

In [None]:
# Take at most the first 100 cases of train (in dictionary order), merge them into one
# nested structure and wrap the result in the project's Dict class.
dat = {case_key: train[case_key] for case_key in islice(train, 100)}
dd = Dict(combine_nested(dat))

# Build the RNN data object and split it in both time and space.
rnn_dat = RNNData(
    dd,
    scaler="standard",
    features_list=['Ed', 'Ew', 'rain', 'elev', 'lon', 'lat'],
)
rnn_dat.train_test_split(time_fracs=[.9, .05, .05], space_fracs=[.8, .1, .1])
# NOTE(review): unlike the earlier data-preparation cells, scale_data() is not called
# between train_test_split and batch_reshape here — confirm whether that is intended.

# Fix the batch size at 32 and start every case at time index 0.
params.update({'batch_size': 32})
rnn_dat.batch_reshape(
    timesteps=params['timesteps'],
    batch_size=params['batch_size'],
    start_times=np.zeros(len(rnn_dat.case), dtype=int),
    verbose=False,
)

In [None]:
rnn_dat.X_train.shape

In [None]:
rnn_dat.X_val.shape

In [None]:
# Configure the run on the 720-hour spatial data: learning rate 1e-3, a single recurrent layer of
# 20 units, one dense layer of 10 units, up to 100 epochs with early-stopping patience 20.
# NOTE(review): 'activation' still lists two entries while 'rnn_layers' is 1 — confirm the
# second entry applies to the dense layer rather than to a second recurrent layer.
params.update({'learning_rate': 0.001, 'verbose_fit': True, 'rnn_layers': 1, 'rnn_units': 20, 'dense_layers': 1, 'dense_units': 10,
              'activation': ['relu', 'relu'], 'epochs': 100, 'early_stopping_patience': 20,
              'features_list': rnn_dat.features_list})
# 'exp' batch schedule with bmin 20 and bmax set to rnn_dat.hours.
params.update({'batch_schedule_type': 'exp', 'bmin': 20, 'bmax': rnn_dat.hours})
reproducibility.set_seed(123)
rnn = RNN(params)
m, errs = rnn.run_model(rnn_dat, plot_period='predict')

In [None]:
errs.mean()

In [None]:
from moisture_models import run_augmented_kf

In [None]:
# Augmented Kalman filter baseline, scored on the test period of every held-out test location.
test_cases = rnn_dat.loc['test_locs']
test_start = rnn_dat.test_ind  # first index of the test period, reused for every case
rmses = []
for case in test_cases:
    print("~"*50)
    print(case)
    ind = rnn_dat.case.index(case)
    # NOTE: this is the same dict object stored in `train`, so the keys set below persist there.
    kf_input = train[case]
    kf_input['hours'] = 720  # same value as the 'hours' override used when `train` was built
    kf_input['h2'] = test_start
    kf_input['scale_fm'] = 1
    # Use dedicated names so the RNN predictions in `m` (and the earlier `d`) are not overwritten.
    m_kf, Ec = run_augmented_kf(kf_input)
    y = rnn_dat['y'][ind]
    # Compute the test-period RMSE once and reuse it for both the printout and the list.
    case_rmse = rmse(m_kf[test_start:], y[test_start:])
    print(f"KF RMSE: {case_rmse}")
    rmses.append(case_rmse)

In [None]:
np.array(rmses).mean()

## LSTM

**TODO:** The LSTM cells below still need to be fixed.

In [None]:
import importlib 
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNN_LSTM

In [None]:
params = read_yml("params.yaml", subkey="lstm")
params = RNNParams(params)

In [None]:
# The callbacks used in the direct fit below are imported here rather than in the first import cell.
from moisture_rnn import ResetStatesCallback, EarlyStoppingCallback
# Override the LSTM parameters loaded from params.yaml for this experiment. features_list and
# bmax are taken from rnn_dat, which is built in an earlier cell of this notebook.
params.update({'epochs': 50, 'learning_rate': 0.00005, 'verbose_fit': True, 'rnn_layers': 2, 'rnn_units': 20, 'dense_layers': 1, 'dense_units': 10,
              'activation': ['relu', 'relu'], 'features_list': rnn_dat.features_list,
              'batch_schedule_type':'exp', 'bmin': 10, 'bmax':rnn_dat.hours})
reproducibility.set_seed(123)
lstm = RNN_LSTM(params)

# Fit lstm.model_train directly rather than calling run_model as the RNN cells do.
# NOTE(review): the heading above marks this section as still needing a fix. The batch schedule
# keys set in params ('batch_schedule_type', 'bmin', 'bmax') reach this fit only through
# ResetStatesCallback(params), while batch_size is also passed explicitly — confirm how they interact.
history = lstm.model_train.fit(rnn_dat.X_train, rnn_dat.y_train, 
                    batch_size = params['batch_size'], epochs=params['epochs'], 
                    callbacks = [ResetStatesCallback(params),
                                EarlyStoppingCallback(patience = 15)],
                   validation_data = (rnn_dat.X_val, rnn_dat.y_val))
              