# v2.1 exploration trying to make it work better

In [1]:
# Environment
import os
import os.path as osp
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import sys
# Local modules
sys.path.append('..')
import reproducibility
import pandas as pd
from utils import print_dict_summary
from data_funcs import rmse, build_train_dict
from moisture_rnn import RNNParams, RNNData, RNN, RNN_LSTM
from moisture_rnn_pkl import pkl2train
from tensorflow.keras.callbacks import Callback
from utils import hash2
import copy
import logging
import pickle
from utils import logging_setup, read_yml, read_pkl, hash_ndarray, hash_weights, str2time
import yaml
import copy

In [2]:
logging_setup()

## Test Data

In [3]:
file_paths = ['data/fmda_nw_202401-05_f05.pkl']

In [4]:
# Params used for data filtering
params_data = read_yml("params_data.yaml") 
params_data

{'max_intp_time': 10,
 'zero_lag_threshold': 10,
 'hours': 720,
 'min_fm': 1,
 'max_fm': 90,
 'min_rain': 0,
 'max_rain': 100,
 'min_wind': 0,
 'max_wind': 35,
 'min_solar': 0,
 'max_solar': 1400,
 'min_soilm': 0,
 'features_all': ['Ed',
  'Ew',
  'solar',
  'wind',
  'elev',
  'lon',
  'lat',
  'soilm',
  'canopyw',
  'groundflux',
  'rain']}

In [5]:
params = read_yml("params.yaml", subkey='rnn') 
params = RNNParams(params)
params.update({'epochs': 200, 
               'learning_rate': 0.001,
               'activation': ['tanh', 'tanh'], # Activation for RNN Layers, Dense layers respectively.
               'recurrent_layers': 2, 'recurrent_units': 30, 
               'dense_layers': 2, 'dense_units': 30,
               'early_stopping_patience': 30, # how many epochs of no validation accuracy gain to wait before stopping
               'batch_schedule_type': 'exp', # Hidden state batch reset schedule
               'bmin': 20, # Lower bound of hidden state batch reset, 
               'bmax': params_data['hours'], # Upper bound of hidden state batch reset, using max hours
               'features_list': ['Ed', 'Ew', 'rain', 'elev', 'lon', 'lat', 'solar', 'wind'],
               'timesteps': 12
              })

Checking params...
Input dictionary passed all checks.
Calculating shape params based on features list, timesteps, and batch size
Input Feature List: ['Ed', 'Ew', 'rain']
Input Timesteps: 12
Input Batch Size: 32
Calculated params:
Number of features: 3
Batch Shape: (32, 12, 3)
{'batch_size': 32, 'timesteps': 12, 'optimizer': 'adam', 'rnn_layers': 1, 'rnn_units': 20, 'dense_layers': 1, 'dense_units': 5, 'activation': ['tanh', 'tanh'], 'dropout': [0.2, 0.2], 'recurrent_dropout': 0.2, 'reset_states': True, 'batch_schedule_type': 'exp', 'bmin': 20, 'bmax': 200, 'epochs': 20, 'learning_rate': 0.001, 'clipvalue': 10.0, 'phys_initialize': False, 'stateful': True, 'verbose_weights': True, 'verbose_fit': False, 'features_list': ['Ed', 'Ew', 'rain'], 'scale': True, 'scaler': 'standard', 'time_fracs': [0.9, 0.05, 0.05], 'early_stopping_patience': 5, 'predict_spinup_hours': 5, 'n_features': 3, 'batch_shape': (32, 12, 3)}
Calculating shape params based on features list, timesteps, and batch size
In

In [None]:
dat = read_pkl(file_paths[0])

In [None]:
import importlib
import data_funcs
importlib.reload(data_funcs)
from data_funcs import build_train_dict

In [6]:
params_data.update({'hours': 3648})

In [44]:
# train = build_train_dict(file_paths, params_data, spatial=False, forecast_step=0)
# train1 = build_train_dict(file_paths, params_data, spatial=False, forecast_step=1, drop_na=False)
# train2 = build_train_dict(file_paths, params_data, spatial=False, forecast_step=2, drop_na=False)
# train3 = build_train_dict(file_paths, params_data, spatial=False, forecast_step=3, drop_na=False)
trainr = build_train_dict(file_paths, params_data, spatial=False, atm_source = "RAWS", drop_na=False)

Atmospheric data source is RAWS, so forecast_step is not used
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Extracting data from input file data/fmda_nw_202401-05_f05.pkl
loading file data/fmda_nw_202401-05_f05.pkl
2024-10-13 11:29:20,411 - INFO - PLFI1_202401 HRRR.time time array increments are 1.0 hours
2024-10-13 11:29:20,540 - INFO - PLFI1 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-10-13 11:29:21,052 - INFO - SADI1_202401 HRRR.time time array increments are 1.0 hours
2024-10-13 11:29:21,169 - INFO - SADI1 RAWS.time_raws time array increments are min 1.0 max 3.0
2024-10-13 11:29:21,689 - INFO - SRFI1_202401 HRRR.time time array increments are 1.0 hours
2024-10-13 11:29:21,806 - INFO - SRFI1 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-10-13 11:29:22,284 - INFO - WEFI1_202401 HRRR.time time array increments are 1.0 hours
2024-10-13 11:29:22,452 - INFO - WEFI1 RAWS.time_raws time array increments are min 1.0 max 2.0
202

In [None]:
train['PLFI1_202401'].keys()

In [None]:
case = "PLFI1_202401"

In [None]:
plt.plot(train[case]['X'][90:110, -1], label = "f00")
plt.plot(train1[case]['X'][90:110, -1], label = "f01")
plt.plot(train2[case]['X'][90:110, -1], label = "f02")
plt.plot(train3[case]['X'][90:110, -1], label = "f03")
plt.plot(trainr[case]['X'][90:110, -1], label = "RAWS")
plt.axvline(13, color = 'k', linestyle='dashed')
plt.legend()

In [None]:
train3[case]['X'][0:5, 5]

In [8]:
def rnn_data_wrap(dict0, params):
    rnn_dat = RNNData(
        dict0, # input dictionary
        scaler="standard",  # data scaling type
        features_list = params['features_list'] # features for predicting outcome
    )
    
    
    rnn_dat.train_test_split(   
        time_fracs = params['time_fracs'], # Percent of total time steps used for train/val/test
        space_fracs = params['space_fracs'] # Percent of total timeseries used for train/val/test
    )
    rnn_dat.scale_data()
    
    rnn_dat.batch_reshape(
        timesteps = params['timesteps'], # Timesteps aka sequence length for RNN input data. 
        batch_size = params['batch_size'], # Number of samples of length timesteps for a single round of grad. descent
        start_times = np.zeros(len(rnn_dat.loc['train_locs']))
    )    
    
    return rnn_dat

In [45]:
params.update({'features_list': ['Ed', 'Ew', 'rain']})

Calculating shape params based on features list, timesteps, and batch size
Input Feature List: ['Ed', 'Ew', 'rain']
Input Timesteps: 12
Input Batch Size: 32
Calculated params:
Number of features: 3
Batch Shape: (32, 12, 3)
{'batch_size': 32, 'timesteps': 12, 'optimizer': 'adam', 'rnn_layers': 1, 'rnn_units': 20, 'dense_layers': 2, 'dense_units': 30, 'activation': ['tanh', 'tanh'], 'dropout': [0.2, 0.2], 'recurrent_dropout': 0.2, 'reset_states': True, 'batch_schedule_type': 'exp', 'bmin': 20, 'bmax': 720, 'epochs': 200, 'learning_rate': 0.001, 'clipvalue': 10.0, 'phys_initialize': False, 'stateful': True, 'verbose_weights': True, 'verbose_fit': False, 'features_list': ['Ed', 'Ew', 'rain'], 'scale': True, 'scaler': 'standard', 'time_fracs': [0.9, 0.05, 0.05], 'early_stopping_patience': 30, 'predict_spinup_hours': 5, 'n_features': 3, 'batch_shape': (32, 12, 3), 'recurrent_layers': 2, 'recurrent_units': 30}


In [46]:
len(trainr.keys())

59

In [47]:
trainr['PLFI1_202401'].keys()

dict_keys(['time', 'X', 'y', 'id', 'case', 'filename', 'loc', 'features_list', 'atm_source', 'forecast_step', 'hours'])

In [48]:
trainr = subset_by_features(trainr, params['features_list'])

Subsetting to cases with features: ['Ed', 'Ew', 'rain']
Removing EACPN_202401 due to missing features
Removing NPJPN_202401 due to missing features
Removing NPPPN_202401 due to missing features
Removing FLZPN_202401 due to missing features
Removing SPRPN_202401 due to missing features
Removing HDRPN_202401 due to missing features
Removing SKYPN_202401 due to missing features
Removing ESCPN_202401 due to missing features
Removing SSMPN_202401 due to missing features
Removing BDRPN_202401 due to missing features
Removing LARPN_202401 due to missing features
Removing ESBPN_202401 due to missing features
Removing SCHPN_202401 due to missing features
Removing ESEPN_202401 due to missing features
Removing MRTPN_202401 due to missing features
Removing KELPN_202401 due to missing features
Removing HOWPN_202401 due to missing features
Removing FWRPN_202401 due to missing features
Removing MTSPN_202401 due to missing features
Removing GDNPN_202401 due to missing features
Removing MDWPN_202401 du

In [49]:
len(trainr.keys())

14

In [50]:
import importlib
import data_funcs
importlib.reload(data_funcs)
from data_funcs import combine_nested, subset_by_features

In [51]:
d = combine_nested(trainr)

In [52]:
d['features_list']

['doy',
 'hod',
 'elev',
 'lon',
 'lat',
 'temp',
 'rh',
 'wind',
 'solar',
 'Ed',
 'Ew',
 'rain']

In [53]:
params['features_list']

['Ed', 'Ew', 'rain']

In [55]:
rnn_dat = RNNData(
    d, # input dictionary
    scaler="standard",  # data scaling type
    features_list = ['Ed', 'rain'] # features for predicting outcome
)


rnn_dat.train_test_split(   
    time_fracs = [.8, .1, .1], # Percent of total time steps used for train/val/test
    space_fracs = [.8, .1, .1] # Percent of total timeseries used for train/val/test
)
rnn_dat.scale_data()

rnn_dat.batch_reshape(
    timesteps = params['timesteps'], # Timesteps aka sequence length for RNN input data. 
    batch_size = params['batch_size'], # Number of samples of length timesteps for a single round of grad. descent
    start_times = np.zeros(len(rnn_dat.loc['train_locs']))
)    

Input data from multiple timeseries.
Setting data scaler: standard
Setting features_list to ['Ed', 'rain']. 
  NOTE: not subsetting features yet. That happens in train_test_split.
Subsetting input data to features_list: ['Ed', 'rain']
Train index: 0 to 2918
Validation index: 2918 to 3283
Test index: 3283 to 3648
Subsetting locations into train/val/test
Total Locations: 14
Train Locations: 11
Val. Locations: 1
Test Locations: 2
X_train[0] shape: (2918, 2), y_train[0] shape: (2918, 1)
X_val[0] shape: (365, 2), y_val[0] shape: (365, 1)
X_test[0] shape: (365, 2), y_test[0] shape: (365, 1)
Scaling training data with scaler StandardScaler(), fitting on X_train
Reshaping spatial training data using batch size: 32 and timesteps: 12
Setting total hours to minimum length of y in provided dictionary
Reshaping validation data using batch size: 32 and timesteps: 12
Setting total hours to minimum length of y in provided dictionary
Reshaping test data by stacking. Output dimension will be (n_locs, te

## Test Other ML

In [None]:
params = read_yml("params.yaml", subkey='xgb')
params

In [None]:
dat = read_pkl("data/train.pkl")

In [None]:
cases = [*dat.keys()]

In [None]:
rnn_dat = RNNData(dat[cases[10]], params['scaler'], params['features_list'])
rnn_dat.train_test_split(
    time_fracs = [.8, .1, .1]
)
rnn_dat.scale_data()

In [None]:
from moisture_models import XGB, RF, LM

In [None]:
mod = XGB(params)

In [None]:
mod.params

In [None]:
mod.fit(rnn_dat.X_train, rnn_dat.y_train)

In [None]:
preds = mod.predict(rnn_dat.X_test)

In [None]:
rmse(preds, rnn_dat.y_test)

In [None]:
plt.plot(rnn_dat.y_test)
plt.plot(preds)

In [None]:
params = read_yml("params.yaml", subkey='rf')
rnn_dat = RNNData(dat[cases[10]], features_list = ['Ed', 'Ew', 'solar', 'wind', 'rain'])
rnn_dat.train_test_split(
    time_fracs = [.8, .1, .1]
)

In [None]:
import importlib
import moisture_models
importlib.reload(moisture_models)

In [None]:
params

In [None]:
mod2 = RF(params)
mod2.fit(rnn_dat.X_train, rnn_dat.y_train.flatten())
preds2 = mod2.predict(rnn_dat.X_test)
print(rmse(preds2, rnn_dat.y_test.flatten()))
plt.plot(rnn_dat.y_test)
plt.plot(preds2)

In [None]:
from moisture_models import RF
mod2 = RF(params)

In [None]:
params = read_yml("params.yaml", subkey='lm')
rnn_dat = RNNData(dat[cases[10]], features_list = ['Ed', 'Ew', 'solar', 'wind', 'rain'])
rnn_dat.train_test_split(
    time_fracs = [.8, .1, .1]
)
mod = LM(params)

In [None]:
mod.fit(rnn_dat.X_train, rnn_dat.y_train)
preds = mod.predict(rnn_dat.X_test)
print(rmse(preds2, rnn_dat.y_test.flatten()))

## RNN

## Phys Initialized

In [None]:
def rnn_data_wrap(dict0, params):
    rnn_dat = RNNData(
        dict0, # input dictionary
        scaler="standard",  # data scaling type
        features_list = params['features_list'] # features for predicting outcome
    )
    
    
    rnn_dat.train_test_split(   
        time_fracs = params['time_fracs'], # Percent of total time steps used for train/val/test
        space_fracs = params['space_fracs'] # Percent of total timeseries used for train/val/test
    )
    rnn_dat.scale_data()
    
    rnn_dat.batch_reshape(
        timesteps = params['timesteps'], # Timesteps aka sequence length for RNN input data. 
        batch_size = params['batch_size'], # Number of samples of length timesteps for a single round of grad. descent
        start_times = np.zeros(len(rnn_dat.loc['train_locs']))
    )    
    
    return rnn_dat

In [None]:
params.update({
    'epochs':100,
    'dense_layers': 0,
    'activation': ['relu', 'relu'],
    'phys_initialize': False,
    'dropout': [0,0],
    'space_fracs': [.8, .1, .1]
})

In [None]:
rnn_dat = rnn_data_wrap(combine_nested(train3), params)

In [None]:
reproducibility.set_seed()
rnn = RNN(params)
m, errs = rnn.run_model(rnn_dat)

In [None]:
rnn.model_train.summary()

In [None]:
params.update({
    'phys_initialize': True,
    'scaler': None, # TODO
    'dense_layers': 0, # NOT including single Dense output layer which is hard-coded
    'activation': ['linear', 'linear'], # TODO tanh, relu the same
    'batch_schedule_type': None # Hopefully this isn't a necessity like before, but maybe it will help
})

In [None]:
rnn_dat2 = rnn_data_wrap(combine_nested(train3), params)
rnn_dat2.train_test_split(
    time_fracs = [.8, .1, .1]
)
rnn_dat2.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])

In [None]:
import importlib
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNN, RNNData

In [None]:
reproducibility.set_seed()

rnn = RNN(params)

In [None]:
m, errs = rnn.run_model(rnn_dat2)

In [None]:
rnn.model_predict.get_weights()

In [None]:
params['rnn_units']

In [None]:
params.update({
    'phys_initialize': True,
    'scaler': None, # TODO
    'dense_layers': 0, # NOT including single Dense output layer which is hard-coded
    'activation': ['relu', 'relu'], # TODO tanh, relu the same
    'batch_schedule_type': None # Hopefully this isn't a necessity like before, but maybe it will help
})

In [None]:
rnn_dat2 = RNNData(dat[cases[10]], params['scaler'], params['features_list'])
rnn_dat2.train_test_split(
    time_fracs = [.8, .1, .1]
)
rnn_dat2.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])

In [None]:
reproducibility.set_seed()

rnn = RNN(params)
m, errs = rnn.run_model(rnn_dat2)

## LSTM

TODO: FIX BELOW

In [None]:
import importlib 
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNN_LSTM

In [None]:
params = read_yml("params.yaml", subkey="lstm")
params = RNNParams(params)

In [None]:
rnn_dat = rnn_data_wrap(combine_nested(train3), params)
rnn_dat.train_test_split(
    time_fracs = [.8, .1, .1]
)
rnn_dat.scale_data()
rnn_dat.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])

In [None]:
from moisture_rnn import ResetStatesCallback, EarlyStoppingCallback
params.update({'epochs': 50, 'learning_rate': 0.001, 'verbose_fit': True, 'rnn_layers': 2, 'rnn_units': 20, 'dense_layers': 1, 'dense_units': 10,
              'activation': ['tanh', 'tanh'], 'features_list': rnn_dat.features_list,
              'batch_schedule_type':'exp', 'bmin': 10, 'bmax':rnn_dat.hours})
reproducibility.set_seed(123)
lstm = RNN_LSTM(params)

history = lstm.model_train.fit(rnn_dat.X_train, rnn_dat.y_train, 
                    batch_size = params['batch_size'], epochs=params['epochs'], 
                    callbacks = [ResetStatesCallback(params),
                                EarlyStoppingCallback(patience = 15)],
                   validation_data = (rnn_dat.X_val, rnn_dat.y_val))
              

In [None]:
params.update({'epochs': 50, 'learning_rate': 0.001, 'verbose_fit': True, 'rnn_layers': 2, 'rnn_units': 20, 'dense_layers': 1, 'dense_units': 10,
              'activation': ['tanh', 'tanh'], 'features_list': rnn_dat.features_list,
              'batch_schedule_type':'exp', 'bmin': 10, 'bmax':rnn_dat.hours,
              'early_stopping_patience': 25})
reproducibility.set_seed(123)
lstm = RNN_LSTM(params)
m, errs = lstm.run_model(rnn_dat)

In [None]:
rnn_dat.spatial

In [None]:
params = RNNParams(read_yml("params.yaml", subkey='lstm'))
params

In [None]:
train = read_pkl("data/train.pkl")

In [None]:
from itertools import islice
train = {k: train[k] for k in islice(train, 100)}

In [None]:
from data_funcs import combine_nested
rnn_dat_sp = RNNData(
    combine_nested(train), # input dictionary
    scaler="standard",  # data scaling type
    features_list = params['features_list'] # features for predicting outcome
)


rnn_dat_sp.train_test_split(   
    time_fracs = [.8, .1, .1], # Percent of total time steps used for train/val/test
    space_fracs = [.8, .1, .1] # Percent of total timeseries used for train/val/test
)
rnn_dat_sp.scale_data()

rnn_dat_sp.batch_reshape(
    timesteps = params['timesteps'], # Timesteps aka sequence length for RNN input data. 
    batch_size = params['batch_size'] # Number of samples of length timesteps for a single round of grad. descent
)

In [None]:
params.update({
    'loc_batch_reset': rnn_dat_sp.n_seqs # Used to reset hidden state when location changes for a given batch
})

In [None]:
reproducibility.set_seed()
rnn_sp = RNN_LSTM(params)
m_sp, errs = rnn_sp.run_model(rnn_dat_sp)

In [None]:
errs.mean()