# v2.1 exploration: trying to make the moisture models work better

In [46]:
# Environment
import os
import os.path as osp
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import sys
# Local modules
sys.path.append('..')
import reproducibility
import pandas as pd
from utils import print_dict_summary
from data_funcs import rmse, build_train_dict
from moisture_rnn import RNNParams, RNNData, RNN, RNN_LSTM
from moisture_rnn_pkl import pkl2train
from tensorflow.keras.callbacks import Callback
from utils import hash2
import copy
import logging
import pickle
from utils import logging_setup, read_yml, read_pkl, hash_ndarray, hash_weights, str2time
import yaml
import copy

In [4]:
logging_setup()

## Test Data

In [5]:
file_paths = ['data/fmda_nw_202401-05_f05.pkl']

In [6]:
# Params used for data filtering
params_data = read_yml("params_data.yaml") 
params_data

{'max_intp_time': 10,
 'zero_lag_threshold': 10,
 'hours': 720,
 'min_fm': 1,
 'max_fm': 90,
 'min_rain': 0,
 'max_rain': 100,
 'min_wind': 0,
 'max_wind': 35,
 'min_solar': 0,
 'max_solar': 1400,
 'min_soilm': 0,
 'features_all': ['Ed',
  'Ew',
  'solar',
  'wind',
  'elev',
  'lon',
  'lat',
  'soilm',
  'canopyw',
  'groundflux',
  'rain']}

In [7]:
# Load the default RNN hyperparameters from the 'rnn' section of params.yaml,
# then override a subset of them for this experiment.
params = read_yml("params.yaml", subkey='rnn')
params = RNNParams(params)
params.update({'epochs': 200,
               'learning_rate': 0.001,
               'activation': ['tanh', 'tanh'], # Activation for RNN Layers, Dense layers respectively.
               # Key names follow the loaded params ('rnn_layers', 'rnn_units'), which are the
               # names read elsewhere in this notebook (e.g. params['rnn_units']).
               'rnn_layers': 2, 'rnn_units': 30,
               'dense_layers': 2, 'dense_units': 30,
               'early_stopping_patience': 30, # how many epochs without validation improvement to wait before stopping
               'batch_schedule_type': 'exp', # Hidden state batch reset schedule
               'bmin': 20, # Lower bound of hidden state batch reset
               'bmax': params_data['hours'], # Upper bound of hidden state batch reset, using max hours
               'features_list': ['Ed', 'Ew', 'rain', 'elev', 'lon', 'lat', 'solar', 'wind'],
               'timesteps': 12
              })

Checking params...
Input dictionary passed all checks.
Calculating shape params based on features list, timesteps, and batch size
Input Feature List: ['Ed', 'Ew', 'rain']
Input Timesteps: 12
Input Batch Size: 32
Calculated params:
Number of features: 3
Batch Shape: (32, 12, 3)
{'batch_size': 32, 'timesteps': 12, 'optimizer': 'adam', 'rnn_layers': 1, 'rnn_units': 20, 'dense_layers': 1, 'dense_units': 5, 'activation': ['tanh', 'tanh'], 'dropout': [0.2, 0.2], 'recurrent_dropout': 0.2, 'reset_states': True, 'batch_schedule_type': 'exp', 'bmin': 20, 'bmax': 200, 'epochs': 20, 'learning_rate': 0.001, 'clipvalue': 10.0, 'phys_initialize': False, 'stateful': True, 'verbose_weights': True, 'verbose_fit': False, 'features_list': ['Ed', 'Ew', 'rain'], 'scale': True, 'scaler': 'standard', 'time_fracs': [0.9, 0.05, 0.05], 'early_stopping_patience': 5, 'predict_spinup_hours': 5, 'n_features': 3, 'batch_shape': (32, 12, 3)}
Calculating shape params based on features list, timesteps, and batch size
In

In [9]:
dat = read_pkl(file_paths[0])

loading file data/fmda_nw_202401-05_f05.pkl


In [110]:
import importlib
import data_funcs
importlib.reload(data_funcs)
from data_funcs import build_train_dict

In [111]:
params_data.update({'hours': 3648})

In [112]:
train = build_train_dict(file_paths, params_data, spatial=False, forecast_step=1)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Extracting data from input file data/fmda_nw_202401-05_f05.pkl
loading file data/fmda_nw_202401-05_f05.pkl
2024-10-09 11:28:28,527 - INFO - PLFI1_202401 HRRR.time time array increments are 1.0 hours
2024-10-09 11:28:28,574 - INFO - PLFI1 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-10-09 11:28:28,724 - INFO - SADI1_202401 HRRR.time time array increments are 1.0 hours
2024-10-09 11:28:28,774 - INFO - SADI1 RAWS.time_raws time array increments are min 1.0 max 3.0
2024-10-09 11:28:28,936 - INFO - SRFI1_202401 HRRR.time time array increments are 1.0 hours
2024-10-09 11:28:28,992 - INFO - SRFI1 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-10-09 11:28:29,148 - INFO - WEFI1_202401 HRRR.time time array increments are 1.0 hours
2024-10-09 11:28:29,194 - INFO - WEFI1 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-10-09 11:28:29,349 - INFO - AGFO3_202401 HRRR.time time arra

In [113]:
case = "PLFI1_202401"

In [114]:
train[case].keys()

dict_keys(['time', 'X', 'y', 'id', 'case', 'filename', 'loc', 'features_list', 'atm_source', 'hours'])

In [115]:
dat[case].keys()

dict_keys(['loc', 'RAWS', 'HRRR'])

In [116]:
len(train[case]['time'])

3648

In [117]:
len(dat[case]['HRRR']['time'])

3648

In [118]:
train[case]['features_list']

['doy',
 'hod',
 'elev',
 'lon',
 'lat',
 'temp',
 'rh',
 'wind',
 'solar',
 'soilm',
 'canopyw',
 'groundflux',
 'Ed',
 'Ew',
 'rain']

In [119]:
train[case]['X'][100:120, -1]

array([0.0017435 , 0.        , 0.02438276, 0.04304449, 0.00700528,
       0.00576424, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

In [120]:
len(train[case]['X'][:, -1])

3648

In [121]:
train[case]['X'][100, -1]

0.001743497113644327

In [122]:
train[case]['time'][100]

datetime.datetime(2024, 1, 5, 4, 0)

In [123]:
(dat[case]['HRRR']['f01']['precip_accum'] - dat[case]['HRRR']['f00']['precip_accum'])[100]

0.001743497113644327

In [124]:
str2time(dat[case]['HRRR']['time'])[100]

datetime.datetime(2024, 1, 5, 4, 0)

In [125]:
train[case]['features_list'][-2]

'Ew'

In [126]:
train[case]['X'][100, -2]

18.552369470924507

In [127]:
dat[case]['HRRR']['f01']['Ew'][100]

18.552369470924507

In [129]:
import importlib
import data_funcs
importlib.reload(data_funcs)
from data_funcs import build_train_dict

In [130]:
train3 = build_train_dict(file_paths, params_data, spatial=False, forecast_step=3)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Extracting data from input file data/fmda_nw_202401-05_f05.pkl
loading file data/fmda_nw_202401-05_f05.pkl
2024-10-09 11:29:47,717 - INFO - PLFI1_202401 HRRR.time time array increments are 1.0 hours
2024-10-09 11:29:47,765 - INFO - PLFI1 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-10-09 11:29:47,914 - INFO - SADI1_202401 HRRR.time time array increments are 1.0 hours
2024-10-09 11:29:47,958 - INFO - SADI1 RAWS.time_raws time array increments are min 1.0 max 3.0
2024-10-09 11:29:48,104 - INFO - SRFI1_202401 HRRR.time time array increments are 1.0 hours
2024-10-09 11:29:48,152 - INFO - SRFI1 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-10-09 11:29:48,297 - INFO - WEFI1_202401 HRRR.time time array increments are 1.0 hours
2024-10-09 11:29:48,349 - INFO - WEFI1 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-10-09 11:29:48,512 - INFO - AGFO3_202401 HRRR.time time arra

In [131]:
train3[case]['X'].shape[0]

3648

In [132]:
train3[case]['X'][100, -1]

0.7219795919548502

In [133]:
(dat[case]['HRRR']['f03']['precip_accum'] - dat[case]['HRRR']['f00']['precip_accum'])[100]

0.7219795919548502

In [134]:
dat[case]['RAWS']['rain'][100:110]

array([0.   , 0.   , 0.762, 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ,
       0.   ])

In [135]:
from data_funcs import get_hrrr_atm

In [139]:
h, hnames = get_hrrr_atm(dat[case], 'f03')

In [141]:
hnames

['temp', 'rh', 'wind', 'solar', 'soilm', 'canopyw', 'groundflux', 'Ed', 'Ew']

In [144]:
h[-1].mean()

15.494270544356842

In [145]:
train[case]['features_list']

['doy',
 'hod',
 'elev',
 'lon',
 'lat',
 'temp',
 'rh',
 'wind',
 'solar',
 'soilm',
 'canopyw',
 'groundflux',
 'Ed',
 'Ew',
 'rain']

In [150]:
np.mean(train3[case]['X'][:, -2])

15.494270544356842

In [182]:
v = h[-1]

In [152]:
len(v)

3648

In [153]:
type(v)

numpy.ndarray

In [154]:
v.shape

(3648,)

In [189]:
def shift_time(v, forecast_step):
    """
    Shifts the values of a numpy array forward by a specified number of steps.

    The shift is applied along the first axis. Values pushed past the end are
    dropped and the vacated leading positions are filled with NaN.

    Parameters:
    ----------
    v : numpy.ndarray
        The input array to be shifted (any array-like with at least one
        dimension is accepted).
    forecast_step : int
        The number of positions to shift the array forward. Must be
        non-negative; 0 returns an unshifted float copy, and a value greater
        than or equal to the length of `v` returns an all-NaN array.

    Returns:
    -------
    numpy.ndarray
        A new float array of the same shape as `v`, with values shifted forward
        by `forecast_step` and the leading positions filled with NaN.

    Raises:
    ------
    ValueError
        If `forecast_step` is negative.

    Example:
    -------
    >>> v = np.array([1, 2, 3, 4, 5])
    >>> shift_time(v, 2)
    array([nan, nan,  1.,  2.,  3.])
    >>> shift_time(v, 0)
    array([1., 2., 3., 4., 5.])
    """
    if forecast_step < 0:
        raise ValueError(f"forecast_step must be non-negative, got {forecast_step}")

    v = np.asarray(v)
    shifted = np.full(v.shape, np.nan, dtype=float)
    # Use an explicit stop index rather than v[:-forecast_step]: the negative
    # slice is empty when forecast_step == 0, which made the assignment fail.
    n_keep = max(v.shape[0] - forecast_step, 0)
    shifted[forecast_step:] = v[:n_keep]
    return shifted

In [190]:
len(h)

9

In [226]:
from data_funcs import build_features_single
X, names = build_features_single(dat[case], atm="HRRR", fstep="f00", fprev = "f00")

In [227]:
X.shape

(3648, 15)

In [228]:
len(names)

15

In [229]:
names

['doy',
 'hod',
 'elev',
 'lon',
 'lat',
 'temp',
 'rh',
 'wind',
 'solar',
 'soilm',
 'canopyw',
 'groundflux',
 'Ed',
 'Ew',
 'rain']

In [232]:
X[:, -1].max()

0.0

In [197]:
hnames

['temp', 'rh', 'wind', 'solar', 'soilm', 'canopyw', 'groundflux', 'Ed', 'Ew']

In [199]:
indices_to_shift = [names.index(item) for item in hnames]

In [200]:
indices_to_shift

[5, 6, 7, 8, 9, 10, 11, 12, 13]

In [201]:
v

array([18.7451731 , 18.69869469, 19.68633807, ...,  5.30676972,
        5.42227214,  6.30477207])

In [204]:
forecast_step=3

In [210]:
# Time Shift: move the columns listed in `indices_to_shift` forward by
# `forecast_step` rows, filling the vacated leading rows with NaN.
shifted_arr = X.astype(float).copy()
shifted_arr[:forecast_step, indices_to_shift] = np.nan
# Explicit stop index instead of X[:-forecast_step]: that negative slice is
# empty when forecast_step == 0, which would make the assignment fail.
shifted_arr[forecast_step:, indices_to_shift] = X[:max(X.shape[0] - forecast_step, 0), indices_to_shift]

In [211]:
shifted_arr.shape

(3648, 15)

In [218]:
shifted_arr[0:5, 14]

array([0., 0., 0., 0., 0.])

In [273]:
import importlib
import data_funcs
importlib.reload(data_funcs)
from data_funcs import build_train_dict

In [274]:
train = build_train_dict(file_paths, params_data, spatial=False, forecast_step=0)
train3 = build_train_dict(file_paths, params_data, spatial=False, forecast_step=3)
trainr = build_train_dict(file_paths, params_data, spatial=False, atm_source = "RAWS")

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Extracting data from input file data/fmda_nw_202401-05_f05.pkl
loading file data/fmda_nw_202401-05_f05.pkl
2024-10-09 12:55:05,961 - INFO - PLFI1_202401 HRRR.time time array increments are 1.0 hours
2024-10-09 12:55:06,007 - INFO - PLFI1 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-10-09 12:55:06,155 - INFO - SADI1_202401 HRRR.time time array increments are 1.0 hours
2024-10-09 12:55:06,203 - INFO - SADI1 RAWS.time_raws time array increments are min 1.0 max 3.0
2024-10-09 12:55:06,350 - INFO - SRFI1_202401 HRRR.time time array increments are 1.0 hours
2024-10-09 12:55:06,399 - INFO - SRFI1 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-10-09 12:55:06,544 - INFO - WEFI1_202401 HRRR.time time array increments are 1.0 hours
2024-10-09 12:55:06,591 - INFO - WEFI1 RAWS.time_raws time array increments are min 1.0 max 2.0
2024-10-09 12:55:06,738 - INFO - AGFO3_202401 HRRR.time time arra

In [262]:
case

'PLFI1_202401'

In [275]:
train[case]['X'].shape

(3648, 15)

In [276]:
train3[case]['X'].shape

(3648, 15)

In [277]:
trainr[case]['X'].shape

(3648, 12)

In [278]:
train[case]['features_list']

['doy',
 'hod',
 'elev',
 'lon',
 'lat',
 'temp',
 'rh',
 'wind',
 'solar',
 'soilm',
 'canopyw',
 'groundflux',
 'Ed',
 'Ew',
 'rain']

In [279]:
train[case]['X'][0:5, -1]

array([0., 0., 0., 0., 0.])

In [282]:
train[case]['X'][0:5, -2]

array([21.07171093, 21.94879604, 20.42700555, 20.15421267, 20.56482388])

In [283]:
train3[case]['X'][0:5, -2]

array([        nan,         nan,         nan, 18.7451731 , 18.69869469])

In [284]:
train3[case]['X'][0:5, -1]

array([nan, nan, nan,  0.,  0.])

## Test Other ML

In [None]:
params = read_yml("params.yaml", subkey='xgb')
params

In [None]:
dat = read_pkl("data/train.pkl")

In [None]:
cases = [*dat.keys()]

In [None]:
rnn_dat = RNNData(dat[cases[10]], params['scaler'], params['features_list'])
rnn_dat.train_test_split(
    time_fracs = [.8, .1, .1]
)
rnn_dat.scale_data()

In [None]:
from moisture_models import XGB, RF, LM

In [None]:
mod = XGB(params)

In [None]:
mod.params

In [None]:
mod.fit(rnn_dat.X_train, rnn_dat.y_train)

In [None]:
preds = mod.predict(rnn_dat.X_test)

In [None]:
rmse(preds, rnn_dat.y_test)

In [None]:
# Overlay the test-set targets and the XGB predictions; the labels and legend
# make the two curves distinguishable, and the trailing `;` hides the repr output.
plt.plot(rnn_dat.y_test, label="y_test")
plt.plot(preds, label="XGB predictions")
plt.legend();

In [None]:
params = read_yml("params.yaml", subkey='rf')
rnn_dat = RNNData(dat[cases[10]], features_list = ['Ed', 'Ew', 'solar', 'wind', 'rain'])
rnn_dat.train_test_split(
    time_fracs = [.8, .1, .1]
)

In [None]:
import importlib
import moisture_models
importlib.reload(moisture_models)

In [None]:
params

In [None]:
mod2 = RF(params)
mod2.fit(rnn_dat.X_train, rnn_dat.y_train.flatten())
preds2 = mod2.predict(rnn_dat.X_test)
print(rmse(preds2, rnn_dat.y_test.flatten()))
plt.plot(rnn_dat.y_test)
plt.plot(preds2)

In [None]:
from moisture_models import RF
mod2 = RF(params)

In [None]:
params = read_yml("params.yaml", subkey='lm')
rnn_dat = RNNData(dat[cases[10]], features_list = ['Ed', 'Ew', 'solar', 'wind', 'rain'])
rnn_dat.train_test_split(
    time_fracs = [.8, .1, .1]
)
mod = LM(params)

In [None]:
# Fit the linear model and predict on the test split.
mod.fit(rnn_dat.X_train, rnn_dat.y_train)
preds = mod.predict(rnn_dat.X_test)
# Score the LM predictions produced in this cell (`preds`), not the RF
# predictions (`preds2`) left over from the earlier cell. Both sides are
# flattened so their shapes agree regardless of the model's output shape.
print(rmse(np.ravel(preds), rnn_dat.y_test.flatten()))

## RNN

### Physics-Initialized RNN

In [None]:
# Reload the RNN hyperparameters before overriding them: `params` was last
# assigned the 'lm' config in the linear-model cell, so without this reload the
# RNN below would be built from the wrong parameter set.
# NOTE(review): `rnn_dat` used by the next cell was built with its own feature
# list in the LM cell — confirm it matches params['features_list'].
params = RNNParams(read_yml("params.yaml", subkey='rnn'))
params.update({
    'epochs':100,
    'dense_layers': 0,
    'activation': ['relu', 'relu'],
    'phys_initialize': False,
    'dropout': [0,0]
})

In [None]:
reproducibility.set_seed()
rnn = RNN(params)
m, errs = rnn.run_model(rnn_dat)

In [None]:
rnn.model_train.summary()

In [None]:
params.update({
    'phys_initialize': True,
    'scaler': None, # TODO
    'dense_layers': 0, # NOT including single Dense output layer which is hard-coded
    'activation': ['linear', 'linear'], # TODO tanh, relu the same
    'batch_schedule_type': None # Hopefully this isn't a necessity like before, but maybe it will help
})

In [None]:
rnn_dat2 = RNNData(dat[cases[10]], params['scaler'], params['features_list'])
rnn_dat2.train_test_split(
    time_fracs = [.8, .1, .1]
)
rnn_dat2.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])

In [None]:
import importlib
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNN, RNNData

In [None]:
reproducibility.set_seed()

rnn = RNN(params)

In [None]:
m, errs = rnn.run_model(rnn_dat2)

In [None]:
rnn.model_predict.get_weights()

In [None]:
params['rnn_units']

In [None]:
params.update({
    'phys_initialize': True,
    'scaler': None, # TODO
    'dense_layers': 0, # NOT including single Dense output layer which is hard-coded
    'activation': ['relu', 'relu'], # TODO tanh, relu the same
    'batch_schedule_type': None # Hopefully this isn't a necessity like before, but maybe it will help
})

In [None]:
rnn_dat2 = RNNData(dat[cases[10]], params['scaler'], params['features_list'])
rnn_dat2.train_test_split(
    time_fracs = [.8, .1, .1]
)
rnn_dat2.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])

In [None]:
reproducibility.set_seed()

rnn = RNN(params)
m, errs = rnn.run_model(rnn_dat2)

## LSTM

**TODO:** Fix the LSTM cells below.

In [None]:
import importlib 
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNN_LSTM

In [None]:
params = read_yml("params.yaml", subkey="lstm")
params = RNNParams(params)

In [None]:
rnn_dat = RNNData(dat[cases[10]], params['scaler'], params['features_list'])
rnn_dat.train_test_split(
    time_fracs = [.8, .1, .1]
)
rnn_dat.scale_data()
rnn_dat.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])

In [None]:
# Train the LSTM by calling fit on its training model directly (instead of going
# through run_model), with explicit callbacks and a validation set.
from moisture_rnn import ResetStatesCallback, EarlyStoppingCallback
# Override hyperparameters for this run; 'features_list' and 'bmax' are taken
# from rnn_dat so they follow the data object being trained on.
params.update({'epochs': 50, 'learning_rate': 0.001, 'verbose_fit': True, 'rnn_layers': 2, 'rnn_units': 20, 'dense_layers': 1, 'dense_units': 10,
              'activation': ['tanh', 'tanh'], 'features_list': rnn_dat.features_list,
              'batch_schedule_type':'exp', 'bmin': 10, 'bmax':rnn_dat.hours})
# Seed before constructing the model (presumably so weight initialization is
# repeatable — confirm against the `reproducibility` module).
reproducibility.set_seed(123)
lstm = RNN_LSTM(params)

# NOTE(review): the early-stopping patience is hard-coded to 15 here rather than
# read from params['early_stopping_patience'] — confirm this is intentional.
history = lstm.model_train.fit(rnn_dat.X_train, rnn_dat.y_train, 
                    batch_size = params['batch_size'], epochs=params['epochs'], 
                    callbacks = [ResetStatesCallback(params),
                                EarlyStoppingCallback(patience = 15)],
                   validation_data = (rnn_dat.X_val, rnn_dat.y_val))
              

In [None]:
params.update({'epochs': 50, 'learning_rate': 0.001, 'verbose_fit': True, 'rnn_layers': 2, 'rnn_units': 20, 'dense_layers': 1, 'dense_units': 10,
              'activation': ['tanh', 'tanh'], 'features_list': rnn_dat.features_list,
              'batch_schedule_type':'exp', 'bmin': 10, 'bmax':rnn_dat.hours,
              'early_stopping_patience': 25})
reproducibility.set_seed(123)
lstm = RNN_LSTM(params)
m, errs = lstm.run_model(rnn_dat)

In [None]:
rnn_dat.spatial

In [None]:
params = RNNParams(read_yml("params.yaml", subkey='lstm'))
params

In [None]:
train = read_pkl("data/train.pkl")

In [None]:
from itertools import islice
train = {k: train[k] for k in islice(train, 100)}

In [None]:
from data_funcs import combine_nested
rnn_dat_sp = RNNData(
    combine_nested(train), # input dictionary
    scaler="standard",  # data scaling type
    features_list = params['features_list'] # features for predicting outcome
)


rnn_dat_sp.train_test_split(   
    time_fracs = [.8, .1, .1], # Percent of total time steps used for train/val/test
    space_fracs = [.8, .1, .1] # Percent of total timeseries used for train/val/test
)
rnn_dat_sp.scale_data()

rnn_dat_sp.batch_reshape(
    timesteps = params['timesteps'], # Timesteps aka sequence length for RNN input data. 
    batch_size = params['batch_size'] # Number of samples of length timesteps for a single round of grad. descent
)

In [None]:
params.update({
    'loc_batch_reset': rnn_dat_sp.n_seqs # Used to reset hidden state when location changes for a given batch
})

In [None]:
rnn_sp = RNN_LSTM(params)
m_sp, errs = rnn_sp.run_model(rnn_dat_sp)

In [None]:
errs.mean()