# v2.1 run RNN with Spatial Training

This notebook is intended to set up a test where the RNN is run serially by location and compared to the spatial training scheme. Additionally, the ODE model with the augmented KF will be run as a comparison, but note that the RNN models will be predicting entirely without knowledge of the held-out locations, while the augmented KF will be run directly on the test locations.


## Environment Setup

In [None]:
import numpy as np
from utils import print_dict_summary, print_first, str2time, logging_setup
import pickle
import logging
import os.path as osp
from moisture_rnn_pkl import pkl2train
from moisture_rnn import RNNParams, RNNData, RNN 
from utils import hash2, read_yml, read_pkl, retrieve_url, Dict
from moisture_rnn import RNN
import reproducibility
from data_funcs import rmse, to_json, combine_nested, process_train_dict
from moisture_models import run_augmented_kf
import copy
import pandas as pd
import matplotlib.pyplot as plt
import yaml
import time

In [None]:
# Configure logging for this notebook through the project helper from utils.
logging_setup()

In [None]:
# Fetch the pickled FMDA dictionary from the given URL and store it at
# dest_path, which is the same path listed in file_paths in the next cell.
retrieve_url(
    url = "https://demo.openwfm.org/web/data/fmda/dicts/fmda_nw_202401-05_f05.pkl", 
    dest_path = "data/fmda_nw_202401-05_f05.pkl")

In [None]:
# Pickle files from which the training cases are built.
file_paths = [
    'data/fmda_nw_202401-05_f05.pkl',
]

In [None]:
# Read/write control for the pickled training cases.
# The three flags are independent and are checked in this order by the cell
# that builds `train`: create, then write, then read.
train_file='data/train.pkl'  # path the cases are written to / read from
train_create=True   # build the cases from file_paths
train_write=True    # pickle the cases to train_file
train_read=True     # load the cases from train_file

In [None]:
# Params used for data filtering
# Loaded from params_data.yaml; the 'hours' entry is also reused later as the
# upper bound of the hidden state batch reset ('bmax').
params_data = read_yml("params_data.yaml") 
params_data  # echo the loaded values as the cell output

In [None]:
# Params used for setting up RNN
# Only the 'rnn' subkey of params.yaml is read here; the values are wrapped
# in RNNParams and partly overridden in the validation-setup cell.
params = read_yml("params.yaml", subkey='rnn') 
params  # echo the loaded values as the cell output

In [None]:
# Build, persist and/or reload the dictionary of training cases, depending on
# the train_create / train_write / train_read flags.
# NOTE: when train_create is False and train_write is True, `train` must
# already exist in the session, otherwise pickle.dump raises NameError.
if train_create:
    logging.info('creating the training cases from files %s',file_paths)
    # Assemble the cases from the raw pickle files, filtered by params_data
    train = process_train_dict(file_paths, params_data = params_data, verbose=True)
if train_write:
    with open(train_file, 'wb') as file:
        logging.info('Writing the train cases into file %s',train_file)
        pickle.dump(train, file)
if train_read:
    logging.info('Reading the train cases from file %s',train_file)
    train = read_pkl(train_file)

In [None]:
from itertools import islice

# Restrict the experiment to the first 150 cases, in the dictionary's
# iteration order (fewer if `train` holds fewer than 150).
subset_keys = list(islice(train, 150))
train = {key: train[key] for key in subset_keys}

## Setup Validation Runs

In [None]:
# Wrap the raw YAML settings in the project's RNNParams container, then
# override the hyperparameters used for this validation experiment.
params = RNNParams(params)
validation_overrides = {
    # Optimisation
    'epochs': 200,
    'learning_rate': 0.001,
    # Architecture
    'activation': ['tanh', 'tanh'],  # RNN layers, dense layers respectively
    'recurrent_layers': 2, 'recurrent_units': 30,
    'dense_layers': 2, 'dense_units': 30,
    # Epochs without validation improvement tolerated before stopping
    'early_stopping_patience': 30,
    # Hidden state batch reset schedule and its lower / upper bounds
    'batch_schedule_type': 'exp',
    'bmin': 20,
    'bmax': params_data['hours'],  # upper bound taken from the max hours
    'features_list': ['Ed', 'Ew', 'rain', 'elev', 'lon', 'lat', 'solar', 'wind'],
}
params.update(validation_overrides)

In [None]:
# Seed with 123 through the project's reproducibility helper before the
# spatial run.
# NOTE(review): which random generators this covers is defined in the
# reproducibility module — confirm there.
reproducibility.set_seed(123)

## Spatial Data Training

In [None]:
# Start timer for code
# The spatial timing therefore covers data preparation as well as training,
# up to the "End Timer" cell.
start_time = time.time()

In [None]:
# Combine Nested Dictionary into Spatial Data
# The combined result is wrapped in the project's Dict class before being
# passed to RNNData.
train_sp = Dict(combine_nested(train))

In [None]:
# Wrap the combined spatial dictionary for RNN training, using standard
# scaling and the feature set chosen in params.
rnn_dat_sp = RNNData(
    train_sp,
    scaler="standard",
    features_list=params['features_list'],
)

# Split train/val/test along both axes: 80/10/10 of the time steps and
# 80/10/10 of the timeseries (locations).
rnn_dat_sp.train_test_split(time_fracs=[.8, .1, .1], space_fracs=[.8, .1, .1])
rnn_dat_sp.scale_data()

# Cut the data into samples of `timesteps` length (the RNN sequence length),
# grouped `batch_size` at a time for one round of gradient descent.
seq_len = params['timesteps']
samples_per_batch = params['batch_size']
rnn_dat_sp.batch_reshape(timesteps=seq_len, batch_size=samples_per_batch)

In [None]:
# Update Params specific to spatial training
# n_seqs is read from the spatial data object after batch_reshape has run.
params.update({
    'loc_batch_reset': rnn_dat_sp.n_seqs # Used to reset hidden state when location changes for a given batch
})

In [None]:
# Build the RNN from the current params and run it on the spatial data.
# run_model returns two values; the mean of `errs` is reported later as the
# spatial-training RMSE.
# NOTE(review): `m` presumably holds the model predictions — confirm in
# moisture_rnn. It is overwritten by later cells.
rnn_sp = RNN(params)
m, errs = rnn_sp.run_model(rnn_dat_sp)

In [None]:
# Mean of the errors returned by run_model (shown as the cell output).
errs.mean()

In [None]:
# End Timer
end_time = time.time()

# Calculate Code Runtime
# Wall-clock seconds since the spatial start_time; kept in elapsed_time_sp
# for the final comparison.
elapsed_time_sp = end_time - start_time
print(f"Spatial Training Elapsed time: {elapsed_time_sp:.4f} seconds")

## Run ODE + KF and Compare

In [None]:
# Get timeseries IDs from previous RNNData object
# NOTE(review): 'test_locs' is presumably the set of locations held out by
# the space_fracs split — confirm in RNNData.
test_cases = rnn_dat_sp.loc['test_locs']
print(len(test_cases))  # number of test cases

In [None]:
# Reuse the spatial run's test start index so the KF is scored on the same
# time window (slices of the form [test_ind:] below).
test_ind = rnn_dat_sp.test_ind # Time index for test period start
print(test_ind)

In [None]:
# Run the augmented Kalman filter on every test case and record its RMSE over
# the test period (time index test_ind onward).
# Each case dictionary in `train` is modified in place: 'h2', 'scale_fm' and
# 'm_kf' are written.
outputs_kf = {}
for case in test_cases:
    print("~"*50)
    print(case)
    # Run Augmented KF
    print('Running Augmented KF')
    # NOTE(review): 'h2' is presumably the hour at which the KF stops
    # assimilating data and starts forecasting — confirm in moisture_models.
    train[case]['h2'] = test_ind
    train[case]['scale_fm'] = 1
    m, Ec = run_augmented_kf(train[case])
    y = train[case]['y']
    train[case]['m_kf'] = m
    # Score once and reuse the value for both the printout and the table
    kf_rmse = rmse(m[test_ind:],y[test_ind:])
    print(f"KF RMSE: {kf_rmse}")
    outputs_kf[case] = {'case':case, 'errs': kf_rmse}

In [None]:
# outputs_kf is keyed by case, so from_dict yields one column per case;
# transposing gives one row per case with columns 'case' and 'errs'.
df_kf = pd.DataFrame.from_dict(outputs_kf).transpose()
df_kf.head()

In [None]:
# Mean KF RMSE across the test cases (shown as the cell output).
df_kf.errs.mean()

## Serial Training

In [None]:
# Start timer for code
# Rebinds start_time, so the serial timing below is measured from here.
start_time = time.time()

In [None]:
# Update Params specific to Serial training
# NOTE(review): None presumably disables the location-based reset, since each
# fit call sees a single location — confirm in moisture_rnn.
params.update({
    'loc_batch_reset': None, # Used to reset hidden state when location changes for a given batch
    'epochs': 2 # fewer epochs since fit will be run multiple times over locations
})

In [None]:
# Reuse the location split stored on the spatial data object so the serial
# scheme trains and tests on the same sets of locations.
train_cases = rnn_dat_sp.loc['train_locs']
test_cases = rnn_dat_sp.loc['test_locs']

In [None]:
# Build the RNNData object for the first train case: split over time only
# (80/10/10 train/val/test), scale, and reshape into batches. The model
# itself is created in the next cell.
# Scaler and features come from params here, unlike the spatial setup, which
# hard-codes scaler="standard".
rnn_dat = RNNData(train[train_cases[0]], params['scaler'], params['features_list'])
rnn_dat.train_test_split(
    time_fracs = [.8, .1, .1]
)
rnn_dat.scale_data()
rnn_dat.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])

In [None]:
# Reseed and build the model that will be trained serially over locations.
# NOTE(review): set_seed is called without an argument here but with 123
# before the spatial run — confirm the default seed gives a fair comparison.
reproducibility.set_seed()
rnn = RNN(params)

In [None]:
# Train
def _prep_case(case_dict):
    """Return an RNNData for one location: time split, scaled, batched."""
    dat = RNNData(case_dict, params['scaler'], params['features_list'])
    dat.train_test_split(time_fracs=[.8, .1, .1])
    dat.scale_data()
    dat.batch_reshape(
        timesteps=params['timesteps'],
        batch_size=params['batch_size'],
    )
    return dat


# Fit the same model on one training location after another, validating on
# that location's own validation split each time.
for case in train_cases:
    print("~"*50)
    print(f"Training with Case {case}")
    rnn_dat_temp = _prep_case(train[case])
    val_pair = (rnn_dat_temp['X_val'], rnn_dat_temp['y_val'])
    rnn.fit(rnn_dat_temp['X_train'], rnn_dat_temp['y_train'],
            validation_data=val_pair)

In [None]:
# Predict
# Evaluate the serially trained RNN on each test location. Every location is
# split, scaled and reshaped the same way as in training, then the model
# predicts over the whole series and is scored on the test period only.
outputs_rnn_serial = {}
for case in test_cases:
    print("~"*50)
    rnn_dat_temp = RNNData(train[case], params['scaler'], params['features_list'])
    rnn_dat_temp.train_test_split(
        time_fracs = [.8, .1, .1]
    )
    rnn_dat_temp.scale_data()
    rnn_dat_temp.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])
    X_temp = rnn_dat_temp.scale_all_X()
    m = rnn.predict(X_temp)
    # Score against THIS location's observations and test start index; using
    # another location's y_test would compare unrelated series.
    test_ind = rnn_dat_temp.test_ind
    outputs_rnn_serial[case] = {'case':case, 'errs': rmse(m[test_ind:], rnn_dat_temp.y_test)}

In [None]:
# outputs_rnn_serial is keyed by case, so from_dict yields one column per
# case; transposing gives one row per case with columns 'case' and 'errs'.
df_rnn_serial = pd.DataFrame.from_dict(outputs_rnn_serial).transpose()
df_rnn_serial.head()

In [None]:
# Mean serial-RNN RMSE across the test cases (shown as the cell output).
df_rnn_serial.errs.mean()

In [None]:
# End Timer
end_time = time.time()

# Calculate Code Runtime
# Wall-clock seconds since the serial start_time; this span includes the
# prediction loop as well as training.
elapsed_time_ser = end_time - start_time
print(f"Serial Training Elapsed time: {elapsed_time_ser:.4f} seconds")

## Compare

In [None]:
# Size of the evaluation: number of test cases, and the length of the first
# dimension of y_test for the most recently built rnn_dat_temp.
print(f"Total Test Cases: {len(test_cases)}")
print(f"Total Test Hours: {rnn_dat_temp.y_test.shape[0]}")

In [None]:
# Accuracy summary of the three approaches: mean of the errors from the
# spatial run_model call, and mean per-case RMSE for the serial RNN and KF.
print(f"Spatial Training RMSE: {errs.mean()}")
print(f"Serial Training RMSE: {df_rnn_serial.errs.mean()}")
print(f"Augmented KF RMSE: {df_kf.errs.mean()}")

In [None]:
# Runtime summary, repeating the two elapsed times measured above.
print(f"Spatial Training Elapsed time: {elapsed_time_sp:.4f} seconds")
print(f"Serial Training Elapsed time: {elapsed_time_ser:.4f} seconds")