# Compare Forecast Steps and Atmospheric Data Sources (HRRR vs. RAWS)


In [None]:
import numpy as np
import pickle
import logging
import os.path as osp
import sys
sys.path.append('..')
from moisture_rnn_pkl import pkl2train
from moisture_rnn import RNNParams, RNNData, RNN 
from utils import hash2, read_yml, read_pkl, retrieve_url, print_dict_summary, print_first, str2time, logging_setup
from moisture_rnn import RNN
import reproducibility
from data_funcs import rmse, to_json, combine_nested, build_train_dict, subset_by_features
from moisture_models import run_augmented_kf
import copy
import pandas as pd
import matplotlib.pyplot as plt
import yaml
import time
import reproducibility
import tensorflow as tf

In [None]:
# Configure logging through the project's `utils.logging_setup` helper (default arguments).
logging_setup()

In [None]:
# Name of the pickled FMDA dataset used throughout this notebook.
filename = "fmda_rocky_202403-05_f05.pkl"

# Retrieve the dataset from the demo server into the local data directory
# using the project's `retrieve_url` helper.
retrieve_url(
    url=f"https://demo.openwfm.org/web/data/fmda/dicts/{filename}",
    dest_path=f"../data/{filename}",
)

In [None]:
# Switches for the training-data cells of this notebook.
train_create = True  # build the training dictionaries from the raw data file
train_write = True   # pickle the freshly built dictionaries into ../data
train_read = True    # load the training dictionaries from the pickles in ../data

In [None]:
# Input data files and the directory that holds them.
file_names = [filename]
file_dir = '../data'

# Full path of every input file.
file_paths = [osp.join(file_dir, name) for name in file_names]

In [None]:
# RNN hyperparameters: the 'rnn' subkey of ../params.yaml, wrapped in RNNParams.
params = RNNParams(read_yml("../params.yaml", subkey='rnn'))
# Data-preparation settings read from ../params_data.yaml.
params_data = read_yml("../params_data.yaml")

In [None]:
# Override the YAML-loaded RNN hyperparameters for this experiment.
# NOTE(review): 'bmax' reads params_data['hours'] at the moment this cell runs,
# i.e. the value loaded from params_data.yaml; a later change to
# params_data['hours'] is not reflected in params['bmax'].
params.update({'epochs': 200, 
               'learning_rate': 0.001,
               'activation': ['tanh', 'tanh'], # Activation for RNN Layers, Dense layers respectively.
               'rnn_layers': 2, 'recurrent_units': 30, 
               'dense_layers': 2, 'dense_units': 30,
               'early_stopping_patience': 30, # how many epochs of no validation accuracy gain to wait before stopping
               'batch_schedule_type': 'exp', # Hidden state batch reset schedule
               'bmin': 20, # Lower bound of hidden state batch reset, 
               'bmax': params_data['hours'], # Upper bound of hidden state batch reset, using max hours
               'features_list': ['Ed', 'Ew', 'rain', 'elev', 'lon', 'lat', 'solar', 'wind'],
               'timesteps': 12,
               'batch_size': 50,
               'time_fracs': [.9, .05, .05],
               'space_fracs': [.8, .1, .1]
              })

In [None]:
# Override the data-preparation settings for this experiment.
params_data.update({
    'hours': 2208,
    'max_intp_time': 12,
    'zero_lag_threshold': 12
})

# Keep the batch-reset upper bound in step with the overridden 'hours'.
# params['bmax'] was copied from params_data['hours'] as loaded from YAML,
# i.e. before the override above, so without this resync it would keep the
# stale YAML value instead of the number of hours actually used.
params.update({'bmax': params_data['hours']})


In [None]:
if train_create:
    # Build one HRRR-based training dict per forecast step (0-3). The spatial
    # combine is deferred (spatial=False) so locations can first be filtered
    # to those with complete RAWS sensors.
    train, train1, train2, train3 = [
        build_train_dict(file_paths, atm_source="HRRR",
                         params_data=params_data,
                         forecast_step=step, drop_na=True,
                         spatial=False, verbose=True)
        for step in range(4)
    ]

    # RAWS-based training dict, restricted to the model's input features.
    trainr = build_train_dict(file_paths, atm_source="RAWS",
                              params_data=params_data,
                              spatial=False, verbose=True,
                              features_subset=params['features_list'])
    trainr = subset_by_features(trainr, input_features=params['features_list'])

    # Subset the HRRR dicts to the locations kept in the RAWS dict, then
    # combine every dict in memory. Previously only `trainr` was combined
    # here and the HRRR dicts were combined solely while being pickled, so
    # with train_read=False the later cells received nested HRRR dicts next
    # to a combined RAWS dict.
    train, train1, train2, train3 = [
        combine_nested({k: d[k] for k in d if k in trainr})
        for d in (train, train1, train2, train3)
    ]
    trainr = combine_nested(trainr)

    if train_write:
        # Persist the combined dicts so later sessions can skip the rebuild.
        for pkl_path, train_obj in (
            ("../data/train_raws.pkl", trainr),
            ("../data/train_0hr.pkl", train),
            ("../data/train_1hr.pkl", train1),
            ("../data/train_2hr.pkl", train2),
            ("../data/train_3hr.pkl", train3),
        ):
            with open(pkl_path, 'wb') as file:
                pickle.dump(train_obj, file)

In [None]:
# Load the training dictionaries from the pickles in ../data: the RAWS-based
# dict and the HRRR-based dicts for forecast steps 0-3. This replaces any
# dictionaries of the same names already in memory.
if train_read:
    trainr = read_pkl("../data/train_raws.pkl")
    train = read_pkl("../data/train_0hr.pkl")
    train1 = read_pkl("../data/train_1hr.pkl")
    train2 = read_pkl("../data/train_2hr.pkl")
    train3 = read_pkl("../data/train_3hr.pkl")

## Handle Data

In [None]:
def rnn_data_wrap(dict0, params):
    """Build an RNNData object from a training dict, ready for model fitting.

    The data is wrapped with a standard scaler, split into train/val/test,
    scaled, and reshaped into batches with a zero start time for every
    training location.

    Args:
        dict0: Input training dictionary passed to ``RNNData``.
        params: Mapping providing 'features_list', 'time_fracs',
            'space_fracs', 'timesteps' and 'batch_size'.

    Returns:
        The prepared ``RNNData`` instance.
    """
    prepared = RNNData(
        dict0,
        scaler="standard",
        features_list=params['features_list'],
    )

    # Fractions of time steps / of timeseries used for train, val and test.
    prepared.train_test_split(
        time_fracs=params['time_fracs'],
        space_fracs=params['space_fracs'],
    )
    prepared.scale_data()

    # timesteps: sequence length of the RNN input samples.
    # batch_size: number of such samples per round of gradient descent.
    seq_len = params['timesteps']
    n_per_batch = params['batch_size']
    zero_starts = np.zeros(len(prepared.loc['train_locs']))
    prepared.batch_reshape(
        timesteps=seq_len,
        batch_size=n_per_batch,
        start_times=zero_starts,
    )

    return prepared

In [None]:
# Accumulators filled by the training cells: model outputs and errors per run.
fstep_m, fstep_errs = [], []

In [None]:
# HRRR-based training dicts in order of forecast step (0, 1, 2, 3).
train_fsteps = [train, train1, train2, train3]

In [None]:
# Train and evaluate one RNN per HRRR forecast step, collecting the model
# outputs and errors of every run.
for train_i in train_fsteps:
    print("~" * 100)
    print(f"Running Model for Forecast Step: {train_i['forecast_step'][0]}")

    # Reset the random seed before each run.
    reproducibility.set_seed()
    data = rnn_data_wrap(train_i, params)

    # Used to reset hidden state when location changes for a given batch.
    params.update({'loc_batch_reset': data.n_seqs})

    rnn = RNN(params)
    m, errs = rnn.run_model(data)
    print(f"Test RMSE: {errs.mean()}")

    fstep_m.append(m)
    fstep_errs.append(errs)

In [None]:
# Train and evaluate the RNN on the RAWS-based training dict, mirroring the
# per-forecast-step runs, and append its results to the same accumulators.
print("~" * 100)
print("Running Model for RAWS atmospheric data")

# Reset the random seed before the run.
reproducibility.set_seed()
data = rnn_data_wrap(trainr, params)

# Used to reset hidden state when location changes for a given batch.
params.update({'loc_batch_reset': data.n_seqs})

rnn = RNN(params)
m, errs = rnn.run_model(data)

# Report the error in the same format as the forecast-step runs.
print(f"Test RMSE: {errs.mean()}")

fstep_m.append(m)
fstep_errs.append(errs)

## Compare 

In [None]:
# Mean error of every run, in the order the runs were appended to fstep_errs.
[np.mean(run_errs) for run_errs in fstep_errs]

## Analyze Weights

In [None]:
# Summary of the training model held by `rnn` (the most recently trained RNN).
rnn.model_train.summary()

In [None]:
# Shape of the first weight array of the training model.
# Presumably the input kernel, with one row per input feature — TODO confirm.
rnn.model_train.get_weights()[0].shape

In [None]:
# Feature count stored in params. This key is not set in this notebook;
# presumably it is filled in by the project classes — TODO confirm.
params['n_features']

In [None]:
# NOTE(review): the params.update cell sets 'recurrent_units', not 'rnn_units' —
# confirm 'rnn_units' is a key supplied by params.yaml and is the one intended.
params['rnn_units']

In [None]:
# Position of the 'rain' feature within the model's feature list.
params['features_list'].index('rain')

In [None]:
# Row of the first weight array that belongs to the 'rain' feature. The row is
# looked up by name instead of the hard-coded index 2, so it stays correct if
# the order of params['features_list'] changes.
# Assumes rows of the first weight array follow params['features_list'] — TODO confirm.
rnn.model_train.get_weights()[0][params['features_list'].index('rain'), :]

In [None]:
# Mean absolute value along each row of the first weight array.
feature_weights = np.abs(rnn.model_train.get_weights()[0]).mean(axis=1)

In [None]:
# Display the per-row mean absolute weights.
feature_weights

In [None]:
# Print the mean absolute weight next to each feature name.
# Assumes entry i of feature_weights corresponds to feature i of
# params['features_list'] — TODO confirm.
for i, feat in enumerate(params['features_list']):
    print(f"Feature {feat} mean input weight: {feature_weights[i]}")

In [None]:
# Name of the last feature in the RAWS-based dict.
trainr['features_list'][-1]

In [None]:
# Name of the last feature in the HRRR forecast-step-0 dict.
train['features_list'][-1]

In [None]:
# Name of the last feature in the HRRR forecast-step-3 dict.
train3['features_list'][-1]

In [None]:
# RMSE between the last column of the stacked RAWS inputs and the last column
# of the stacked HRRR forecast-step-0 inputs.
# NOTE(review): assumes both dicts stack to the same rows in the same order — confirm.
rmse(np.vstack(trainr['X'])[:, -1], np.vstack(train['X'])[:, -1])

In [None]:
# Root-mean-square magnitude of the last column of the stacked RAWS inputs,
# for scale when reading the RMSE values.
np.sqrt(np.mean(np.vstack(trainr['X'])[:, -1]**2))

In [None]:
# Last-column RMSE against the HRRR forecast-step-1 inputs. The first row of
# every RAWS array is dropped before stacking, presumably to line the RAWS
# rows up with the 1-step-ahead HRRR rows — TODO confirm.
rmse(
    np.vstack([array[1:] for array in trainr["X"]])[:,-1], 
    np.vstack(train1['X'])[:, -1]
)

In [None]:
# Last-column RMSE against the HRRR forecast-step-2 inputs. The first two rows
# of every RAWS array are dropped before stacking, presumably to line the RAWS
# rows up with the 2-step-ahead HRRR rows — TODO confirm.
rmse(
    np.vstack([array[2:] for array in trainr["X"]])[:,-1], 
    np.vstack(train2['X'])[:, -1]
)

In [None]:
# Last-column RMSE against the HRRR forecast-step-3 inputs. The first three
# rows of every RAWS array are dropped before stacking, presumably to line the
# RAWS rows up with the 3-step-ahead HRRR rows — TODO confirm.
rmse(
    np.vstack([array[3:] for array in trainr["X"]])[:,-1], 
    np.vstack(train3['X'])[:, -1]
)

In [None]:
# Feature names of the RAWS-based dict, to identify columns by position.
trainr['features_list']

In [None]:
# Feature names of the HRRR forecast-step-3 dict, to identify columns by position.
train3['features_list']

In [None]:
# RMSE on the third-from-last column between the RAWS inputs (first three rows
# of every array dropped) and the HRRR forecast-step-3 inputs.
# NOTE(review): assumes column -3 is the same feature in both dicts — compare
# the two features_list outputs to confirm.
rmse(
    np.vstack([array[3:] for array in trainr["X"]])[:,-3], 
    np.vstack(train3['X'])[:, -3]
)