# v2.1 run RNN strategy serial by Location

This version of the RNN runs the model on each location separately, one at a time. Two main runs:
1. Run separate model at each location - training and prediction at least location independently - training mode periods 0:train_ind (was 0:h2), then prediction in test_ind:end. Validation data, if any, are from train_ind:test_ind
2. Run same model with multiple fitting calls 0:train_ind at different locations, compare prediction accuracy in test_ind:end  at for all location. 


In [1]:
import numpy as np
from utils import print_dict_summary, print_first, str2time, logging_setup
import pickle
import logging
import os.path as osp
from moisture_rnn_pkl import pkl2train
from moisture_rnn import RNNParams, RNNData, RNN 
from utils import hash2, read_yml, read_pkl, retrieve_url
from moisture_rnn import RNN
import reproducibility
from data_funcs import rmse, to_json
from moisture_models import run_augmented_kf
import copy
import pandas as pd
import matplotlib.pyplot as plt
import yaml

2024-09-20 17:43:51.718358: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
logging_setup()

In [3]:
retrieve_url(
    url = "https://demo.openwfm.org/web/data/fmda/dicts/test_CA_202401.pkl", 
    dest_path = "fmda_nw_202401-05_f05.pkl")

Attempting to downloaded https://demo.openwfm.org/web/data/fmda/dicts/test_CA_202401.pkl to fmda_nw_202401-05_f05.pkl


--2024-09-20 17:43:56--  https://demo.openwfm.org/web/data/fmda/dicts/test_CA_202401.pkl
Resolving demo.openwfm.org (demo.openwfm.org)... 167.99.232.12
Connecting to demo.openwfm.org (demo.openwfm.org)|167.99.232.12|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8551696 (8.2M) [application/octet-stream]
Saving to: ‘fmda_nw_202401-05_f05.pkl’

     0K .......... .......... .......... .......... ..........  0%  850K 10s
    50K .......... .......... .......... .......... ..........  1%  915K 9s
   100K .......... .......... .......... .......... ..........  1% 27.9M 6s
   150K .......... .......... .......... .......... ..........  2%  160M 5s
   200K .......... .......... .......... .......... ..........  2%  918K 6s
   250K .......... .......... .......... .......... ..........  3% 70.9M 5s
   300K .......... .......... .......... .......... ..........  4% 63.1M 4s
   350K .......... .......... .......... .......... ..........  4% 76.5M 3s
   400K .......... 

Successfully downloaded https://demo.openwfm.org/web/data/fmda/dicts/test_CA_202401.pkl to fmda_nw_202401-05_f05.pkl


....... .......... .......... .......... .......... 99%  409K 0s
  8350K .                                                     100% 25.3K=1.9s

2024-09-20 17:43:58 (4.40 MB/s) - ‘fmda_nw_202401-05_f05.pkl’ saved [8551696/8551696]



In [4]:
repro_file = "data/reproducibility_dict_v2_TEST.pkl"
file_names=['fmda_nw_202401-05_f05.pkl']
file_dir='data'
file_paths = [osp.join(file_dir,file_name) for file_name in file_names]

In [5]:
# read/write control
train_file='data/train.pkl'
train_create=True   # if false, read
train_write=True
train_read=True

In [6]:
repro = read_pkl(repro_file)

if train_create:
    logging.info('creating the training cases from files %s',file_paths)
    # osp.join works on windows too, joins paths using \ or /
    train = pkl2train(file_paths)
if train_write:
    with open(train_file, 'wb') as file:
        logging.info('Writing the rain cases into file %s',train_file)
        pickle.dump(train, file)
if train_read:
    logging.info('Reading the train cases from file %s',train_file)
    train = read_pkl(train_file)

loading file data/reproducibility_dict_v2_TEST.pkl
2024-09-20 17:43:58,780 - INFO - creating the training cases from files ['data/fmda_nw_202401-05_f05.pkl']
2024-09-20 17:43:58,782 - INFO - Using data from step f01
2024-09-20 17:43:58,782 - INFO - Using rain as the difference of accumulated precipitation between f01 and f00


FileNotFoundError: [Errno 2] No such file or directory: 'data/fmda_nw_202401-05_f05.pkl'

In [None]:
params_all = read_yml("params.yaml")
print(params_all.keys())

In [None]:
# from module_param_sets import param_sets

In [None]:
param_sets_keys=['rnn']
cases=list(train.keys())[0:50]
# cases=list(train.keys())
# cases.remove('reproducibility')
cases

In [None]:
logging.info('Running over parameter sets %s',param_sets_keys)
logging.info('Running over cases %s',cases)

## Run Reproducibility Case

In [None]:
params = repro['repro_info']['params']
print(type(params))
print(params)

# Set up input data
rnn_dat = RNNData(repro, scaler = params['scaler'], features_list = params['features_list'])
rnn_dat.train_test_split(
    time_fracs = params['time_fracs']
)
rnn_dat.scale_data()
rnn_dat.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])

In [None]:
reproducibility.set_seed(123)
rnn = RNN(params)
m, errs = rnn.run_model(rnn_dat, reproducibility_run=True)

## Separate Models by Location

In [None]:
# Set up output dictionaries
outputs_kf = {}
outputs_rnn = {}

In [None]:

for k in param_sets_keys:
    params = RNNParams(params_all[k])
    print("~"*80)
    print("Running with params:")
    print(params)
    # Increase Val Frac so no errors, TODO fix validation
    params.update({
        'train_frac': .9,
        'val_frac': .05,
        'activation': ['relu', 'relu'],
        'epochs': 10,
        'dense_units': 10,
        'rnn_layers': 2       
    })
    for case in cases:
        print("~"*50)
        logging.info('Processing case %s',case)
        print_dict_summary(train[case])
        # Format data & Run Model
        # rnn_dat = create_rnn_data2(train[case], params)
        rnn_dat = RNNData(train[case], scaler = params['scaler'], features_list = params['features_list'])
        rnn_dat.train_test_split(
            time_fracs = [.9, .05, .05]
        )
        rnn_dat.scale_data()
        rnn_dat.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])
        params.update({'bmax': rnn_dat.hours})
        reproducibility.set_seed()
        rnn = RNN(params)
        m, errs = rnn.run_model(rnn_dat, plot_period="predict")
        # Add model output to case
        train[case]['m_rnn']=m
        # Get RMSE Prediction Error
        print(f"RMSE: {errs}")
        outputs_rnn[case] = {'case':case, 'errs': errs.copy()}
        
        # Run Augmented KF
        print('Running Augmented KF')
        train[case]['h2'] = rnn_dat.test_ind
        train[case]['scale_fm'] = 1
        m, Ec = run_augmented_kf(train[case])
        y = rnn_dat['y']        
        train[case]['m_kf'] = m
        print(f"KF RMSE: {rmse(m[rnn_dat.test_ind:],y[rnn_dat.test_ind:])}")
        outputs_kf[case] = {'case':case, 'errs': rmse(m[rnn_dat.test_ind:],y[rnn_dat.test_ind:])}

        # Save Outputs 
        to_json(outputs_rnn, "rnn_errs.json")
        to_json(outputs_kf, "kf_errs.json")

In [None]:
logging.info('fmda_rnn_serial.ipynb done')

In [None]:
import importlib
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNN

In [None]:
for k in outputs_rnn:
    print("~"*50)
    print(outputs_rnn[k]['case'])
    print(outputs_rnn[k]['errs']['prediction'])

In [None]:
for k in outputs_kf:
    print("~"*50)
    print(outputs_kf[k]['case'])
    print(outputs_kf[k]['errs'])

### TODO: FIX SCALING in Scheme below

Scaling is done separately in each now.

## Training at Multiple Locations

Still sequential

In [None]:
params = RNNParams(params_all['rnn'])
params.update({
    'epochs': 1, # less epochs since it is per location
    'activation': ['relu', 'relu'],
    'train_frac': .9,
    'val_frac': .05,    
    'dense_units': 10,
    'rnn_layers': 2
})

# rnn_dat = create_rnn_data2(train[cases[0]], params)
rnn_dat = RNNData(train[cases[0]], params['scaler'], params['features_list'])
rnn_dat.train_test_split(
    time_fracs = [.9, .05, .05]
)
rnn_dat.scale_data()
rnn_dat.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])

In [None]:
reproducibility.set_seed()
rnn = RNN(params)

In [None]:
print("~"*80)
print("Running with params:")
print(params)

for case in cases[0:10]:
    print("~"*50)
    logging.info('Processing case %s',case)
    print_dict_summary(train[case])
    rnn_dat_temp = RNNData(train[case], params['scaler'], params['features_list'])
    rnn_dat_temp.train_test_split(
        time_fracs = [.9, .05, .05]
    )
    rnn_dat_temp.scale_data()
    rnn_dat_temp.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])
    rnn.fit(rnn_dat_temp['X_train'], rnn_dat_temp['y_train'],
           validation_data=(rnn_dat_temp['X_val'], rnn_dat_temp['y_val']))
    # run_rnn_pkl(train[case],param_sets[i])

### Predict 

In [None]:
# Predict Cases Used in Training
rmses = []
inds = np.arange(0,10)
train_keys = list(train.keys())
for i in inds:
    print("~"*50)
    case = train_keys[i]
    print(f"Predicting case {case}")
    # rnn_dat = create_rnn_data2(train[case], params)
    rnn_dat_temp = RNNData(train[case], params['scaler'], params['features_list'])
    rnn_dat_temp.train_test_split(
        time_fracs = [.9, .05, .05]
    )
    rnn_dat_temp.scale_data()
    rnn_dat_temp.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])
    X_temp = rnn_dat_temp.scale_all_X()
    m = rnn.predict(X_temp)
    test_ind = rnn_dat['test_ind']
    rmses.append(rmse(m[test_ind:], rnn_dat['y_test'].flatten()))

In [None]:
rmses

In [None]:
pd.DataFrame({'Case': list(train.keys())[0:10], 'RMSE': rmses}).style.hide(axis="index")

In [None]:
# Predict New Locations
rmses = []
for i, case in enumerate(list(train.keys())[10:100]):
    print("~"*50)
    print(f"Predicting case {case}")
    rnn_dat_temp = RNNData(train[case], params['scaler'], params['features_list'])
    rnn_dat_temp.train_test_split(
        time_fracs = [.9, .05, .05]
    )
    rnn_dat_temp.scale_data()
    rnn_dat_temp.batch_reshape(timesteps = params['timesteps'], batch_size = params['batch_size'])
    X = rnn_dat_temp.scale_all_X()
    m = rnn.predict(X)
    train[case]['m'] = m
    test_ind = rnn_dat['test_ind']
    rmses.append(rmse(m[test_ind:], rnn_dat.y_test.flatten()))

df = pd.DataFrame({'Case': list(train.keys())[10:100], 'RMSE': rmses})

In [None]:
df[0:5].style.hide(axis="index")

In [None]:
df

In [None]:
df.RMSE.mean()

In [None]:
plt.hist(df.RMSE)