In [1]:
import numpy as np
from utils import print_dict_summary, print_first, str2time, logging_setup
import pickle
import logging
import os.path as osp
from moisture_rnn_pkl import pkl2train
from moisture_rnn import create_rnn_data2 
from utils import hash2
from moisture_rnn import RNN
import reproducibility
from data_funcs import rmse



# RNN Sequential by Location

This version of the RNN runs the model on each location separately, one at a time. There are two main runs:
1. Run a separate model at each location.
2. Run the same model with multiple fitting calls at different locations, then compare prediction accuracy across all of them at the end.

In [2]:
# Configure logging through the project's `utils.logging_setup` helper before
# any of the logging.info calls in the cells that follow.
logging_setup()

In [3]:
# Directory holding the input pickles, and the three files to load from it.
file_dir = 'data'
file_names = [
    "reproducibility_dict2.pickle",
    'test_NW_202401.pkl',
    'test_CA_202401.pkl',
]
# osp.join uses the platform's separator, so the paths are valid on Windows too.
file_paths = list(map(lambda name: osp.join(file_dir, name), file_names))

In [4]:
# Read/write control for the training-case dictionary.
train_file = 'train.pkl'   # pickle file the cases are written to / read from
train_create = True        # build the cases from the input files; if false, rely on reading
train_write = True         # dump the cases to train_file
train_read = True          # load the cases from train_file

In [5]:
# Build, persist and/or reload the dictionary of training cases, as selected by
# the train_create / train_write / train_read flags.
if train_create:
    logging.info('creating the training cases from files %s',file_paths)
    train = pkl2train(file_paths)
if train_write:
    if train_create:
        with open(train_file, 'wb') as file:
            logging.info('Writing the train cases into file %s',train_file)
            pickle.dump(train, file)
    else:
        # Nothing was built in this run, so there is no fresh `train` to dump;
        # on a fresh kernel the dump would fail with a NameError.
        logging.warning('train_write is set but train_create is not; not writing %s',train_file)
if train_read:
    logging.info('Reading the train cases from file %s',train_file)
    # NOTE: pickle.load can execute arbitrary code; only read files you trust.
    with open(train_file,'rb') as file:
        train=pickle.load(file)

2024-06-18 11:36:47,993 - INFO - creating the training cases from files ['data\\reproducibility_dict2.pickle', 'data\\test_NW_202401.pkl', 'data\\test_CA_202401.pkl']
2024-06-18 11:36:47,994 - INFO - Using data from step f01
2024-06-18 11:36:47,995 - INFO - Using rain as the difference of accumulated precipitation between f01 and f00
2024-06-18 11:36:48,013 - INFO - loading file data\reproducibility_dict2.pickle
2024-06-18 11:36:48,021 - INFO - Processing subdictionary reproducibility
2024-06-18 11:36:48,033 - INFO - reproducibility RAWS.time time array increments are 1.0 hours
2024-06-18 11:36:48,033 - INFO - RAWS increment is 1.0 h
2024-06-18 11:36:48,034 - INFO - Created feature matrix train[reproducibility]['X'] shape (854, 3)
2024-06-18 11:36:48,045 - INFO - reproducibility RAWS.time_raws length is 854
2024-06-18 11:36:48,046 - INFO - reproducibility RAWS.time_raws time array increments are 1.0 hours
2024-06-18 11:36:48,047 - INFO - reproducibility RAWS.fm length is 854
2024-06-18

In [6]:
import yaml

# Load every named parameter set from the YAML configuration file.
with open("params.yaml") as params_file:
    params_all = yaml.safe_load(params_file)
# Show which parameter sets are available.
params_all.keys()

dict_keys(['rnn', 'lstm', 'physics_initializer', 'rnn_repro'])

In [7]:
# from module_param_sets import param_sets

In [None]:
# Parameter sets (keys of params_all) to run, in this order.
param_sets_keys=['rnn_repro', 'rnn']
# To run a single case instead: cases=[list(train.keys())[0]]
# Take the first ten keys of train and leave out the 'reproducibility' case,
# which is run on its own with the 'rnn_repro' parameters. Filtering (rather
# than list.remove) does not raise if that key is not among the first ten.
cases = [name for name in list(train.keys())[0:10] if name != 'reproducibility']
cases

In [None]:
# Record which parameter sets and which cases were selected for this run.
logging.info('Running over parameter sets %s',param_sets_keys)
logging.info('Running over cases %s',cases)

## Separate Models by Location

In [None]:
# For each parameter set, build and evaluate a new RNN per case: the
# 'rnn_repro' set runs only the 'reproducibility' case, every other set runs
# each entry of `cases`.
for k in param_sets_keys:
    params = params_all[k]
    print("~" * 80)
    print("Running with params:")
    print(params)
    if k != "rnn_repro":
        for case in cases:
            # Increase Val Frac so no errors, TODO fix validation
            # (update() changes the dict held in params_all[k] in place, so the
            # new fractions stay in effect for later cells that read it.)
            params.update({'train_frac': .5, 'val_frac': .2})
            print("~" * 50)
            logging.info('Processing case %s', case)
            print_dict_summary(train[case])
            rnn_dat = create_rnn_data2(train[case], params)
            # Seed immediately before constructing the model; presumably this
            # makes the initialisation repeatable - confirm in reproducibility.
            reproducibility.set_seed()
            rnn = RNN(params)
            errs = rnn.run_model(rnn_dat)
            print(f"Mode RMSE: {errs}")
        continue
    # Reproducibility run: a single fixed case with its own parameter set.
    case = 'reproducibility'
    logging.info('Running reproducibility case')
    rnn_dat = create_rnn_data2(train[case], params)
    reproducibility.set_seed()
    rnn = RNN(params)
    errs = rnn.run_model(rnn_dat)
    print(f"Mode RMSE: {errs}")

In [None]:
# NOTE(review): this message names 'test-plk2train.ipynb' and is logged before
# the multi-location section below has run - it looks carried over from another
# notebook; confirm the intended text and position.
logging.info('test-plk2train.ipynb done')

In [None]:
# Reload moisture_rnn so edits to the module take effect without restarting the
# kernel, then rebind RNN to the reloaded class. Only RNN is re-imported here;
# create_rnn_data2 imported at the top of the notebook is not rebound.
import importlib
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNN

## Training at Multiple Locations

Still sequential: a single model is fit on one location after another.

In [None]:
# Use the 'rnn' parameter set with the epoch count set to 20. `params` is the
# same dict object as params_all['rnn'], so the change is visible there too.
params = params_all['rnn']
params['epochs'] = 20
# Build the data dictionary for the 'reproducibility' case with these parameters.
rnn_dat = create_rnn_data2(train['reproducibility'], params)

In [None]:
# Seed first, then build the one RNN instance that the fitting loop in the next
# cell keeps calling fit() on.
reproducibility.set_seed()
rnn = RNN(params)

In [None]:
# Fit the single shared `rnn` on one case after another.
for k in param_sets_keys:
    # The reproducibility parameter set is not run in this section. Skip it
    # before anything is printed, so the output lists only the sets that are
    # actually used and `params` is never rebound to a skipped set.
    if k == "rnn_repro":
        continue
    params = params_all[k]
    print("~"*80)
    print("Running with params:")
    print(params)
    for case in cases:
        # Increase Val Frac so no errors, TODO fix validation
        params.update({
            'train_frac': .5,
            'val_frac': .2
        })
        print("~"*50)
        logging.info('Processing case %s',case)
        print_dict_summary(train[case])
        rnn_dat = create_rnn_data2(train[case], params)
        # fit() is called on the existing model object rather than a new one;
        # presumably training carries over from case to case - confirm in RNN.fit.
        rnn.fit(rnn_dat['X_train'], rnn_dat['y_train'],
               validation_data=(rnn_dat['X_val'], rnn_dat['y_val']))

### Predict on all locations in dictionary

In [None]:
# Score the sequentially fitted model on the test portion of every entry in
# `train` (all keys, not only the ones in `cases`).
errs = np.zeros(len(train))
for i, case in enumerate(train):
    print("~" * 50)
    print(f"Predicting case {case}")
    rnn_dat = create_rnn_data2(train[case], params)
    # Predict over the full feature array, then keep rows from test_ind onward.
    m = rnn.predict(rnn_dat["X"])
    test_ind = rnn_dat['test_ind']
    # NOTE(review): y_test is flattened but the prediction slice is not; confirm
    # that m[test_ind:] has a shape rmse can compare against a 1-D array.
    errs[i] = rmse(m[test_ind:], rnn_dat['y_test'].flatten())

In [None]:
# Per-case test RMSE, in the iteration order of `train`.
errs

In [None]:
# Mean of the per-case RMSE values.
errs.mean()