# v2.1 exploration trying to make it work better

In [None]:
# Environment
import os
import os.path as osp
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import sys
# Local modules
sys.path.append('..')
import reproducibility
import pandas as pd
from utils import print_dict_summary
from data_funcs import rmse
from moisture_rnn import RNNParams, RNNData, RNN, RNN_LSTM, create_rnn_data2
from moisture_rnn_pkl import pkl2train
from tensorflow.keras.callbacks import Callback
from utils import hash2
import copy
import logging
import pickle
from utils import logging_setup, read_yml, read_pkl, hash_ndarray, hash_weights
import yaml
import copy

In [None]:
# Call the project's logging helper (imported from utils) once, before any
# other cell runs. Presumably this configures log handlers/levels -- see
# utils.logging_setup to confirm.
logging_setup()

## Test Batch Size

In [None]:
# Load the pickled training data from the current working directory.
train = read_pkl('train.pkl')
# Display the top-level keys; a later cell indexes this object with
# 'PLFI1_202401'.
train.keys()

In [None]:
# Reload moisture_rnn so that edits made to the module on disk are picked up
# without restarting the kernel, then re-bind the names imported from it
# (names imported before the reload would still point at the old objects).
import importlib
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNN, RNNData

In [None]:
# Read the "rnn" section of params.yaml, wrap it in RNNParams, and override
# the batch size for this experiment.
params = RNNParams(read_yml("params.yaml", subkey="rnn"))
params.update({'batch_size': 7})

# Build the data object for a single case, then split, scale and reshape it
# into batches using the settings held in params.
rnn_dat = RNNData(
    train['PLFI1_202401'],
    scaler=params['scaler'],
    features_list=params['features_list'],
)
rnn_dat.train_test_split(train_frac=.9, val_frac=.05)
rnn_dat.scale_data()
rnn_dat.batch_reshape(
    timesteps=params['timesteps'],
    batch_size=params['batch_size'],
)

In [None]:
# Override the number of training epochs for the run in the next cell.
params.update({'epochs': 10})

In [None]:
# Call the project's seeding helper before building the model (presumably so
# that weight initialisation is repeatable -- see reproducibility.set_seed).
reproducibility.set_seed()
# Build the model from the current params and run it on the prepared data.
rnn = RNN(params)
# run_model returns two values, kept here as `m` and `errs`.
m, errs = rnn.run_model(rnn_dat)

In [None]:
# Show the summary of the training model held by the RNN wrapper.
rnn.model_train.summary()

## Test Spatial Data

In [None]:
# Reload the pickled training data so this section can be run independently
# of the batch-size section above.
train = read_pkl('train.pkl')

In [None]:
from itertools import islice

# Keep only the first three (key, value) entries of `train`, in iteration
# order, as a small working subset.
dat = dict(islice(train.items(), 3))

In [None]:
# Reload the two local modules under development so on-disk edits take effect
# without restarting the kernel, and re-import the names used below from the
# freshly reloaded modules.
import importlib
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNNData
import data_funcs
importlib.reload(data_funcs)
from data_funcs import combine_nested

In [None]:
# Pass the three-case subset through data_funcs.combine_nested; presumably
# this merges the per-case dictionaries into one -- confirm in data_funcs.
dd = combine_nested(dat)

In [None]:
# Display the keys of the combined structure.
dd.keys()

In [None]:
# d = RNNData(dat)

In [None]:
def batch_setup(x, batch_size):
    """Split ``x`` into consecutive batches of exactly ``batch_size`` items.

    Batches are taken in order from the start of ``x``. If the final batch is
    short, it is topped up with items taken from the start of ``x`` again,
    repeating as often as needed, so every returned batch has the same length.

    Args:
        x: Sequence or array-like of items; converted with ``np.array``.
        batch_size: Positive integer number of items per batch.

    Returns:
        A list of lists, each of length ``batch_size``. Elements are the
        items of ``np.array(x)``. An empty ``x`` yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is less than 1. Previously a negative
            value silently produced an empty result, and zero surfaced as an
            unrelated ``range()`` error.
    """
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    # Ensure x is a numpy array so slicing behaves uniformly for any input
    x = np.array(x)

    batches = []
    for i in range(0, len(x), batch_size):
        batch = list(x[i:i + batch_size])

        # Only the last batch can be short. Refill it from the start of x;
        # the loop repeats when x itself is shorter than the shortfall.
        while len(batch) < batch_size:
            batch.extend(x[:batch_size - len(batch)])

        batches.append(batch)

    return batches

In [None]:
# Start again from the "rnn" section of params.yaml, wrapped in RNNParams,
# and display the resulting parameters.
params = RNNParams(read_yml("params.yaml", subkey="rnn"))
params

In [None]:
# Use a batch size of 2 for the spatial test; with the three locations kept
# in `dat` this makes the last location batch wrap around.
params.update({'batch_size': 2})

In [None]:
# Location names in dictionary order, and an integer id for each of them.
loc_names = list(dat)
loc_ids = np.arange(len(loc_names))
# Start offset (index into each location's time axis) for each slot.
start_times = [0, 1, 2]
# Group ids and start offsets into equally sized batches.
loc_batches = batch_setup(loc_ids, params['batch_size'])
t_batch = batch_setup(start_times, params['batch_size'])

In [None]:
# Inspect the batched location ids and the matching start offsets.
print(loc_batches)
print(t_batch)

In [None]:
from moisture_rnn import staircase_2

# Build one set of training sequences for every (location, start offset) slot
# in the batches. Results are appended in batch order, then slot order.
Xs = []
ys = []
hours = 100 # number of timesteps to use in training data
# Iterate over the batches that batch_setup actually produced. Their count is
# ceil(n_locations / batch_size), which in general is not batch_size, so the
# bound has to come from loc_batches rather than from params["batch_size"].
for i in range(len(loc_batches)):
    locs = loc_batches[i]
    ts = t_batch[i]
    # batch_setup pads both lists to batch_size, so they pair up one-to-one.
    for loc_id, t0 in zip(locs, ts):
        loc = loc_names[loc_id]
        tend = t0 + hours
        # Create RNNData Dict
        # dat_temp = RNNData(dat[loc], scaler = params['scaler'], features_list = params['features_list'])
        # Subset data to given location and time from t0 to t0+hours
        X_temp = dat[loc]['X'][t0:tend,:]
        y_temp = dat[loc]['y'][t0:tend].reshape(-1,1)
        # Subset Features: keep the columns named in params['features_list'],
        # in that order, skipping names this location does not provide.
        loc_features = dat[loc]['features_list']
        indices = [
            loc_features.index(item)
            for item in params['features_list']
            if item in loc_features
        ]
        X_temp = X_temp[:, indices]
        # Format sequences
        Xi, yi = staircase_2(
            X_temp,
            y_temp,
            timesteps = params['timesteps'],
            batch_size = 1,  # note: using 1 here to format sequences for a single location, not same as target batch size for training data
            verbose=False)

        Xs.append(Xi)
        ys.append(yi)

In [None]:
# Sequence counts differ between slots; cut every slot down to the shortest
# one so they can be stacked into arrays of a common shape.
lens = [seq.shape[0] for seq in ys]
min_shape = min(lens)
Xs, ys = (
    [seq[:min_shape] for seq in Xs],
    [seq[:min_shape] for seq in ys],
)

In [None]:
# Interleave the per-location sequences so that each consecutive group of
# batch_size rows holds one sequence from every location in the batch.
XXs = []
yys = []
for locs in loc_batches:
    # The timesteps dimension comes from params['timesteps'], the same value
    # the sequences were built with, rather than a hard-coded 5.
    XXi = np.empty((Xs[0].shape[0]*params['batch_size'], params['timesteps'], params['n_features']))
    yyi = np.empty((Xs[0].shape[0]*params['batch_size'], 1))
    for j, loc_id in enumerate(locs):
        # Slot j of the batch fills rows j, j + batch_size, j + 2*batch_size, ...
        # NOTE(review): Xs/ys are indexed by location id here, but they were
        # appended in (batch, slot) order. The two agree only while a wrapped
        # location reuses the same start offset -- confirm this is intended.
        XXi[j::(params['batch_size'])] =  Xs[loc_id]
        yyi[j::(params['batch_size'])] =  ys[loc_id]
    XXs.append(XXi)
    yys.append(yyi)

In [None]:
# TODO: CONFIRM THIS Y
yy = np.concatenate(yys, axis=0)
XX = np.concatenate(XXs, axis=0)
print(XX.shape)
print(yy.shape)

In [None]:
# MinMaxScaler is imported but only StandardScaler is used in this cell.
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = StandardScaler()
# The scaler works on 2-D input, so flatten every leading axis into rows
# (keeping the last axis as columns), fit/transform, then restore the
# original shape. Only XX is scaled here; yy is passed to fit() unscaled.
X = scaler.fit_transform(XX.reshape(-1, XX.shape[-1])).reshape(XX.shape)

In [None]:
# Override the training and architecture settings for the spatial test, then
# seed and build the model from the updated params.
params.update({'epochs': 50, 'verbose_fit': True, 'rnn_layers': 2, 'dense_layers': 2, 'dense_units': 10,
              'activation': ['tanh', 'tanh']})
reproducibility.set_seed(123)
rnn = RNN(params)

In [None]:
# Fit the underlying training model directly on the hand-built arrays.
# NOTE(review): epochs is hard-coded to 100 here, while params['epochs'] was
# set to 50 in the previous cell -- confirm which value is intended.
rnn.model_train.fit(X, yy, batch_size = params['batch_size'], epochs=100)

In [None]:
# Display the `loss` attribute of the training model.
rnn.model_train.loss

In [None]:
# Square root of a hard-coded 16.75 -- presumably a loss value copied by hand
# from the training output, converted from MSE to RMSE. TODO confirm and
# compute it from the fit history instead of a literal.
np.sqrt(16.75)

In [None]:
# Reload moisture_rnn_pkl so on-disk edits take effect, then re-import
# pkl2train from the reloaded module.
import importlib
import moisture_rnn_pkl
importlib.reload(moisture_rnn_pkl)
from moisture_rnn_pkl import pkl2train

In [None]:
# Input pickle files, and the directory (relative to the working directory)
# that holds them.
file_dir = 'data'
file_names = ['test_CA_202401.pkl', 'test_NW_202401.pkl']
# Join the directory onto each file name.
file_paths = [osp.join(file_dir, name) for name in file_names]

In [None]:
# Build the training data from the listed pickle files; this replaces the
# `train` object loaded from train.pkl earlier in the notebook.
train = pkl2train(file_paths)

In [None]:
# Display the keys of the freshly built training data.
train.keys()

## LSTM

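TODO: Fix the cells below. They index `train[case]`, but `case` is not defined in any cell of this notebook, and the first run builds `RNN` rather than `RNN_LSTM` from the "lstm" parameters.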

In [None]:
# Reload moisture_rnn and re-import the LSTM model class from it.
import importlib 
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNN_LSTM

In [None]:
# Read the "lstm" section of params.yaml directly with yaml.safe_load. Unlike
# the cells above, the result is not wrapped in RNNParams here.
with open("params.yaml") as file:
    params = yaml.safe_load(file)["lstm"]
    
# NOTE(review): `case` is not assigned in any cell visible in this notebook,
# so this line raises NameError unless it is defined elsewhere.
rnn_dat2 = create_rnn_data2(train[case],params)

In [None]:
# Override the number of training epochs for the run in the next cell.
params.update({'epochs': 10})

In [None]:
# Seed, build and run a model on the data prepared above.
reproducibility.set_seed()
# NOTE(review): this builds RNN, not RNN_LSTM, although params was read from
# the "lstm" section; a later cell uses RNN_LSTM -- confirm which is meant.
rnn = RNN(params)
m, errs = rnn.run_model(rnn_dat2)

In [None]:
# Reload moisture_rnn again and re-import RNN_LSTM. This cell does not import
# moisture_rnn itself, so it relies on an earlier cell having done so.
import importlib
importlib.reload(moisture_rnn)
from moisture_rnn import RNN_LSTM

In [None]:
# Re-read the "lstm" section of params.yaml, discarding the overrides made to
# params in earlier cells.
with open("params.yaml") as file:
    params = yaml.safe_load(file)["lstm"]

# NOTE(review): `case` is not assigned in any cell visible in this notebook,
# so this line raises NameError unless it is defined elsewhere.
rnn_dat2 = create_rnn_data2(train[case],params)
# Display the loaded parameters.
params

In [None]:
# Override the optimiser and training settings for the LSTM run: a very small
# learning rate, 10 epochs, and a 'clipvalue' of 1.0 (presumably gradient
# clipping passed to the optimiser -- confirm in RNN_LSTM).
params.update({
    'learning_rate': 0.000001,
    'epochs': 10,
    'clipvalue':1.0
})

In [None]:
# Seed, then build the LSTM model from the overridden params and run it on
# the data prepared above; run_model returns two values, kept as m and errs.
reproducibility.set_seed()
lstm = RNN_LSTM(params)
m, errs = lstm.run_model(rnn_dat2)