# v2 exploration trying to make it work better

In [None]:
# Environment
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import sys
# Local modules
sys.path.append('..')
import reproducibility
import pandas as pd
from utils import print_dict_summary
from data_funcs import load_and_fix_data, rmse
from moisture_rnn import RNNParams, RNN, RNN_LSTM, create_rnn_data2
from moisture_rnn_pkl import pkl2train
from tensorflow.keras.callbacks import Callback
from sklearn.metrics import mean_squared_error
from utils import hash2
import copy
import logging
import pickle
from utils import logging_setup, read_yml
import yaml
import copy

In [None]:
# Configure logging for this session using the project helper imported from utils.
logging_setup()

## Test Data Creation

In [None]:
# Load the pickled training data; cells below index it by case name
# (e.g. train['PIVC1_202401']).
# NOTE: pickle.load can execute arbitrary code, so only open trusted files.
train_file='train.pkl'
with open(train_file,'rb') as file:
    train=pickle.load(file)

In [None]:
# Read the "rnn" subkey of params.yaml and wrap the result in RNNParams.
params = read_yml("params.yaml", subkey="rnn")
params = RNNParams(params)

In [None]:
import importlib
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import create_rnn_data2

In [None]:
# Build model input data for the PIVC1_202401 case using the current params.
rnn_dat = create_rnn_data2(train['PIVC1_202401'], params)

In [None]:
# Display the shape of the 'y' entry of rnn_dat.
rnn_dat['y'].shape

In [None]:
# Display the keys available in rnn_dat.
rnn_dat.keys()

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Registry mapping a scaler name to a scikit-learn scaler instance.
# RNNData.set_scaler looks the requested name up in this dictionary.
scalers = {
    'minmax': MinMaxScaler(),
    'standard': StandardScaler() 
}

In [None]:
def all_items_exist(source_list, target_list):
    """Return True when every element of source_list is contained in target_list.

    An empty source_list yields True.
    """
    for candidate in source_list:
        if candidate not in target_list:
            return False
    return True

class RNNData(dict):
    """Dictionary of RNN input data whose entries can also be read and written as attributes.

    The dictionary itself is the single source of truth: ``obj.key`` and
    ``obj['key']`` always refer to the same value, so later updates made
    through either syntax stay consistent.

    Required input keys are listed in ``required_keys``. On construction the
    input ``features_list`` is stored as ``all_features_list`` and
    ``features_list`` becomes the subset of features actually used.
    """
    required_keys = {"loc", "time", "X", "y", "features_list"}

    def __init__(self, input_dict, features_list=None, scaler=None):
        """Build the data object from a dictionary.

        Args:
            input_dict: mapping holding at least the keys in ``required_keys``.
            features_list: optional subset of the input feature names to use;
                when None all input features are used.
            scaler: optional scaler name, see ``set_scaler``.

        Raises:
            KeyError: if a required key is missing from ``input_dict``.
            ValueError: if the checks in ``run_checks`` fail or the scaler
                name is not recognized.
        """
        # dict() makes a shallow copy, so adding/removing keys here never
        # mutates the caller's dictionary.
        super().__init__(input_dict)
        # Validate up front so a missing key gives the descriptive message
        # instead of a bare KeyError from the lookups below.
        missing_keys = self.required_keys - self.keys()
        if missing_keys:
            raise KeyError(f"Missing required keys: {missing_keys}")
        self.scaler = None
        if scaler is not None:
            self.set_scaler(scaler)
        self['hours'] = len(self['y'])
        # Keep the full list of input features under a separate name;
        # 'features_list' is re-set just below to the subset in use.
        self['all_features_list'] = self.pop('features_list')
        if features_list is None:
            print("Using all input features.")
            self.features_list = self.all_features_list
        else:
            self.features_list = features_list
        self.run_checks()

    def run_checks(self, verbose=True):
        """Validate the stored data.

        Args:
            verbose: accepted for interface compatibility; currently unused.

        Raises:
            KeyError: if a required key is missing.
            ValueError: if 'y' is not of shape (N,) or (N, 1), if 'hours'
                disagrees with len(y), or if 'features_list' contains names
                that are not in 'all_features_list'.
        """
        missing_keys = self.required_keys - self.keys()
        if missing_keys:
            raise KeyError(f"Missing required keys: {missing_keys}")
        # Check y 1-d
        y_shape = np.shape(self.y)
        if not (len(y_shape) == 1 or (len(y_shape) == 2 and y_shape[1] == 1)):
            raise ValueError(f"'y' must be one-dimensional, with shape (N,) or (N, 1). Current shape is {y_shape}.")

        # Check if 'hours' is provided and matches len(y)
        if 'hours' in self:
            if self.hours != len(self.y):
                raise ValueError(f"Provided 'hours' value {self.hours} does not match the length of 'y', which is {len(self.y)}.")
        # Check desired subset of features is in all input features
        if not all(item in self.all_features_list for item in self.features_list):
            raise ValueError(f"Provided 'features_list' {self.features_list} has elements not in input features.")

    def set_scaler(self, scaler):
        """Attach a scaler selected by name.

        Args:
            scaler: one of 'minmax' or 'standard'.

        Raises:
            ValueError: if the name is not recognized.
        """
        recognized_scalers = ['minmax', 'standard']
        if scaler not in recognized_scalers:
            raise ValueError(f"Unrecognized scaler '{scaler}'. Recognized scalers are: {recognized_scalers}.")
        # Deep-copy the registry entry so that fitting this object's scaler
        # does not silently refit a scaler shared with other objects.
        self.scaler = copy.deepcopy(scalers[scaler])

    def train_test_split(self, train_frac, val_frac=0.0, subset_features=True, features_list=None, split_time=True, split_space=False, verbose=True):
        """Split X and y along the first axis into train / validation / test sets.

        Rows [0, train_ind) become the training set, [train_ind, test_ind) the
        validation set (only when val_frac > 0), and [test_ind, hours) the test
        set. Results are stored as X_train, y_train, X_val, y_val, X_test,
        y_test, together with train_ind and test_ind.

        Args:
            train_frac: fraction of 'hours' used for training.
            val_frac: fraction of 'hours' used for validation; 0 means no
                validation set.
            subset_features: when True, keep only the columns of X named in
                'features_list'.
            features_list: currently unused.
            split_time: currently unused.
            split_space: currently unused.
            verbose: print the split indices and resulting shapes.

        Raises:
            ValueError: if the validation set would be empty although
                val_frac > 0, or if the indices are out of order.
        """
        # Extract data to desired features
        X = self.X.copy()
        y = self.y.copy()
        if subset_features:
            if verbose and self.features_list != self.all_features_list:
                print(f"Subsetting input data to features_list: {self.features_list}")
            # Indices to subset all features with based on params features
            indices = []
            for item in self.features_list:
                if item in self.all_features_list:
                    indices.append(self.all_features_list.index(item))
                else:
                    print(f"Warning: feature name '{item}' not found in list of all features from input data")
            X = X[:, indices]

        # Setup train/test in time
        train_ind = int(np.floor(self.hours * train_frac))
        test_ind = int(train_ind + round(self.hours * val_frac))
        has_val = val_frac > 0

        # Check for any potential issues with indices
        if test_ind > self.hours:
            print(f"Setting test index to {self.hours}")
            test_ind = self.hours
        # Equal indices are fine when no validation set was requested.
        if train_ind > test_ind or (has_val and train_ind == test_ind):
            raise ValueError("Train index must be less than test index.")
        # Store the indices only after clamping so they match the slices below.
        self.train_ind = train_ind
        self.test_ind = test_ind

        # Training data from 0 to train_ind
        self.X_train = X[:train_ind]
        self.y_train = y[:train_ind].reshape(-1,1) # assumes y 1-d, change this if vector output
        # Validation data from train_ind to test_ind
        if has_val:
            self.X_val = X[train_ind:test_ind]
            self.y_val = y[train_ind:test_ind].reshape(-1,1) # assumes y 1-d, change this if vector output
        else:
            # Drop validation data left over from an earlier split so it is
            # not mistaken for part of the current one.
            self.pop('X_val', None)
            self.pop('y_val', None)
        # Test data from test_ind to end
        self.X_test = X[test_ind:]
        self.y_test = y[test_ind:].reshape(-1,1) # assumes y 1-d, change this if vector output

        # Print statements if verbose
        if verbose:
            print(f"Train index: 0 to {train_ind}")
            print(f"Validation index: {train_ind} to {test_ind}")
            print(f"Test index: {test_ind} to {self.hours}")
            print(f"X_train shape: {self.X_train.shape}, y_train shape: {self.y_train.shape}")
            if has_val:
                print(f"X_val shape: {self.X_val.shape}, y_val shape: {self.y_val.shape}")
            print(f"X_test shape: {self.X_test.shape}, y_test shape: {self.y_test.shape}")

    def scale_data(self, verbose=True):
        """Fit the scaler on X_train and transform X_train, X_val (if present) and X_test.

        Raises:
            ValueError: if no scaler has been set.
            AttributeError: if train_test_split has not been run yet.
        """
        if self.scaler is None:
            raise ValueError("Scaler is not set. Use 'set_scaler' method to set a scaler before scaling data.")
        if not hasattr(self, "X_train"):
            raise AttributeError("No X_train within object. Run train_test_split first. This is to avoid fitting the scaler with prediction data.")
        if verbose:
            print(f"Scaling data with scaler {self.scaler}, fitting on X_train")
        # Fit the scaler on the training data
        self.scaler.fit(self.X_train)
        # Transform the data using the fitted scaler
        self.X_train = self.scaler.transform(self.X_train)
        if hasattr(self, 'X_val'):
            self.X_val = self.scaler.transform(self.X_val)
        self.X_test = self.scaler.transform(self.X_test)

    def __getattr__(self, key):
        # Only reached when normal attribute lookup fails: fall back to the
        # dictionary entry of the same name.
        try:
            return self[key]
        except KeyError:
            raise AttributeError(f"'rnn_data' object has no attribute '{key}'") from None

    def __setattr__(self, key, value):
        # Attribute assignment writes straight into the dictionary, so the
        # attribute and item views can never drift apart.
        self[key] = value

In [None]:
# Build an RNNData object for case PIVC1_202401, restricted to three features
# and using the 'minmax' scaler.
d = RNNData(train['PIVC1_202401'], features_list=['Ed', 'Ew', 'rain'], scaler='minmax')

In [None]:
# Display the class of the scaler attached to d.
type(d.scaler)

In [None]:
# On a freshly built object this is called before train_test_split, so
# scale_data raises AttributeError because X_train does not exist yet.
d.scale_data()

In [None]:
# Display the full input feature array stored on d.
d.X

In [None]:
# Split by row order: first 50% of hours for training, next 10% for validation,
# remainder for testing.
d.train_test_split(train_frac = .5, val_frac = .1)

In [None]:
# Fit the scaler on X_train and transform the train, validation and test inputs.
d.scale_data()

## Test RNN

In [None]:
# train.keys()

In [None]:
# Select the case (key of train) used for the model runs below.
# case = [*train.keys()][1]
case = "FCHC1_202401"
print(case)

In [None]:
# Override selected params for this run, then rebuild the model input data
# for the chosen case with the updated params.
params.update({'val_frac': .2, 'scale': True, 'scaler': 'standard', 'epochs': 200})
# params.update({'features_list': ['wind', 'Ed', 'Ew', 'solar', 'rain']})
params.update({'rnn_layers': 3})
rnn_dat = create_rnn_data2(train[case], params)

In [None]:
# Reset the random seed, build an RNN from params and run it on rnn_dat.
# run_model returns a pair, unpacked here as m and errs.
reproducibility.set_seed()
rnn = RNN(params)
m, errs = rnn.run_model(rnn_dat)

In [None]:
# Display the second value returned by run_model.
errs

In [None]:
import importlib 
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNN

In [None]:
# Rebind params from the "rnn" subkey of params.yaml; the params.update(...)
# overrides applied earlier in the notebook are not carried over.
params = read_yml("params.yaml", subkey="rnn")
params = RNNParams(params)

In [None]:
# Re-run with the reloaded RNN class and params.
# NOTE(review): rnn_dat is not rebuilt here, so it is still the data created
# with the earlier, overridden params; confirm this is intended.
reproducibility.set_seed()
rnn = RNN(params)
m, errs = rnn.run_model(rnn_dat)

## LSTM

In [None]:
import importlib 
import moisture_rnn
importlib.reload(moisture_rnn)
from moisture_rnn import RNN_LSTM

In [None]:
# Load the "lstm" section of params.yaml directly with yaml.safe_load.
# NOTE(review): unlike the rnn params above, this is not wrapped in RNNParams;
# confirm create_rnn_data2 accepts whatever safe_load returns here.
with open("params.yaml") as file:
    params = yaml.safe_load(file)["lstm"]
    
rnn_dat2 = create_rnn_data2(train[case],params)

In [None]:
# Shorten training to 10 epochs for this run.
params.update({'epochs': 10})

In [None]:
# Reset the random seed and run a model on the lstm-params data.
# NOTE(review): this sits under the LSTM heading but builds RNN, not RNN_LSTM;
# confirm that is intended.
reproducibility.set_seed()
rnn = RNN(params)
m, errs = rnn.run_model(rnn_dat2)

In [None]:
import importlib
importlib.reload(moisture_rnn)
from moisture_rnn import RNN_LSTM

In [None]:
# Reload the "lstm" section of params.yaml, rebuild the data for the chosen
# case, and display the loaded params.
with open("params.yaml") as file:
    params = yaml.safe_load(file)["lstm"]

rnn_dat2 = create_rnn_data2(train[case],params)
params

In [None]:
# Override the learning rate, epoch count and clipvalue for the LSTM run.
params.update({
    'learning_rate': 0.000001,
    'epochs': 10,
    'clipvalue':1.0
})

In [None]:
# Reset the random seed, build an RNN_LSTM from params and run it on rnn_dat2.
reproducibility.set_seed()
lstm = RNN_LSTM(params)
m, errs = lstm.run_model(rnn_dat2)