# LSTM model for observation operator

## Load in the required data

In [None]:
import os
from pathlib import Path
pad = Path(os.getcwd())
if pad.name == "ml_observation_operator":
    pad_correct = pad.parent
    os.chdir(pad_correct)
%run "ml_observation_operator/data_load_in.py"

## Load in used packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import hvplot
import hvplot.pandas
import hvplot.xarray
import pickle
import dask
import itertools
import tensorflow as tf
import random as python_random
from tensorflow import keras
from keras import models, layers
from functions.plotting_functions import ensemble_plot #plot_tf_history,
#from functions.pre_processing import reshape_data, reshaped_to_train_test
from functions.ml_utils import (general_tensorflow_model, validation_loop,
                                full_training_loop)
# from dask.distributed import Client
# client = Client(n_workers = 4, threads_per_worker = 2)
# display(client)
# Guarantee reproducibility: seed every random number generator that is used
# (Python's `random`, NumPy and TensorFlow).
SEED =1234
#os.environ['PYTHONHASHSEED'] = str(SEED)
#os.environ['TF_DETERMINISTIC_OPS'] = '1'
python_random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Switch passed to the training helpers and used to guard the hyperparameter
# search. NOTE(review): with False the helpers presumably reuse results stored
# on disk instead of retraining -- confirm in functions.ml_utils.
exec_training = False

print(os.getcwd())
# Automatically reload edited project modules before executing a cell.
%load_ext autoreload 
%autoreload 2 

Check the distance between the days: give the number of days since the previous observation as an extra feature! This will be used in LSTM training (cf. Obsidian notes).
The distance to the previous observation (and not to the next one) was chosen because, when predicting $y_t$ with $x_t$ as last input, the distance from $t-1$ to $t$ is then used and not the distance from $t$ to $t+1$.

This is performed in the `data_load_in.py` script.

In [None]:
# Directory holding the pickled feature/target sets.
# NOTE: unpickling executes arbitrary code; only load files created by this project.
ML_data_pad = Path("data/Zwalm_data/ML_data")

# "*_all" feature sets (loaded for completeness; not used further in this notebook)
X_train_all = pd.read_pickle(ML_data_pad/"X_train_all.pkl")
X_test_all = pd.read_pickle(ML_data_pad/"X_test_all.pkl")
X_full_all = pd.read_pickle(ML_data_pad/"X_full_all.pkl")

# Default feature set (called 'large' in the hyperparameter search)
X_train = pd.read_pickle(ML_data_pad/"X_train.pkl")
X_test = pd.read_pickle(ML_data_pad/"X_test.pkl")
X_full = pd.read_pickle(ML_data_pad/"X_full.pkl")
display(X_full.head())

# Reduced feature set (called 'small' in the hyperparameter search)
X_train_small = pd.read_pickle(ML_data_pad/"X_train_small.pkl")
X_test_small = pd.read_pickle(ML_data_pad/"X_test_small.pkl")
X_full_small = pd.read_pickle(ML_data_pad/"X_full_small.pkl")
display(X_full_small.head())

# Targets
y_train = pd.read_pickle(ML_data_pad/"y_train.pkl")
y_test = pd.read_pickle(ML_data_pad/"y_test.pkl")
y_full = pd.read_pickle(ML_data_pad/"y_full.pkl")

Cstar = pd.read_pickle(ML_data_pad/"Cstar.pkl")
display(Cstar.head())

# Inspect the spacing between consecutive observations (the 'delta_t' feature)
display(X_full['delta_t'].hvplot())
average_deltat = X_full['delta_t'].mean()
# Fix: the message contained a stray '$' (left-over LaTeX) after "delta t".
print(f'Average delta t between observations: {average_deltat} days')

Not really regular timeseries...

https://stats.stackexchange.com/questions/312609/rnn-for-irregular-time-intervals

Note that this revisit time of 3 days is to be expected according to https://sentinels.copernicus.eu/web/sentinel/user-guides/sentinel-1-sar/revisit-and-coverage: *A single SENTINEL-1 satellite is potentially able to map the global landmasses in the Interferometric Wide swath mode once every 12 days, in a single pass (ascending or descending). The two-satellite constellation offers a 6 day exact repeat cycle at the equator. Since the orbit track spacing varies with latitude, the revisit rate is significantly greater at higher latitudes than at the equator.*

Full set of possible features: Forest, Pasture, Agriculture and a combination of pasture and agriculture

Normalising and reshaping of the data is executed in the `general_tensorflow_model` function

## Initial LSTM model attempt
A very simple LSTM model: size of hidden vector state = 10
followed by dense layer (with no dropout between)
Sequence length of 100 as first attempt. At an average of 3.604 days between observations this is about a year of data! (analogous to the idea of Kratzert 2018).

Trained 8 times, average validation and training R2 determined. Then trained on all data for optimal number of epochs (max 100 epochs) based on validation data. 

In [None]:
# Architecture: a single LSTM layer followed by one linear output unit.
seq_length = 100
n_hidden_units = 10
n_features = X_train.shape[1]
initial_layers = [
    layers.Input(shape=(seq_length, n_features)),
    layers.LSTM(n_hidden_units),
    layers.Dense(1, activation='linear'),
]
lstm_model = models.Sequential(initial_layers)
lstm_model.summary()

# Directory handed to the training helpers for later storage
pad = Path('data/ml_obs_op_data/lstm/lstm_intial')
if not pad.exists():
    pad.mkdir(parents=True)


In [None]:
# Repeat the training `repeats` times with 20 % of the training data held out,
# to obtain the mean validation R2 and a recommended number of epochs.
repeats, epochs = 8, 100
mean_r2_val, recommended_nr_epochs = validation_loop(
    lstm_model, repeats, X_train, X_test, y_train, y_test, Cstar,
    exec_training, pad,
    epochs=epochs,
    print_output=False,
    verbose=0,
    validation_split=0.2,
    lstm=True,
    seq_length=seq_length,
)

Now repeat the code for 8 iterations and recommended number of epochs on the full set!

In [None]:
# Train `repeats` times on the complete training set (no validation split)
# for the recommended number of epochs.
mean_r2_train, mean_r2_test, out_dict, models_list = full_training_loop(
    lstm_model, repeats, X_train, X_test, y_train, y_test, Cstar,
    exec_training, recommended_nr_epochs, pad,
    print_output=False,
    verbose=0,
    validation_split=0.0,
    lstm=True,
    seq_length=seq_length,
)

In [None]:
# List the entries returned by the full training loop.
out_dict.keys()

So the model is clearly overfitting

In [None]:
# Reference C* series together with the predictions on the train and test periods.
fig, ax = plt.subplots(figsize=(9, 7))
Cstar[X_full.index].plot(ax=ax)
prediction_series = (
    ('t_train', 'y_train_hat', 'Train', '-'),
    ('t_test', 'y_test_hat', 'Test', ':'),
)
for time_key, value_key, series_label, line_style in prediction_series:
    ax.plot(out_dict[time_key], out_dict[value_key],
            label=series_label, alpha=0.5, linestyle=line_style)
ax.legend()
ax.set_ylabel('C* [mm]')

Make an ensemble figure

In [None]:
# Ensemble figure of the repeated trainings; n_std = 1 is passed on to the
# plotting helper (presumably the width of the spread band -- TODO confirm).
ensemble_plot(Cstar, X_full.index, out_dict, n_std= 1)

Test if the saved outputs and the output of the saved model are equivalent

In [None]:
# Show the models returned by the full training loop.
models_list

In [None]:
# Re-evaluate one stored model (training=False) and compare its training-period
# output with the predictions saved in out_dict.
model_nr = 6
test_model = models_list[model_nr]
test_model.summary()
result = general_tensorflow_model(
    test_model,
    X_train.values, X_test.values, y_train.values, y_test.values,
    X_train.index, X_test.index, Cstar,
    lstm=True,
    seq_length=seq_length,
    epochs=recommended_nr_epochs,
    print_output=False,
    verbose=0,
    validation_split=0.0,
    training=False,
)
stored_train_hat = out_dict['y_train_hat'][:, model_nr]
plt.plot(result['t_train'], result['y_train_hat'])
plt.plot(out_dict['t_train'], stored_train_hat)
# check equivalence with the earlier calculated output
print(np.allclose(stored_train_hat.flatten(), result['y_train_hat'].flatten()))
plot_pd = pd.DataFrame({
    'y_train_hat_retrieved_model': result['y_train_hat'].flatten(),
    'y_train_hat_OG': stored_train_hat.flatten(),
    't': result['t_train'].flatten(),
}).set_index('t')
display(plot_pd.hvplot())

## Model of Kratzert 2018

Number of internal units reduced from 20 to 10 since twice the number of features is used here (10 instead of 5). Default 20% validation dataset. 365 days is about 100 timesteps for this model

In [None]:
# Two stacked LSTM layers, each followed by 10 % dropout, and a linear output unit.
# n_features is reused from the cell that defined the initial model.
seq_length = 100
print(seq_length)
n_hidden_units = 10
kratzert_layers = [
    layers.Input((seq_length, n_features)),
    layers.LSTM(units=n_hidden_units, name='lstm_1', return_sequences=True),
    layers.Dropout(rate=0.1),
    layers.LSTM(units=n_hidden_units, name='lstm_2'),
    layers.Dropout(rate=0.1),
    layers.Dense(units=1, activation='linear'),
]
kratzert_lstm = models.Sequential(kratzert_layers)
kratzert_lstm.summary()

# Directory handed to the training helpers for later storage
pad = Path('data/ml_obs_op_data/lstm/lstm_kratzert')
if not pad.exists():
    pad.mkdir(parents=True)

In [None]:
# Repeated training with a 20 % validation split to find the number of epochs.
repeats, epochs = 8, 100
mean_r2_val, recommended_nr_epochs = validation_loop(
    kratzert_lstm, repeats, X_train, X_test, y_train, y_test, Cstar,
    exec_training, pad,
    epochs=epochs,
    print_output=False,
    verbose=0,
    validation_split=0.2,
    lstm=True,
    seq_length=seq_length,
)

And again repeat on full dataset with optimal number of epochs

In [None]:
# Repeated training on the complete training set for the recommended number of epochs.
mean_r2_train, mean_r2_test, out_dict, models_list = full_training_loop(
    kratzert_lstm, repeats, X_train, X_test, y_train, y_test, Cstar,
    exec_training, recommended_nr_epochs, pad,
    print_output=False,
    verbose=0,
    validation_split=0.0,
    lstm=True,
    seq_length=seq_length,
)

In [None]:
# Report the mean scores of the repeated trainings and show the ensemble.
# Fix: the first message was misspelled ("Aveage").
print(f'Average R2 on training set: {mean_r2_train}')
print(f'Average R2 on test set: {mean_r2_test}')
ensemble_plot(Cstar, X_full.index, out_dict, n_std = 1)

So clearly an even more overfitted version! Indicates the seeming importance of regularisation or reducing the amount of parameters to be fitted

# Hyperparameter tuning

New idea: to limit the computational burden, validate every model on the same fixed number of validation samples (`n_val` below)!

- 1 layer of LSTM: deemed best by Kratzert (large sample hydrology), prevent overfitting 
- dropout between LSTM and last dense layer (analogous to Kratzert): 0, 0.2 or 0.4
- keep recurrent dropout to 0 
- hidden unit range:4,8,12,16
- choose between large and small dataset!
- keep or drop: forest feature, time feature
- keep default learning rate of 1e-3 + experiment with 1e-2

In [None]:
#dask.config.set({'distributed.scheduler.default-task-durations': '1h'})
# Directory in which the hyperparameter search results are stored
pad = Path('data/ml_obs_op_data/lstm/hyperparam_tuning')
os.makedirs(pad, exist_ok=True)

# When True, continue from the results table of an earlier (partial) run.
append = True

max_epochs = 100 #max number to try
n_val = 100 #irrespective of input sequence length, keep the same last 100 sequences as test data!
repeats = 4 #number of times to repeat training (idea of limiting computational load)
n_train = X_train.shape[0]

# Search space
range_forest = [True, False]
range_time = [True, False]
range_size = ['large','small']
range_hidden_unit = np.arange(4,18,4)
range_input_seq = np.array([30,60,90])
range_learning_rate = [1e-3] #as in Kratzert
range_dropout_rate = [0,0.2,0.4] #dropout on the last output cell
max_nr_options = (
    len(range_time) * len(range_forest) * len(range_input_seq)
    * len(range_learning_rate) * len(range_hidden_unit)
    * len(range_dropout_rate) * len(range_size)
)
print(f"Total Number of combinations {max_nr_options}")

# Results table: one row per evaluated combination
col_names = ['data_in_size','seq_length','forest_bool','time_bool','hidden_units','learning_rate',
             'drop_out_rate','R2_val','recommended_nr_epochs']
results_csv = pad/'lstm_hyperparam_validation.csv'
# Fix: with append = True the cell used to crash on a first run because the CSV
# did not exist yet; fall back to an empty table in that case.
if append and results_csv.exists():
    pd_hyperparam_val = pd.read_csv(results_csv)
else:
    pd_hyperparam_val = pd.DataFrame(columns=col_names, index = range(0,max_nr_options))

# Row counter of the search loop (name kept because the loop and
# lstm_repeating use it; note that it shadows the builtin `iter`).
iter = 0
# itertools.product returns single-use iterators: each can be looped over once.
input_full_combos= itertools.product(range_input_seq, range_forest, range_time)
input_small_combos = itertools.product(range_input_seq, range_time)
network_combos = itertools.product(range_hidden_unit, range_learning_rate, range_dropout_rate)

#the recurring loop
def lstm_repeating(X_temp_train, X_temp_test, network_combo, seq_length, n_features, iter):
    """Train one hyperparameter combination `repeats` times and store the result.

    A fresh one-layer LSTM (LSTM -> Dropout -> Dense(1, linear)) is built and
    passed to `general_tensorflow_model` for every repeat. The mean of the
    returned 'max_val_R2' values and the rounded mean of the 'best_epoch'
    values are written to row `iter` of the global `pd_hyperparam_val`.

    Besides its arguments the function reads these notebook globals:
    `repeats`, `max_epochs`, `n_val`, `y_train`, `y_test`, `X_train`, `X_test`,
    `Cstar`, `size_set`, `forest_bool`, `time_bool` and `pd_hyperparam_val`.

    Parameters
    ----------
    X_temp_train, X_temp_test : feature tables; their `.values` are used.
    network_combo : tuple (hidden_units, learning_rate, dropout_rate).
    seq_length : length of the input sequences.
    n_features : number of input features per time step.
    iter : row of `pd_hyperparam_val` to fill; also used in the model name.

    Returns
    -------
    (iter, pd_hyperparam_val) : the unchanged row index and the updated table.
    """
    hidden_units, learning_rate, dropout_rate = network_combo
    model_name = 'lstm_' + str(iter)
    val_scores, best_epochs = [], []
    for _ in range(repeats):
        candidate = models.Sequential(
            [
                layers.Input(shape=(seq_length, n_features)),
                layers.LSTM(hidden_units),
                layers.Dropout(dropout_rate),
                layers.Dense(1, activation='linear'),
            ],
            name=model_name,
        )
        # n_val is handed over as validation_split; how it is interpreted is
        # decided inside general_tensorflow_model.
        result = general_tensorflow_model(
            candidate,
            X_temp_train.values, X_temp_test.values, y_train.values, y_test.values,
            X_train.index, X_test.index, Cstar,
            lstm=True,
            seq_length=seq_length,
            epochs=max_epochs,
            print_output=False,
            verbose=0,
            validation_split=n_val,
            learning_rate=learning_rate,
        )
        val_scores.append(result['max_val_R2'])
        best_epochs.append(result['best_epoch'])
    mean_r2_val = np.mean(val_scores)
    recommended_nr_epochs = int(np.round(np.mean(best_epochs)))
    print(f'Mean R2 on validation set {mean_r2_val}')
    print(f'Recommended number of epochs: {recommended_nr_epochs}')
    summary_row = [size_set, seq_length, forest_bool, time_bool, hidden_units,
                   learning_rate, dropout_rate, mean_r2_val, recommended_nr_epochs]
    pd_hyperparam_val.iloc[iter, :] = summary_row
    return iter, pd_hyperparam_val

# Evaluate all options: every data set size, input-feature choice and network setting.
# lstm_repeating reads size_set, forest_bool and time_bool as globals, so the
# loop variables below must keep exactly these names.
if exec_training:
    for size_set in range_size:
        if size_set == 'large':
            for input_combo in input_full_combos:
                print(input_combo)
                seq_length, forest_bool, time_bool = input_combo
                X_temp_train = X_train.copy()
                X_temp_test = X_test.copy()
                # optionally drop the time-of-year features
                if not time_bool:
                    X_temp_train = X_temp_train.drop(['year_sin','year_cos'],axis = 1)
                    X_temp_test = X_temp_test.drop(['year_sin','year_cos'],axis = 1)   
                # optionally drop every column whose name ends in 'Forest'
                if not forest_bool:
                    X_temp_train = X_temp_train.loc[:,~X_temp_train.columns.str.endswith('Forest')] 
                    X_temp_test = X_temp_test.loc[:,~X_temp_test.columns.str.endswith('Forest')] 
                n_features = X_temp_train.shape[1]
                # recreate the iterator: itertools.product can only be consumed once
                network_combos = itertools.product(range_hidden_unit, range_learning_rate, range_dropout_rate)
                for network_combo in network_combos:
                    print(network_combo)
                    if append:
                        # in append mode only rows that are still completely NaN are computed
                        if pd_hyperparam_val.iloc[iter,:].isna().all():
                            iter, pd_hyperparam_val = lstm_repeating(
                            X_temp_train, X_temp_test, network_combo,
                            seq_length, n_features, iter)
                    else:
                        iter, pd_hyperparam_val = lstm_repeating(
                               X_temp_train, X_temp_test, network_combo,
                               seq_length, n_features, iter)
                    # the row counter advances whether or not the row was computed
                    iter = iter + 1
                    print(str(iter) + ' out of maximally ' + str(max_nr_options))
                    # store after every combination so an interrupted run can be resumed
                    pd_hyperparam_val.to_csv(pad/'lstm_hyperparam_validation.csv', index = False)
        if size_set == 'small':
            for input_combo in input_small_combos:
                print(input_combo)
                # NOTE(review): forest_bool is not assigned in this branch, so the value
                # written by lstm_repeating is the one left over from the 'large' loop.
                seq_length, time_bool = input_combo
                X_temp_train = X_train_small.copy()
                X_temp_test = X_test_small.copy()
                if not time_bool:
                    X_temp_train = X_temp_train.drop(['year_sin','year_cos'],axis = 1)
                    X_temp_test = X_temp_test.drop(['year_sin','year_cos'],axis = 1)    
                n_features = X_temp_train.shape[1]
                network_combos = itertools.product(range_hidden_unit, range_learning_rate, range_dropout_rate)
                for network_combo in network_combos:
                    if append:
                        if pd_hyperparam_val.iloc[iter,:].isna().all():
                            iter, pd_hyperparam_val = lstm_repeating(
                            X_temp_train, X_temp_test, network_combo,
                            seq_length, n_features, iter)
                    else:
                        iter, pd_hyperparam_val = lstm_repeating(
                               X_temp_train, X_temp_test, network_combo,
                               seq_length, n_features, iter)
                    iter = iter + 1
                    print(str(iter) + ' out of maximally ' + str(max_nr_options))
                    pd_hyperparam_val.to_csv(pad/'lstm_hyperparam_validation.csv', index = False)

Idea: only the good model structures (based on their score on the validation set) will be trained using the full training set!

In [None]:
# After a search run the table is written to disk; otherwise the stored table is loaded.
hyperparam_csv = pad/'lstm_hyperparam_validation.csv'
if not exec_training:
    pd_hyperparam_val = pd.read_csv(hyperparam_csv)
else:
    pd_hyperparam_val.to_csv(hyperparam_csv, index = False)

In [None]:
# Rank all evaluated combinations by validation R2, best first, and show the top 15.
pd_hyperparam_val_sorted = pd_hyperparam_val.sort_values(by='R2_val', ascending=False)
pd_hyperparam_val_sorted.head(15)

So the time features are often dropped. Larger models need more dropout as regularization!

Check out how the small model performs

In [None]:
# Keep only the combinations that used the small data set and rank them by validation R2.
uses_small_set = pd_hyperparam_val['data_in_size'] == 'small'
pd_hyperparam_val_small = pd_hyperparam_val[uses_small_set]
pd_hyperparam_val_small_sorted = pd_hyperparam_val_small.sort_values(by='R2_val', ascending=False)
pd_hyperparam_val_small_sorted

## Train the model with the highest validation score

In [None]:
parameters = pd_hyperparam_val_sorted.iloc[0,:].to_list()
print(parameters)
size_set, seq_length, forest_bool, time_bool, hidden_units, learning_rate, dropout_rate, r2_val, recommended_nr_epochs = parameters
if size_set == 'large':
    X_temp_train = X_train.copy()
    X_temp_test = X_test.copy()
elif size_set == 'small':
    X_temp_train = X_train_small.copy()
    X_temp_test = X_test_small.copy()
if not time_bool:
    X_temp_train = X_temp_train.drop(['year_sin','year_cos'],axis = 1)
    X_temp_test = X_temp_test.drop(['year_sin','year_cos'],axis = 1)   
if not forest_bool:
    X_temp_train = X_temp_train.loc[:,~X_temp_train.columns.str.endswith('Forest')] 
    X_temp_test = X_temp_test.loc[:,~X_temp_test.columns.str.endswith('Forest')] 
n_features = X_train.shape[1]
lstm_hyperparm = models.Sequential(
    [
    layers.Input((int(seq_length),n_features)),
    layers.LSTM(int(hidden_units)),
    layers.Dropout(dropout_rate),
    layers.Dense(1,'linear')    
    ]
)
lstm_hyperparm.summary()

In [None]:
recommended_nr_epochs

In [None]:
repeats = 8
pad = Path('data/ml_obs_op_data/lstm/hyperparam_tuning')
mean_r2_train, mean_r2_test, out_dict, models_list = full_training_loop(
 lstm_hyperparm, repeats,X_train, X_test,y_train,y_test, Cstar,
 exec_training, int(recommended_nr_epochs), pad, print_output = True,
 verbose = 0, validation_split=0.0, lstm = True,
    seq_length = int(seq_length)
)

In [None]:
# Ensemble figure for the repeated trainings of the best-scoring model.
ensemble_plot(Cstar, X_full.index, out_dict, n_std = 1)

In [None]:
# Show the number of epochs that was used for the training above.
recommended_nr_epochs

## Train the model with the highest validation score of the small dataset

In [None]:
# Settings of the best-scoring combination that used the small data set
parameters = pd_hyperparam_val_small_sorted.iloc[0,:].to_list()
print(parameters)
(size_set, seq_length, forest_bool, time_bool, hidden_units,
 learning_rate, dropout_rate, r2_val, recommended_nr_epochs) = parameters


def _select_small_model_features(frame):
    """Drop the time and/or 'Forest' columns of `frame` as dictated by time_bool and forest_bool."""
    if not time_bool:
        frame = frame.drop(['year_sin','year_cos'],axis = 1)
    if not forest_bool:
        frame = frame.loc[:,~frame.columns.str.endswith('Forest')]
    return frame


if size_set == 'large':
    X_temp_train = X_train.copy()
    X_temp_test = X_test.copy()
elif size_set == 'small':
    X_temp_train = X_train_small.copy()
    X_temp_test = X_test_small.copy()
X_temp_train = _select_small_model_features(X_temp_train)
X_temp_test = _select_small_model_features(X_temp_test)

n_features = X_temp_train.shape[1]
# The values come from a CSV and may be floats, hence the int() casts.
small_model_layers = [
    layers.Input((int(seq_length),n_features)),
    layers.LSTM(int(hidden_units)),
    layers.Dropout(dropout_rate),
    layers.Dense(1,'linear'),
]
lstm_hyperparm = models.Sequential(small_model_layers)
lstm_hyperparm.summary()

In [None]:
repeats = 8
pad = Path('data/ml_obs_op_data/lstm/hyperparam_tuning/small_models')
# makedirs also creates missing parent directories (os.mkdir would fail then)
os.makedirs(pad, exist_ok=True)

# Fix: train and evaluate on X_temp_train / X_temp_test. The model's input layer
# is sized from X_temp_train, so passing the unfiltered X_train_small /
# X_test_small gives a shape mismatch whenever a feature was dropped.
mean_r2_train, mean_r2_test, out_dict, models_list = full_training_loop(
 lstm_hyperparm, repeats,X_temp_train, X_temp_test,y_train,y_test, Cstar,
 exec_training, int(recommended_nr_epochs), pad, print_output = True,
 verbose = 0, validation_split=0.0, lstm = True,
    seq_length = int(seq_length)
)

In [None]:
# Ensemble figure for the repeated trainings of the best small-data-set model.
ensemble_plot(Cstar, X_full.index, out_dict, n_std = 1)

# Initial LSTM model without time features

Learning from working with the window data on Ridge Regression: you can exclude the sin wave for better performance! 

In [None]:
seq_length = 100
n_hidden_units = 10
# Drop the sine and cosine time-of-year features
X_train_nt = X_train.drop(['year_sin','year_cos'],axis = 1)
X_test_nt = X_test.drop(['year_sin','year_cos'],axis =1)
# Fix: n_features was first assigned from the unfiltered X_train and then
# immediately overwritten; only the count after dropping the columns is needed.
n_features = X_train_nt.shape[1]
# Same architecture as the initial model: one LSTM layer and a linear output unit
lstm_model = models.Sequential(
    [
    layers.Input(shape = (seq_length, n_features)),
    layers.LSTM(n_hidden_units),
    layers.Dense(1, activation = 'linear')
    ]
)
lstm_model.summary()

# Directory handed to the training helpers for later storage
pad = Path('data/ml_obs_op_data/lstm/lstm_intial_nt')
os.makedirs(pad, exist_ok=True)

In [None]:
# Repeated training with a 20 % validation split on the data without time features.
repeats, epochs = 8, 100
mean_r2_val, recommended_nr_epochs = validation_loop(
    lstm_model, repeats, X_train_nt, X_test_nt, y_train, y_test, Cstar,
    exec_training, pad,
    epochs=epochs,
    print_output=False,
    verbose=0,
    validation_split=0.2,
    lstm=True,
    seq_length=seq_length,
)

In [None]:
# Repeated training on the complete training set (without time features)
# for the recommended number of epochs.
mean_r2_train, mean_r2_test, out_dict, models_list = full_training_loop(
    lstm_model, repeats, X_train_nt, X_test_nt, y_train, y_test, Cstar,
    exec_training, recommended_nr_epochs, pad,
    print_output=False,
    verbose=0,
    validation_split=0.0,
    lstm=True,
    seq_length=seq_length,
)

In [None]:
# Ensemble figure for the initial LSTM trained without the time features.
ensemble_plot(Cstar, X_full.index, out_dict,n_std = 1)