#### Preface
This notebook is written for the GPU build of TensorFlow (v2.5.0) and is optimized for it. It can be run on a CPU, but training times are then approximately 10x longer.
Getting TensorFlow set up with a GPU can be a bit tricky if you're not familiar with it, but I strongly encourage you to spend the time and learn how. The time spent is recovered in the long run if you are running on a proper GPU.

See the documentation on how to set it up https://www.tensorflow.org/install/gpu.

In [None]:
## Run this step if you are using a GPU
## Prevents running out of GPU memory
import tensorflow as tf

# Builds a TF1-compatibility session whose GPU options have allow_growth enabled,
# i.e. the session is configured to grow its GPU memory use on demand.
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.compat.v1.Session(config=config)

# As the last expression of the cell, this displays the GPUs visible to TensorFlow
# (an empty list means no GPU was detected).
tf.config.list_physical_devices('GPU')

#### Dependencies and libraries
Currently, more packages than necessary for the model to run are loaded. This is to allow the user to experiment with different types of RNN structures, such as GRU and Conv1D.

In [None]:
## System functionality
import os
import glob
from functools import reduce
from datetime import datetime

## Database and mathematical operations
import pandas as pd
import numpy as np
import random
from random import sample

## Machine learning
# Tensorflow / Keras
from tensorflow.keras import layers, optimizers, metrics
from tensorflow.keras.layers import InputLayer, Conv1D, Dropout, LSTM, GRU, Dense
from tensorflow.keras.regularizers import l1, l2, l1_l2
from tensorflow.keras.models import Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import *
# SKlearn
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

## Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

#### Function definitions

In [None]:
## Cyclical learning rate (see https://arxiv.org/abs/1506.01186)
## Helps train the network faster and (sometimes) achieve better validation accuracy
class CyclicLR(Callback):
    """Keras callback that cycles the learning rate between two bounds.

    After every batch the optimizer's learning rate is set to

        base_lr + (max_lr - base_lr) * max(0, 1 - x) * scale_fn(s)

    where ``x`` describes the position inside the current cycle and ``s`` is
    either the cycle number or the iteration count, depending on the scale mode.

    Args:
        base_lr: Lower bound of the learning rate.
        max_lr: Upper bound of the learning rate.
        step_size: Number of batches in half a cycle.
        mode: One of 'triangular', 'triangular2' or 'exp_range'. Only used
            when ``scale_fn`` is None.
        gamma: Base of the exponential scaling used by 'exp_range'.
        scale_fn: Optional custom scaling function of one argument. When
            given, it replaces the scaling selected by ``mode``.
        scale_model: Either 'cycle' or anything else (treated as per-iteration);
            decides which value is passed to a custom ``scale_fn``. Stored as
            ``self.scale_mode``. The parameter name is kept as-is for backward
            compatibility.

    Raises:
        ValueError: If ``scale_fn`` is None and ``mode`` is not a known mode.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_model='cycle'):
        super().__init__()
        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1 / (2. ** (x - 1))
                # This must be an assignment; a comparison here would leave
                # scale_mode undefined and make clr() fail later.
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma ** x
                self.scale_mode = 'iterations'
            else:
                # Fail early instead of leaving scale_fn/scale_mode unset.
                raise ValueError(
                    "mode must be 'triangular', 'triangular2' or 'exp_range' "
                    "when scale_fn is None, got {!r}".format(mode))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_model
        self.clr_iterations = 0
        self.trn_iterations = 0
        self.history = {}
        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Restart the cycle, optionally replacing the bounds or the step size."""
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.

    def clr(self):
        """Return the learning rate for the current cycle iteration."""
        cycle = np.floor(1 + self.clr_iterations / (2 * self.step_size))
        x = np.abs(self.clr_iterations / self.step_size - 2 * cycle + 1)
        # The scaling function receives the cycle number or the iteration
        # count, depending on the scale mode.
        if self.scale_mode == 'cycle':
            scale = self.scale_fn(cycle)
        else:
            scale = self.scale_fn(self.clr_iterations)
        return self.base_lr + (self.max_lr - self.base_lr) * np.maximum(0, (1 - x)) * scale

    def on_train_begin(self, logs=None):
        """Set the initial learning rate on the model's optimizer."""
        logs = logs or {}
        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())

    def on_batch_end(self, epoch, logs=None):
        """Record the batch in ``self.history`` and update the learning rate.

        The first positional argument is whatever Keras passes to
        ``on_batch_end``; it is not used here.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1
        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        K.set_value(self.model.optimizer.lr, self.clr())

In [None]:
## Train/validation/test data split
## This function uses dedicated validation and test probes and assigns the rest for training.
## Variables, not including soil moisture, are normalised.
def train_test_split(dataframe, val_probes, test_probes, start_date='2017-03-28',
                     end_date='2020-07-09'):
    """Split the probe data into training, validation and test arrays.

    The probes listed in ``val_probes`` and ``test_probes`` are held out; all
    remaining probes are used for training. Feature columns are normalised
    with the mean and standard deviation of the training features, and any
    remaining NaNs (in features and targets) are replaced with 0.

    Args:
        dataframe: DataFrame indexed by date with an 'ID' column naming the
            probe and a 'soil_moisture' target column; every other column is
            treated as a feature. The input is not modified.
        val_probes: List of probe IDs used for validation.
        test_probes: List of probe IDs used for testing.
        start_date: First day of the daily record.
        end_date: Last day of the daily record.

    Returns:
        Tuple ``(train_data, train_target, val_data, val_target, test_data,
        test_target)``. Data arrays have shape (probes, timesteps, features)
        and target arrays have shape (probes, timesteps).

    Raises:
        ValueError: If a held-out probe is not present in the data or is
            listed more than once across ``val_probes`` and ``test_probes``.
    """
    df = dataframe.copy()
    df = df.set_index('ID', append=True).sort_index(level=1)

    # unique() keeps the order of the sorted index, so the probe order (and
    # with it the row order of the training arrays) is the same on every run.
    all_probes = list(df.index.get_level_values('ID').unique())

    held_out = list(val_probes) + list(test_probes)
    unknown = [probe for probe in held_out if probe not in all_probes]
    if unknown:
        raise ValueError('Probes not found in the data: {}'.format(unknown))
    if len(set(held_out)) != len(held_out):
        raise ValueError('A probe is listed more than once in val_probes/test_probes')

    # Every probe that is not held out is used for training
    train_probes = [probe for probe in all_probes if probe not in held_out]

    date_index = pd.date_range(start=start_date, end=end_date, freq='D')
    record_length = len(date_index)

    # Sorts the dataframes by date and ID.
    # Rows without a soil moisture value are dropped and every probe is then
    # reindexed onto the full daily date range, so that all probes have an
    # equal number of timesteps (missing days become NaN rows).
    def sort_dataframe(dataframe, probes):
        df_raw = dataframe.loc[pd.IndexSlice[:, probes], :]
        df_raw = df_raw.reset_index(level='ID', col_level=1)
        frames = []
        for probe in probes:
            df_ = df_raw[df_raw['ID'] == probe].copy()
            df_ = df_[df_['soil_moisture'].notna()]
            df_ = df_.reindex(index=date_index)
            df_['ID'] = probe
            frames.append(df_)
        # A single concat replaces the repeated DataFrame.append calls, which
        # no longer exist in pandas 2.0 and later.
        df_norm = pd.concat(frames)
        df_norm.index.name = 'Date'
        df_norm.drop('ID', axis=1, inplace=True)
        return df_norm

    train_df = sort_dataframe(df, train_probes)
    val_df = sort_dataframe(df, val_probes)

    # The test probe data is copied as-is (it is not reindexed to date_index),
    # so it must already cover exactly record_length days per probe.
    test_df = df.loc[pd.IndexSlice[:, test_probes], :].droplevel('ID')

    # Removes the targets (soil moisture) from the data before normalization
    train_target = (train_df.pop('soil_moisture').fillna(0).to_numpy()
                    .reshape(len(train_probes), record_length))
    val_target = (val_df.pop('soil_moisture').fillna(0).to_numpy()
                  .reshape(len(val_probes), record_length))
    test_target = (test_df.pop('soil_moisture').fillna(0).to_numpy()
                   .reshape(len(test_probes), record_length))

    # Normalizes the data using the training statistics only
    train_mean = train_df.mean()
    train_std = train_df.std()

    train_df = (train_df - train_mean) / train_std
    val_df = (val_df - train_mean) / train_std
    test_df = (test_df - train_mean) / train_std

    # Replacing NaNs with 0 for model interpretation
    train_df.fillna(0, inplace=True)
    val_df.fillna(0, inplace=True)
    test_df.fillna(0, inplace=True)

    # Converting the features to 3D numpy arrays (probes, timesteps, features)
    train_data = np.stack(np.split(train_df.to_numpy(), len(train_probes)))
    val_data = np.stack(np.split(val_df.to_numpy(), len(val_probes)))
    test_data = np.stack(np.split(test_df.to_numpy(), len(test_probes)))

    return (train_data, train_target, val_data, val_target, test_data, test_target)

def k_fold_split(dataframe, test_probes, start_date='2017-03-28',
                 end_date='2020-07-09'):
    """Placeholder for a k-fold split; the arguments are currently ignored.

    Returns:
        The literal string 'test'.
    """
    placeholder = 'test'
    return placeholder
    

# Reports the dimensions of the model data
def data_stats(train_data):
    """Return ``(samples, timesteps, features, batch_length)`` for a 3D array.

    ``samples`` is the product of the first two dimensions, ``timesteps`` and
    ``features`` are the second and third dimensions, and ``batch_length`` is
    ``samples`` divided by ``timesteps``, truncated to an int.
    """
    shape = train_data.shape
    n_sequences, timesteps, features = shape[0], shape[1], shape[2]
    samples = n_sequences * timesteps
    batch_length = int(samples / timesteps)
    return (samples, timesteps, features, batch_length)

In [None]:
def plot_results(history, metric):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value lists (as returned by ``model.fit``).
        metric: Name of the metric to plot; the validation series is read
            from the key ``'val_' + metric``.
    """
    train_values = history.history[metric]
    val_values = history.history[f'val_{metric}']
    epoch_numbers = range(1, len(train_values) + 1)

    # White figure with a light grey plotting area
    figure = plt.figure(figsize=(12, 6), dpi=100, facecolor='w', edgecolor='k')
    axes = figure.add_subplot()
    axes.set_facecolor('#f7f7f7')

    # Training values as blue crosses, validation values as a solid line
    axes.plot(epoch_numbers, train_values, 'b+', label=f'Training {metric}')
    axes.plot(epoch_numbers, val_values, 'royalblue', label=f'Validation {metric}')

    axes.set_title(f'Model training and validation {metric}', fontsize=16)
    axes.set_xlabel('Epoch')
    axes.set_ylabel(metric)
    axes.legend()
    axes.grid(axis='y')

    plt.show()

#### Local directories
Specify the local directory of your machine where the soil moisture .csv file is kept for **root_dir**.

#### Data formatting
The **train_test_split** function splits the dataframe into training, validation, and testing sets. Specify the stations used for validation and testing as lists, e.g. ['station1', 'station2', ...].

The **data_stats** function returns the dimensions of the training data, which is required for the model.

In [None]:
# Folder on the local machine that contains the soil moisture file
root_dir = r'C:\Users\Nick\Documents\_Master_Thesis\Soil_Moisture\ISMN'


# Parses day/month/year date strings.
# You may or may not need the date_parser argument to load the dates correctly.
# Try running with it first; if it throws an error, comment the argument out.
def date_parse(x):
    return datetime.strptime(x, '%d/%m/%Y')


df = pd.read_csv(
    os.path.join(root_dir, 'SM_Data_REMEDHUS.csv'),
    index_col='Date',
    parse_dates=True,
    date_parser=date_parse,
)

# Normalized data, split into training, validation, and testing arrays.
# The validation and test probes are given as lists.
X_train, Y_train, X_val, Y_val, X_test, Y_test = train_test_split(
    df, ['Canizal'], ['Carretoro'])

# Dimensions of the training data
samples, timesteps, features, batch_length = data_stats(X_train)

### Deep learning model - Recurrent Neural Network
*Note that some hyperparameters, such as recurrent_dropout, are not available when training with a GPU.*

#### Hyperparameters
Both **clr** and **l2_decay** are hyperparameters of the model, which can/should be tweaked between model runs. For **clr** see https://arxiv.org/abs/1506.01186 for a description about what the function does and how each parameter affects it.

The **epochs** are specified in the last block during the model.fit. This can/should be modified depending on the data. Larger datasets require more epochs, while a large number of epochs will result in overfitting on small datasets.

The **loss function**, **optimizer**, and **metrics** are used to assess the accuracy of the model during training and modify the weights accordingly. You should experiment with different types of optimizers (see https://keras.io/api/optimizers/), as there is no general rule for which one is better; it varies between datasets. These are specified before fitting the model in model.compile.  

#### Layers
Finding the right hyperparameters is certainly important, but the largest impact on the model will come from the number of layers used, their type, and the size of each layer. You should experiment with different types of layers (see https://keras.io/api/layers/recurrent_layers/ and https://keras.io/api/layers/convolution_layers/convolution1d/) and different sizes for the layers.

Following the layers in the default setup is a dropout layer, which disables random neurons in the model to reduce overfitting. This may or may not be advantageous for all datasets so experiment with different ratios (typically between 0.1 and 0.2) or disable them completely.

In [None]:
# Regularization strength and cyclical learning-rate schedule (hyperparameters)
l2_decay = 1e-4
clr = CyclicLR(base_lr=1e-6, max_lr=0.002, step_size=250., mode='triangular', gamma=0.99995)

# The model is declared as a list of layers:
#  - the input layer describes the dimensions of the data and should not be changed
#  - two LSTM layers with recurrent regularization, each followed by a dropout layer
#  - a single-node dense layer that fits the data back to the desired format;
#    this should not be changed or removed
model = Sequential([
    InputLayer(input_shape=(timesteps, features)),
    LSTM(8, return_sequences=True, recurrent_regularizer=l2(l2_decay)),
    Dropout(0.1),
    LSTM(8, return_sequences=True, recurrent_regularizer=l2(l2_decay)),
    Dropout(0.1),
    Dense(1),
])

# Loss function, optimizer, and metrics used during training.
# See the Keras/TF documentation for a description of the hyperparameters and the
# other types that can be used https://www.tensorflow.org/api_docs/python/tf/keras/Model
model.compile(
    loss=tf.keras.losses.MeanAbsoluteError(),
    optimizer='Adam',
    metrics=[metrics.MeanAbsoluteError()],
)

# Trains the compiled model on the training data, validating after each epoch.
# Every callback in the list is applied during training.
history = model.fit(
    X_train, Y_train,
    batch_size=1,
    epochs=20,
    callbacks=[clr],
    validation_data=(X_val, Y_val),
)

In [None]:
# Plots the training and validation curves of one metric recorded during model.fit.
# Which metrics can be plotted depends on the metrics used during model training.
# To list the available metrics and their validation equivalents, uncomment and
# run the line below.
# print(history.history.keys())

plot_results(history, 'mean_absolute_error')

In [None]:
# Evaluates the model on the test data.
# model.evaluate returns the loss followed by the compiled metrics. The model is
# compiled with a MeanAbsoluteError loss, so the first value is labelled as such
# (it is not a mean squared error).
prediction = model.evaluate(X_test, Y_test, batch_size=1)
print('Loss (Mean Absolute Error): {}\nMean Absolute Error: {}%'
      .format(prediction[0], round(prediction[1]*100, 2)))