#### Preface
This notebook was written for TensorFlow (v2.4.0) running on a GPU and is optimized for it. It can also be run on a CPU, but training then takes approximately 10 times longer.
Getting TensorFlow set up to use a GPU can be a bit tricky if you're not familiar with it, but I strongly encourage you to spend the time and learn how. The time spent is saved in the long run if you are running on a proper GPU.

See the documentation on how to set it up https://www.tensorflow.org/install/gpu.

In [None]:
## Run this step if you are using a GPU
## Prevents running out of GPU memory
import tensorflow as tf

# Build a TF1-compat session config with `allow_growth` switched on, so GPU memory is
# requested as needed instead of being reserved up front (see the TensorFlow GPU guide).
config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True
# NOTE(review): this creates a compat-v1 session carrying the config; confirm that it also
# governs TF 2.x eager/Keras execution here -- the TF2-native switch is
# tf.config.experimental.set_memory_growth.
session = tf.compat.v1.Session(config=config)

# Last expression of the cell: shows the GPU devices TensorFlow can see.
tf.config.list_physical_devices('GPU')

#### Dependencies and libraries
Currently, more packages than necessary for the model to run are loaded. This is to allow the user to experiment with different types of RNN structures, such as GRU and Conv1D.

In [3]:
## System functionality
import os
import glob
from functools import reduce

## Database and mathematical operations
import pandas as pd
import numpy as np
import random
from random import sample

## Machine learning
# Tensorflow / Keras
from tensorflow.keras import layers, optimizers, metrics
from tensorflow.keras.layers import InputLayer, Conv1D, Dropout, LSTM, GRU, Dense
from tensorflow.keras.regularizers import l1, l2, l1_l2
from tensorflow.keras.models import Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import *
# SKlearn
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error

## Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

#### Function definitions

In [None]:
## Cyclical learning rate (see https://arxiv.org/abs/1506.01186)
## Helps train the network faster and (sometimes) achieve better validation accuracy
class CyclicLR(Callback):
    """Keras callback that cycles the optimizer learning rate between two bounds.

    After every batch the learning rate is set to::

        base_lr + (max_lr - base_lr) * max(0, 1 - x) * scale_fn(...)

    where ``x`` is the position inside the current triangular cycle (see ``clr``).

    Parameters
    ----------
    base_lr : float
        Lower bound of the cycle; also the rate used when training starts.
    max_lr : float
        Upper bound of the (unscaled) cycle.
    step_size : float
        Number of batches in half a cycle.
    mode : {'triangular', 'triangular2', 'exp_range'}
        Built-in scaling policy, used only when ``scale_fn`` is None:
        'triangular' keeps the amplitude constant, 'triangular2' halves it every
        cycle, 'exp_range' multiplies it by ``gamma ** iterations``.
    gamma : float
        Base of the exponential decay used by 'exp_range'.
    scale_fn : callable or None
        Custom single-argument scaling function; overrides ``mode``.
    scale_model : {'cycle', 'iterations'}
        Whether a custom ``scale_fn`` receives the cycle number or the iteration
        count. (The parameter name is kept as-is for backward compatibility; it is
        stored as ``self.scale_mode``.)

    Raises
    ------
    ValueError
        If ``scale_fn`` is None and ``mode`` is not one of the built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_model='cycle'):
        super(CyclicLR, self).__init__()
        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                # Must be an assignment: a comparison here leaves scale_mode undefined
                # and raises AttributeError as soon as this mode is selected.
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Fail at construction time instead of on the first batch inside clr().
                raise ValueError(
                    "Unknown mode {!r}: expected 'triangular', 'triangular2' or "
                    "'exp_range', or pass a custom scale_fn".format(self.mode))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_model
        self.clr_iterations = 0
        self.trn_iterations = 0
        # Per-batch record of the learning rate, the iteration count and the Keras logs.
        self.history = {}
        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Restart the cycle, optionally replacing the bounds and/or the step size."""
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.

    def clr(self):
        """Return the learning rate for the current value of ``clr_iterations``."""
        # 1-based index of the cycle the current iteration falls in.
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # Distance from the peak of that cycle, in units of step_size (0 at the peak).
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        # The scaling function is fed either the cycle number or the iteration count.
        scale_arg = cycle if self.scale_mode == 'cycle' else self.clr_iterations
        return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(scale_arg)

    def on_train_begin(self, logs=None):
        """Set the initial learning rate on the model's optimizer."""
        logs = logs or {}
        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())

    def on_batch_end(self, epoch, logs=None):
        """Record the batch in ``history`` and set the learning rate for the next batch.

        The first positional argument keeps its original name ``epoch``; this hook
        is a per-batch callback, so the value is presumably the batch index.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1
        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        K.set_value(self.model.optimizer.lr, self.clr())

In [4]:
# Study period (both ends inclusive), expanded to one timestamp per calendar day.
start_date = '2017-03-28'
end_date = '2020-07-09'
date_index = pd.date_range(start_date, end_date, freq='D')
len(date_index)

1200

In [2]:
## Train/validation/test data split
## This function uses dedicated validation and test probes and assigns the rest for training.
## Variables, not including soil moisture, are normalised.
def train_test_split(dataframe, val_probes, test_probes, start_date='2017-03-28',
                     end_date='2020-07-09'):
    """Split probe records into train/validation/test arrays.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Records indexed by date, with an 'ID' column naming the probe and a
        'soil_moisture' column holding the target. All remaining columns are
        treated as features.
    val_probes, test_probes : list
        Probe IDs reserved for validation and testing. Every other probe found in
        ``dataframe`` is used for training.
    start_date, end_date : str
        Inclusive bounds of the daily date range every train/validation probe
        record is re-indexed to.

    Returns
    -------
    tuple
        ``(train_data, train_target, val_data, val_target, test_data, test_target,
        test_probes)``. The ``*_data`` arrays have shape (probes, days, features)
        and the ``*_target`` arrays have shape (probes, days).

    Raises
    ------
    ValueError
        If a validation/test probe is not present in ``dataframe``, or if a probe
        is listed for both validation and testing.
    """
    df = dataframe.copy()
    df = df.set_index('ID', append=True).sort_index(level=1)

    # Probe IDs in sorted-index order. Going through an unordered set() would make
    # the order of the training probes (and so of the training arrays) vary by run.
    all_probes = df.index.get_level_values('ID').unique()

    overlap = set(val_probes) & set(test_probes)
    if overlap:
        raise ValueError('Probes listed for both validation and testing: {}'.format(overlap))
    held_out = set(val_probes) | set(test_probes)
    unknown = held_out - set(all_probes)
    if unknown:
        raise ValueError('Probes not found in the dataframe: {}'.format(unknown))

    # Everything that is not reserved for validation or testing is used for training
    train_probes = [probe for probe in all_probes if probe not in held_out]

    date_index = pd.date_range(start=start_date, end=end_date, freq='D')
    record_length = len(date_index)

    # Stacks the per-probe records, each aligned to the full date range
    def sort_dataframe(frame, probes):
        df_raw = frame.loc[pd.IndexSlice[:, probes], :]
        df_raw = df_raw.reset_index(level='ID', col_level=1)
        per_probe = []
        for probe in probes:
            probe_df = df_raw[df_raw['ID'] == probe]
            # Days without a soil moisture reading are dropped, then come back as
            # all-NaN rows when the record is aligned to the full date range.
            probe_df = probe_df[probe_df['soil_moisture'].notna()]
            per_probe.append(probe_df.drop(columns='ID').reindex(index=date_index))
        # A single concat replaces the repeated DataFrame.append calls, which copy the
        # whole frame on every iteration and no longer exist in pandas >= 2.0.
        df_norm = pd.concat(per_probe)
        df_norm.index.name = 'Date'
        return df_norm

    train_df = sort_dataframe(df, train_probes)
    val_df = sort_dataframe(df, val_probes)

    # The test probe data is taken as-is (it is not re-indexed to the date range), so the
    # reshape below requires exactly `record_length` rows per test probe.
    test_df = df.loc[pd.IndexSlice[:, test_probes], :].droplevel('ID')

    train_target = (train_df.pop('soil_moisture').fillna(0).to_numpy()
                    .reshape(len(train_probes), record_length))
    val_target = (val_df.pop('soil_moisture').fillna(0).to_numpy()
                  .reshape(len(val_probes), record_length))
    test_target = (test_df.pop('soil_moisture').fillna(0).to_numpy()
                   .reshape(len(test_probes), record_length))

    # Normalizes the features with the training statistics only, so no information
    # from the validation/test probes leaks into the scaling
    train_mean = train_df.mean()
    train_std = train_df.std()

    train_df = (train_df - train_mean) / train_std
    val_df = (val_df - train_mean) / train_std
    test_df = (test_df - train_mean) / train_std

    # Replacing NaNs with 0 for model interpretation
    train_df = train_df.fillna(0)
    val_df = val_df.fillna(0)
    test_df = test_df.fillna(0)

    # Converting the features to 3D numpy arrays
    train_data = np.stack(np.split(train_df.to_numpy(), len(train_probes)))
    val_data = np.stack(np.split(val_df.to_numpy(), len(val_probes)))
    test_data = np.stack(np.split(test_df.to_numpy(), len(test_probes)))

    return (train_data, train_target, val_data, val_target, test_data, test_target, test_probes)

def k_fold_split(dataframe, test_probes, start_date='2017-03-28',
                 end_date='2020-07-09'):
    """Placeholder for a k-fold variant of the probe split (not implemented yet).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Probe records to split.
    test_probes : list
        Probe IDs reserved for testing.
    start_date, end_date : str
        Bounds of the record period.

    Raises
    ------
    NotImplementedError
        Always; the splitting logic has not been written.
    """
    # TODO: implement the k-fold split. A `def` needs a body for this cell to parse at
    # all, so fail loudly here rather than leave the definition empty.
    raise NotImplementedError('k_fold_split has not been implemented yet')

# Summarises the dimensions of the model data
def data_stats(train_data):
    """Return ``(samples, timesteps, features, batch_length)`` for a 3D array.

    ``timesteps`` and ``features`` are the sizes of axes 1 and 2, ``samples`` is the
    product of the sizes of axes 0 and 1, and ``batch_length`` is ``samples``
    divided by ``timesteps``, truncated to an int.
    """
    shape = train_data.shape
    timesteps = shape[1]
    samples = shape[0] * timesteps
    features = shape[2]
    return (samples, timesteps, features, int(samples / timesteps))

In [None]:
def plot_results(history, metric):
    """Plot a training metric and its validation counterpart against the epoch number.

    Parameters
    ----------
    history : object
        Object exposing a ``history`` dict of per-epoch metric lists, as read from
        ``history.history`` below.
    metric : str
        Key of the training metric; the validation curve is read from
        ``'val_' + metric``.

    Raises
    ------
    KeyError
        If ``metric`` or its ``val_`` counterpart is missing from ``history.history``.
    """
    training_metric = history.history[metric]
    validation_metric = history.history['val_{}'.format(metric)]

    # Epochs are numbered from 1 on the x-axis
    epochs = range(1, len(training_metric) + 1)

    fig = plt.figure(figsize=(12, 6), dpi=100, facecolor='w', edgecolor='k')
    ax = fig.add_subplot()
    ax.set_facecolor('#f7f7f7')

    # Draw on the explicit Axes so the figure does not depend on pyplot's global state
    ax.plot(epochs, training_metric, 'b+', label='Training {}'.format(metric))
    ax.plot(epochs, validation_metric, 'royalblue', label='Validation {}'.format(metric))
    ax.set_title('Model training and validation {}'.format(metric), fontsize=16)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric)

    ax.legend()
    ax.grid(axis='y')
    plt.show()