# SCRIPT 06: Train Model with Tenfold Cross-Validation

This is the sixth script in the methodology. Here, models are trained with tenfold cross-validation in order to compare hyperparameters and decide which combination is the best one available. The model hyperparameters to be tested are defined by the user in a list of dictionaries.

**ATTENTION**: with around 5000 samples, the training is very slow even with good GPUs, and since it must be repeated 10 times for each hyperparameter combination, this script can take a long time to run, depending on the number of combinations to be tested. Be mindful of the tests you want to do.

In the following cells, please refer to the comments in the code for further explanations of its functioning.

In [None]:
# importing packages
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tqdm import tqdm

import os

from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

import time

In [None]:
# example of hyperparameter definitions to train: every combination of three
# epoch counts and three learning rates. the ids run from '01' to '09', with
# the learning rate varying fastest and the number of epochs slowest.
models_to_train = [
    {'Model ID': f'{model_number:02d}',
     'Learning Rate': learning_rate,
     'Epochs': n_epochs}
    for model_number, (n_epochs, learning_rate) in enumerate(
        ((n_epochs, learning_rate)
         for n_epochs in (50, 100, 200)
         for learning_rate in (0.001, 0.0001, 0.00001)),
        start=1)
]

In [None]:
# defining the samples folder and the samples id to use. the sample arrays are
# loaded from files named '<samples_id>_PRO_*.npy' inside 'samples_folder'.
samples_folder = '/home/bruno.matosak/Semiarido/MultiInput/samples/sample_data'
samples_id = '5K'

# root folder under which one sub-folder per model and iteration is created
# to hold the training outputs.
trainings_folder = '/home/bruno.matosak/Semiarido/MultiInput/trainings'

In [None]:
# load training data
def _load_sample_array(file_suffix, shape_label):
    """Load one '<samples_id>_PRO_<file_suffix>.npy' array and print its shape."""
    array = np.load(os.path.join(samples_folder, f'{samples_id}_PRO_{file_suffix}.npy'))
    print(f'Shape {shape_label}:', array.shape)
    return array

# the arrays are loaded (and their shapes reported) one at a time, in this order
reference = _load_sample_array('reference', 'Reference')
samples_s1_y = _load_sample_array('s1_y', 'S1 Y')
samples_s2_y = _load_sample_array('s2_y', 'S2 Y')
samples_s1_m = _load_sample_array('s1_m', 'S1 M')
samples_s2_m = _load_sample_array('s2_m', 'S2 M')

In [None]:
# defining model

# ConvLSTM block
def ConvLSTM_block(input_tensor, num_filters, return_sequences):
    """Apply ConvLSTM2D -> BatchNormalization -> ReLU to a time-series tensor.

    Args:
        input_tensor: Keras tensor in channels-last layout fed to ConvLSTM2D.
        num_filters: number of output filters of the ConvLSTM2D layer.
        return_sequences: forwarded to ConvLSTM2D; when False only the last
            output of the sequence is returned.

    Returns:
        The output tensor of the block.
    """
    # 3x3 kernel without padding ('valid'), so each spatial dimension shrinks by 2.
    encoder = tf.keras.layers.ConvLSTM2D(filters=num_filters,
                                         kernel_size=(3, 3),
                                         data_format='channels_last',
                                         recurrent_activation='hard_sigmoid',
                                         activation='tanh',
                                         padding='valid',
                                         dropout=0.3,
                                         recurrent_dropout=0.3,
                                         return_sequences=return_sequences)(input_tensor)
    encoder = tf.keras.layers.BatchNormalization()(encoder)
    # use the public tf.keras namespace only: tf.python.keras is a private
    # module and must not be mixed with tf.keras layers.
    encoder = tf.keras.layers.Activation('relu')(encoder)
    return encoder

# Simple Convolutional Block
def simple_conv_block(input_tensor, num_filters):
    """Apply Conv2D(3x3, valid) -> BatchNormalization -> ReLU -> Dropout(0.3).

    Args:
        input_tensor: Keras tensor fed to the convolution.
        num_filters: number of filters of the convolution.

    Returns:
        The output tensor of the block. Because the convolution is unpadded,
        each spatial dimension is 2 pixels smaller than in the input.
    """
    # public tf.keras API is used instead of the private tf.python.keras one
    encoder = tf.keras.layers.Conv2D(num_filters, (3, 3), padding='valid')(input_tensor)
    encoder = tf.keras.layers.BatchNormalization()(encoder)
    encoder = tf.keras.layers.Activation('relu')(encoder)
    encoder = tf.keras.layers.Dropout(0.3)(encoder)
    return encoder

# Normal Convolutional Block
def conv_block(input_tensor, num_filters):
    """Apply two Conv2D(3x3, valid) -> BatchNormalization -> ReLU stages, then Dropout(0.3).

    Args:
        input_tensor: Keras tensor fed to the first convolution.
        num_filters: number of filters of both convolutions.

    Returns:
        The output tensor of the block. The two unpadded convolutions make
        each spatial dimension 4 pixels smaller than in the input.
    """
    # public tf.keras API is used instead of the private tf.python.keras one
    encoder = tf.keras.layers.Conv2D(num_filters, (3, 3), padding='valid')(input_tensor)
    encoder = tf.keras.layers.BatchNormalization()(encoder)
    encoder = tf.keras.layers.Activation('relu')(encoder)
    encoder = tf.keras.layers.Conv2D(num_filters, (3, 3), padding='valid')(encoder)
    encoder = tf.keras.layers.BatchNormalization()(encoder)
    encoder = tf.keras.layers.Activation('relu')(encoder)
    encoder = tf.keras.layers.Dropout(0.3)(encoder)
    return encoder

# Encoder
def encoder_block(input_tensor, num_filters):
    """Run conv_block and downsample its result with a 2x2 max pooling.

    Args:
        input_tensor: Keras tensor fed to the convolutional block.
        num_filters: number of filters used by the convolutional block.

    Returns:
        Tuple (encoder_pool, encoder): the pooled tensor, which feeds the next
        level, and the un-pooled tensor, which is kept for the skip connection.
    """
    encoder = conv_block(input_tensor, num_filters)
    # public tf.keras API is used instead of the private tf.python.keras one
    encoder_pool = tf.keras.layers.MaxPooling2D((2, 2), strides=(2, 2))(encoder)
    return encoder_pool, encoder

# Decoder
def decoder_block(input_tensor, concat_tensor, num_filters, val_crop):
    """Upsample, merge with a cropped skip connection and convolve twice.

    Args:
        input_tensor: Keras tensor coming from the previous (deeper) level.
        concat_tensor: encoder tensor used as skip connection.
        num_filters: number of filters of the transposed and regular convolutions.
        val_crop: number of pixels removed from every border of concat_tensor
            before concatenation; it must make the skip tensor match the
            spatial size of the upsampled tensor.

    Returns:
        The output tensor of the block.
    """
    # 2x upsampling through a strided transposed convolution
    decoder = tf.keras.layers.Conv2DTranspose(num_filters, (2, 2), strides=(2, 2), padding='same')(input_tensor)
    # the skip tensor is larger than the upsampled one because the convolutions
    # are unpadded, so it is center-cropped before being concatenated.
    cropped_skip = tf.keras.layers.Cropping2D(cropping=(val_crop, val_crop))(concat_tensor)
    decoder = tf.keras.layers.concatenate([cropped_skip, decoder], axis=-1)
    decoder = tf.keras.layers.BatchNormalization()(decoder)
    decoder = tf.keras.layers.Activation('relu')(decoder)
    decoder = tf.keras.layers.Conv2D(num_filters, (3, 3), padding='valid')(decoder)
    decoder = tf.keras.layers.BatchNormalization()(decoder)
    decoder = tf.keras.layers.Activation('relu')(decoder)
    decoder = tf.keras.layers.Conv2D(num_filters, (3, 3), padding='valid')(decoder)
    decoder = tf.keras.layers.BatchNormalization()(decoder)
    decoder = tf.keras.layers.Activation('relu')(decoder)
    decoder = tf.keras.layers.Dropout(0.3)(decoder)
    return decoder

# Function to create the model
def get_model(n_times, chip_size, n_bands_s1, n_bands_s2, out_num_channels, learning_rate):
    """Build and compile the four-input segmentation model.

    The two yearly inputs go through a simple convolution block and the two
    monthly time series through a ConvLSTM block; the four results are
    concatenated and fed to an encoder/decoder with cropped skip connections.

    Args:
        n_times: number of time steps of the monthly inputs.
        chip_size: height and width, in pixels, of the input chips.
        n_bands_s1: number of Sentinel-1 bands.
        n_bands_s2: number of Sentinel-2 bands.
        out_num_channels: number of channels of the softmax output layer.
        learning_rate: learning rate given to the Adam optimizer.

    Returns:
        The compiled Keras model, taking the inputs in the order
        [S1 yearly, S2 yearly, S1 monthly, S2 monthly].
    """
    # organizes input (only the public tf.keras API is used; the private
    # tf.python.keras namespace must not be mixed with it)

    # input 1: Sentinel-1 year reduction data
    input1 = tf.keras.layers.Input(shape=[chip_size, chip_size, n_bands_s1])
    # input 2: Sentinel-2 year reduction data
    input2 = tf.keras.layers.Input(shape=[chip_size, chip_size, n_bands_s2])
    # input 3: Sentinel-1 monthly reductions data
    input3 = tf.keras.layers.Input(shape=[n_times, chip_size, chip_size, n_bands_s1])
    # input 4: Sentinel-2 monthly reductions data
    input4 = tf.keras.layers.Input(shape=[n_times, chip_size, chip_size, n_bands_s2])

    # applies the simple convolution block to yearly reduction data
    conv_s1 = simple_conv_block(input1, n_bands_s1)
    conv_s2 = simple_conv_block(input2, n_bands_s2)

    # applies the ConvLSTM block to monthly reductions data (they are time series)
    conv_lstm_s1 = ConvLSTM_block(input_tensor=input3,
                                  num_filters=n_bands_s1,
                                  return_sequences=False)
    conv_lstm_s2 = ConvLSTM_block(input_tensor=input4,
                                  num_filters=n_bands_s2,
                                  return_sequences=False)

    # concatenate the result of previous simple convolution and ConvLSTM blocks
    concatenated = tf.keras.layers.concatenate([conv_s1, conv_s2, conv_lstm_s1, conv_lstm_s2], axis=-1)

    # makes the data dimension contraction and expanding paths. the filter
    # count doubles at every encoder level and halves at every decoder level.
    encoder0_pool, encoder0 = encoder_block(concatenated, 16)
    encoder1_pool, encoder1 = encoder_block(encoder0_pool, 32)
    encoder2_pool, encoder2 = encoder_block(encoder1_pool, 64)
    encoder3_pool, encoder3 = encoder_block(encoder2_pool, 128)
    center = conv_block(encoder3_pool, 256) # center
    # the crop widths (4, 16, 40, 88) compensate for the pixels lost in the
    # unpadded convolutions, so each skip tensor matches its upsampled tensor.
    decoder3 = decoder_block(center, encoder3, 128, 4)
    decoder2 = decoder_block(decoder3, encoder2, 64, 16)
    decoder1 = decoder_block(decoder2, encoder1, 32, 40)
    decoder0 = decoder_block(decoder1, encoder0, 16, 88)

    # defines the output layer
    outputs = tf.keras.layers.Conv2D(filters=out_num_channels, kernel_size=(1,1), activation="softmax", padding='same')(decoder0)

    # defines the model
    model = tf.keras.models.Model(inputs=[input1, input2, input3, input4], outputs=[outputs])

    # compiles the model. both metrics are kept because the plotting functions
    # read the 'accuracy' and the 'categorical_accuracy' history entries.
    model.compile(loss=tf.keras.losses.CategoricalCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  metrics=[tf.keras.metrics.CategoricalAccuracy(), 'accuracy'])

    # returns our pretty model
    return model

In [None]:
# in this cell, functions are defined to generate graphs showing the accuracy, categorical accuracy and
# loss data obtained during every single training. they are used to observe how these metrics evolve during
# the training phase and help indicate under- and over-fitting.

# shared implementation of the three training-curve plots below
def _plot_training_curve(h, save_folder, epochs, metric_key, display_name, y_label, file_name):
    """Plot the training and validation curves of one metric and save them as a PNG.

    Args:
        h: object whose 'history' dict holds the per-epoch values under
            'metric_key' and 'val_<metric_key>' (as returned by model.fit).
        save_folder: folder in which the image is written.
        epochs: number of epochs; the x axis goes from 1 to 'epochs'.
        metric_key: name of the metric in the history dict.
        display_name: metric name used in the legend and in the title.
        y_label: label of the y axis.
        file_name: name of the image file created in 'save_folder'.
    """
    # the history is read before the figure is opened, so a missing metric
    # raises KeyError without leaving an open figure behind.
    train = h.history[metric_key]
    val = h.history[f'val_{metric_key}']
    epochs_ = np.arange(1, epochs + 1)

    plt.figure(figsize=(10, 5))
    plt.plot(epochs_, train, 'g', label=f'Training {display_name}')
    plt.plot(epochs_, val, 'b', label=f'Validation {display_name}')
    plt.title(f'Training and Validation {display_name}')
    plt.xlabel('Epochs')
    plt.ylabel(y_label)
    plt.legend()
    plt.grid()
    plt.tight_layout()
    plt.savefig(os.path.join(save_folder, file_name), facecolor='white')
    # closes the figure so figures do not accumulate over many trainings
    plt.close()

# plot accuracy graph
def plot_acc(h, save_folder, epochs):
    """Save 'accuracy.png' with the training/validation accuracy per epoch."""
    _plot_training_curve(h, save_folder, epochs,
                         metric_key='accuracy',
                         display_name='Accuracy',
                         y_label='Accuracy',
                         file_name='accuracy.png')

# plot categorical accuracy graph
def plot_cat_acc(h, save_folder, epochs):
    """Save 'categorical_accuracy.png' with the training/validation categorical accuracy per epoch."""
    _plot_training_curve(h, save_folder, epochs,
                         metric_key='categorical_accuracy',
                         display_name='Categorical Accuracy',
                         y_label='Categorical Accuracy',
                         file_name='categorical_accuracy.png')

# plot loss graph
def plot_loss(h, save_folder, epochs):
    """Save 'loss.png' with the training/validation loss per epoch."""
    _plot_training_curve(h, save_folder, epochs,
                         metric_key='loss',
                         display_name='loss',
                         y_label='Loss',
                         file_name='loss.png')

In [None]:
# function to help us write messages to a log file
def write_log_message(message):
    """Append 'message', exactly as given, to 'training_log.txt' in the current working directory.

    The file is opened and closed on every call, so each message is on disk
    as soon as the call returns. No line break is added to the message.
    """
    with open('training_log.txt', mode='a') as log_file:
        log_file.write(message)

In [None]:
# in this cell, the training is done

# here, the samples are divided in 10 parts for the tenfold cross-validation. each iteration is completed entirely
# for all models before going to the next. some problem may appear and interrupt training, like a hardware
# malfunction, and compromise the whole process. however, the samples division is made randomly according
# to a seed number, so the sample groups are reproducible, and the training is done only for the iteration numbers
# defined by the user. this means that if some error occurs and the training is interrupted, it does not need to be
# restarted from the beginning, but only from the iteration in which the error occurred. the iteration also does not
# need to be repeated for models whose training was already completed; it can be resumed from the model that was
# being trained when the malfunction happened. the script checks which models have a model file for the iteration
# and skips them. folders with a Model.h5 file indicate that the training for it has already been completed.

# to verify if training was interrupted, check the log file and see if its last modification was made an unreasonable
# time ago, but also consider that some trainings take a really long time to complete. you can also verify the GPU and
# CPU usage to see if the training was interrupted. furthermore, any other source of information available can be used
# to check if training is happening appropriately.

# direct recommendation: if training is interrupted, change the values in 'iterations_to_train' to contain only the
# ones left to be completed, so it can skip some iterations and save some time loading samples.

# the iterations left to be completed
iterations_to_train = [1,2,3,4,5,6,7,8,9,10]

# the sample ids, used to index all arrays and reorganize them in 10 groups
ids = np.arange(len(reference))

# KFold function used to create the 10 groups of samples for tenfold cross-validation.
# the fixed random_state makes the groups identical on every run, which is what
# allows an interrupted training to be resumed.
kfolder = KFold(10, shuffle=True, random_state=1)

# number of classes, taken from the last axis of the reference (also used as the
# number of output channels of the model)
n_classes = reference.shape[-1]

# loop for every one of the 10 iterations of samples combination. iterations are
# numbered from 1, matching the values expected in 'iterations_to_train'.
for iteration_number, (train_ids, test_ids) in enumerate(kfolder.split(ids), start=1):
    # skips iterations that are not in iterations_to_train
    if iteration_number not in iterations_to_train:
        continue

    # write message to log file
    write_log_message('==============================================================\n')
    write_log_message(f'Iteration {iteration_number}\n')

    # separates samples and its references between train and validate for this iteration
    # of the tenfold cross-validation process.
    samples_train = [samples_s1_y[train_ids], samples_s2_y[train_ids], samples_s1_m[train_ids], samples_s2_m[train_ids]]
    samples_test = [samples_s1_y[test_ids], samples_s2_y[test_ids], samples_s1_m[test_ids], samples_s2_m[test_ids]]
    reference_train = reference[train_ids]
    reference_test = reference[test_ids]

    # loop to train this iteration of every model defined in 'models_to_train'
    for mtt in models_to_train: # mtt: model to train
        # defines model folder and model file
        model_folder = os.path.join(trainings_folder,
                                    f'model_{mtt["Model ID"]}',
                                    f'iteration_{iteration_number:02d}')
        model_file = os.path.join(model_folder, 'Model.h5')

        # if the model file is present, the training was already completed
        # for this iteration, so it is skipped.
        if os.path.exists(model_file):
            continue

        # start time for later elapsed time calculations
        t1 = time.time()

        # write a message to log file
        write_log_message(f'Model {mtt["Model ID"]}... ')

        # defines model using previously defined function
        model = get_model(n_times=samples_s1_m.shape[1],
                          chip_size=samples_s1_m.shape[2],
                          n_bands_s1=samples_s1_y.shape[-1],
                          n_bands_s2=samples_s2_y.shape[-1],
                          out_num_channels=n_classes,
                          learning_rate=mtt['Learning Rate'])

        # make folder to accommodate training files
        os.makedirs(model_folder, exist_ok=True)
        # log folder
        logdir = os.path.join(model_folder, 'fit')
        os.makedirs(logdir, exist_ok=True)
        # tensorboard callback
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir,
                                                              histogram_freq=1,
                                                              profile_batch=(2,10))

        # the most important line of code. here the training is done. saves training statistics
        # per epoch to history_ for later plotting.
        history_ = model.fit(samples_train, reference_train,
                             validation_data=(samples_test, reference_test),
                             batch_size=16,
                             epochs=mtt['Epochs'],
                             verbose=0,
                             callbacks=[tensorboard_callback])

        # plotting training statistics per epoch to files, saved in model_folder
        plot_acc(history_, model_folder, mtt['Epochs'])
        plot_cat_acc(history_, model_folder, mtt['Epochs'])
        plot_loss(history_, model_folder, mtt['Epochs'])

        # gets prediction over validation data (class index per pixel)
        pred = np.argmax(model.predict(samples_test, batch_size=25), axis=-1)
        true = np.argmax(reference_test, axis=-1)

        # calculate confusion matrix for validation data and save it to model_folder.
        # 'labels' is given explicitly so the matrix always has one row/column per
        # class, even when a class is absent from this fold.
        cm = confusion_matrix(true.ravel(), pred.ravel(), labels=np.arange(n_classes))
        np.save(os.path.join(model_folder, 'test_confusion_matrix.npy'), cm)

        # saving model to file. this is done last on purpose: the presence of
        # Model.h5 marks the training as complete, so every other output must
        # already be on disk when it is written.
        model.save(model_file)

        # clear tensorflow session, so it can start a new training anew.
        tf.keras.backend.clear_session()

        # gets elapsed time and writes it to log file.
        t2 = time.time()
        write_log_message('Time for training: %.3f minutes.\n' % ((t2-t1)/60))

    # break line in log file
    write_log_message('\n')