### Author: Md Fahim Hasan
### Work Email: mdfahim.hasan@bayer.com

__Install pytorch first if not already installed__

In [None]:
# !pip3 install torch torchvision torchaudio

In [1]:
import numpy as np
import pandas as pd
from pickle import dump, load

from ipynb.fs.full.general_utils import *
from ipynb.fs.full.ML_utils import *

from sklearn.preprocessing import MinMaxScaler

In [2]:
# torch.cuda.is_available()

### Early Stopping class
__This script was taken from this [GitHub Repo](https://github.com/Bjarten/early-stopping-pytorch/blob/master/pytorchtools.py).__

In [None]:
import os
import torch
import numpy as np

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience.

    Call the instance once per epoch with the current validation loss and the model. Every time the
    loss improves, the model's ``state_dict`` is written to ``<save_folder>/<savename>.pt``. After
    ``patience`` consecutive calls without improvement, ``early_stop`` is set to True; the caller is
    responsible for checking that flag and leaving the training loop.
    """
    def __init__(self, save_folder, savename, patience=7, verbose=False, delta=0, trace_func=print):
        """
        Args:
            save_folder (str): Folder in which the checkpoint file is written.
            savename (str): Checkpoint file name without the '.pt' extension.
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement;
                            the loss has to drop to at most (best loss - delta).
                            Default: 0
            trace_func (function): trace print function.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # `np.Inf` was removed in NumPy 2.0; the lowercase spelling works on every NumPy version.
        self.val_loss_min = np.inf
        self.delta = delta
        self.save_folder = save_folder
        self.savename = savename
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        """Register the validation loss of one epoch and update the early-stopping state.

        Args:
            val_loss (float): Validation loss of the current epoch.
            model (torch.nn.Module): Model whose state_dict is checkpointed on improvement.
        """
        # Work with the negated loss so that "higher score" means "better model".
        score = -val_loss

        if self.best_score is None:
            # First call: always becomes the reference and is checkpointed.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # No (sufficient) improvement in this epoch.
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...')

        # Create the target folder on demand so the first checkpoint does not fail on a fresh run.
        if self.save_folder:
            os.makedirs(self.save_folder, exist_ok=True)

        savepath = os.path.join(self.save_folder, self.savename + '.pt')
        torch.save(model.state_dict(), savepath)
        self.val_loss_min = val_loss

### Neural network Class

#### comment on `Early Stopping`:

I implemented something `unconventional` in `early stopping` in the neural network code. While modeling `maximum temperature` with deep learning, I noticed that training stops too early with conventional early stopping. This happens because the validation loss can drop very low at a random epoch, and the model then uses that validation loss as the reference for the early stopping criterion. At that point, the training loss might not yet be minimized/stable/generalized, and there is a large gap between the training and validation losses. I ran models for a good number of epochs without early stopping and noticed that even after that sudden, very low validation loss, the training loss keeps decreasing and the validation loss becomes more generalized (the gap between the training and validation losses narrows or closes). In my opinion, we should let the model train for some epochs and only then start monitoring `early stopping`. That's why I incorporated the `start_EarlyStop_count_from_epoch` argument in the `train_model()` function for the neural network. The default value of `start_EarlyStop_count_from_epoch` is 0, so that early stopping is applied in the conventional way from the very first epoch.

I experimented with different model architectures, learning rates, weight decay values, batch sizes, and train-validation-test ratios. Maybe an optimal architecture (which I couldn't arrive at) and hyperparameter set would provide better performance, so that `conventional early stopping` could be used instead of `unconventional early stopping`.

In [3]:
class NeuralNetwork(torch.nn.Module):
    """
    A Neural Network Class for nonlinear regression type model. Creates model with user defined feed forward networks.

    Each hidden layer is a Dropout -> Linear -> activation stack; a final Linear layer produces the output.

    Methods:
        initialize_weights(): Initializes weight for the Neural Network model.
        to_torch(): Convert numpy array to torch.Tensor.
        forward(): Calculates the model output for the inputs in X.
    """

    def __init__(self, n_inputs, n_hiddens_list, n_outputs, dropout=0.1, activation_func='tanh', device='cuda'):
        """
        Creates a neural network with the given structure.

        :param n_inputs: int. Number of attributes/predictors that will be used in the model.
        :param n_hiddens_list: list. A list of number of units in each hidden layer. Each member of the list represents one
                               hidden layer.
        :param n_outputs: int. Number of output/prediction. Generally 1.
        :param dropout: float. Dropout value. Default set to 0.1.
        :param activation_func: str. Name of the activation function. Can take 'tanh'/'relu'/'leakyrelu'.
        :param device: str. Name of the device to run the model. Either 'cpu'/'cuda'.

        :raises ValueError: if activation_func is not one of the supported names.
        """
        # Call parent class (torch.nn.Module) constructor
        super().__init__()

        self.device = device
        print(f'Model running on {device}....')

        # For printing
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        self.n_hidden_layers = n_hiddens_list

        # To build list of layers, must use torch.nn.ModuleList, not plain list
        self.hidden_layers = torch.nn.ModuleList()

        if activation_func == 'tanh':
            self.activation_func = torch.nn.Tanh()
        elif activation_func == 'relu':
            self.activation_func = torch.nn.ReLU()
        elif activation_func == 'leakyrelu':
            self.activation_func = torch.nn.LeakyReLU()
        else:
            # ValueError is a subclass of Exception, so callers catching Exception keep working.
            raise ValueError("Activation function should be 'tanh'/'relu'/'leakyrelu'")

        for nh in n_hiddens_list:
            # The same (parameter-free) activation module instance is reused in every hidden layer.
            self.hidden_layers.append(torch.nn.Sequential(
                torch.nn.Dropout(dropout),
                torch.nn.Linear(n_inputs, nh),
                self.activation_func))

            n_inputs = nh  # output of each hidden layer will be input of next hidden layer

        self.output_layer = torch.nn.Linear(n_inputs, n_outputs)
        self.initialize_weights()
        self.to(self.device)  # transfers the whole thing to 'cuda' if device='cuda'

    def __repr__(self):
        return 'NeuralNetwork({}, {}, {}, activation func={})'.format(self.n_inputs, self.n_hidden_layers,
                                                                      self.n_outputs, self.activation_func)

    def __str__(self):
        """
        Returns the repr, followed by a short training summary when training metadata is available.

        Nothing in this class sets `n_epochs` or `performance_trace`, so they are looked up defensively;
        the summary is only added if external code attached them to the model.
        """
        s = self.__repr__()
        n_epochs = getattr(self, 'n_epochs', 0)
        performance_trace = getattr(self, 'performance_trace', None)
        if n_epochs > 0:
            s += '\n Trained for {} epochs.'.format(n_epochs)
            if performance_trace:
                s += '\n Final standardized training error {:.4g}.'.format(performance_trace[-1])
        return s

    def initialize_weights(self):
        """
        Initializes weight for the Neural Network model. For 'tanh' initializing method is 'xavier_normal'. For 'relu' and
        'leakyrelu' initialization method is 'kaiming_normal'.
        """
        for m in self.modules():
            if isinstance(m, torch.nn.Linear):
                if isinstance(self.activation_func, torch.nn.Tanh):
                    torch.nn.init.xavier_normal_(m.weight)
                elif isinstance(self.activation_func, torch.nn.ReLU):
                    torch.nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
                elif isinstance(self.activation_func, torch.nn.LeakyReLU):
                    torch.nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu')

    def to_torch(self, M, torch_type=torch.FloatTensor):
        """
        Convert numpy array to torch Tensor.

        :param M: numpy array. A value that already is a torch.Tensor is returned unchanged.
        :param torch_type: torch data type. Default set to Float.

        :return: A torch.Tensor (moved to the model's device when a conversion took place).
        """
        if not isinstance(M, torch.Tensor):
            M = torch.from_numpy(M).type(torch_type).to(self.device)
        return M

    def forward(self, X):
        """
        Calculates outputs of each layer given inputs in X.

        :param X: torch.Tensor. Standardized input array representing attributes as columns and samples as row.

        :return: torch.Tensor. Output of the final linear layer (the model prediction).
        """
        Y = X
        for hidden_layer in self.hidden_layers:
            Y = hidden_layer(Y)  # going through hidden layers

        # Final output
        Y = self.output_layer(Y)

        return Y
    
def standardize(M):
    """
    Standardizes an input torch Tensor column-wise (statistics are computed along dim 0).

    Columns whose standard deviation is 0 (constant column) or NaN (e.g. a single-row input) are
    divided by 1 instead, so they end up as zeros rather than NaN/inf. The returned standard
    deviations contain that replacement value, so reusing them on other data applies the same scaling.

    :param M: torch.tensor.

    :return: Standardized torch Tensor, mean, and standard deviation values.
    """
    M_means = torch.mean(M, dim=0, keepdim=False)
    M_stds = torch.std(M, dim=0, keepdim=False)

    # Guard against division by zero / NaN for degenerate columns.
    degenerate = (M_stds == 0) | torch.isnan(M_stds)
    M_stds = torch.where(degenerate, torch.ones_like(M_stds), M_stds)

    Ms = (M - M_means) / M_stds

    return Ms, M_means, M_stds

def normalize(M):
    """
    Rescales the input to the 0-1 range with a newly fitted MinMaxScaler.

    :param M: Data handed to MinMaxScaler for fitting and transforming.

    :return: The rescaled data and the fitted scaler (to apply the same scaling to other data).
    """
    min_max_scaler = MinMaxScaler(feature_range=(0, 1))
    rescaled = min_max_scaler.fit(M).transform(M)

    return rescaled, min_max_scaler
    
def create_dataloader(x, y, batch_size=64, shuffle=True):
    """
    Wraps features and target into a torch DataLoader of float tensors.

    :param x: Array-like of features (anything np.array() accepts, e.g. a dataframe).
    :param y: Array-like of target values, with the same number of rows as x.
    :param batch_size: int. Number of samples per batch. Default set to 64.
    :param shuffle: boolean. If True (default), samples are reshuffled at every pass over the data.

    :return: A torch DataLoader yielding (features, target) batches.
    """
    # Imported locally so the function does not depend on names leaking in through wildcard imports.
    from torch.utils.data import DataLoader, TensorDataset

    features = torch.tensor(np.array(x)).float()
    target = torch.tensor(np.array(y)).float()

    dataset = TensorDataset(features, target)
    # Use the requested batch size (it was previously hard-coded to 64 and the argument was ignored).
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    return dataloader
        
    
def train_model(model, train_dataloader, validation_dataloader, n_epochs, 
                save_folder, savename, 
                early_stop=False, start_EarlyStop_count_from_epoch=0, patience=3, 
                method='adam', learning_rate=0.01, weight_decay=0.01, device='cuda', 
                verbose=True, standardization=True):
    """
    Trains the model on the training dataloader and evaluates it on the validation dataloader after every epoch.

    The feature scaling (mean/std for standardization, MinMaxScaler for normalization) is fitted ONCE on
    all training features before the first epoch and then applied unchanged to every training and
    validation batch. The returned/saved scaling objects are therefore exactly the ones used in training.

    Files written to save_folder:
        <savename>.pt                    model state_dict (best validation loss if early_stop=True,
                                         otherwise the state after the last completed epoch)
        <savename>_training_runtime.txt  training time in minutes
        <savename>_trainloss.pkl         average training loss per epoch
        <savename>_validationloss.pkl    average validation loss per epoch
        <savename>_trainmean.pkl / <savename>_trainstd.pkl   (standardization=True)
        <savename>_trainscaler.pkl                           (standardization=False)

    Note: the returned model object carries the weights of the last executed epoch. With early_stop=True
    the best weights are the ones stored in the checkpoint file.

    :param model: model object coming from NeuralNetwork class.
    :param train_dataloader : train_dataloader object yielding (features, target) batches.
    :param validation_dataloader : validation_dataloader object yielding (features, target) batches.
    :param n_epochs: int. Number of passes to take through all samples.
    :param save_folder : Filepath of folder to save model. Created if it does not exist.
    :param savename : checkpoint name to save with.
    :param early_stop : Set to True to enable early stopping.
    :param start_EarlyStop_count_from_epoch : Integer value of epoch from which early_stop tracking will start for validation_loss.
                                              Default set to 0 to start tracking from the very beginning.
    :param patience : How long to wait after last time validation loss improved. Default 3.
    :param method: str. Optimization algorithm. Can take 'adam'/'sgd'.
    :param learning_rate: float. Controls the step size of each update, only for sgd and adam.
    :param weight_decay : float. Controls L2_regularization. Default set to 0.01.
    :param device: str. Name of the device to run the model. Default set to 'cuda'.
    :param verbose: boolean. If True, prints training progress statement.
    :param standardization : Set to False if want to normalize input variables between 0-1 instead of standardizing with mean & std.

    :raises ValueError: if method is unknown or one of the dataloaders yields no batch.

    :returns: If standardization = True - A trained NN model, training loss, validation loss, training mean, and training standard deviation
              If standardization = False - A trained NN model, training loss, validation loss, and normalization scaler.
    """
    import timeit  # local import: the notebook has no explicit top-level import of timeit

    start_time = timeit.default_timer()

    if len(train_dataloader) == 0 or len(validation_dataloader) == 0:
        raise ValueError('train_dataloader and validation_dataloader must each yield at least one batch')

    # Call the requested optimizer method to train the weights.
    if method == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=weight_decay)
    elif method == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay, betas=(0.9, 0.999))
    else:
        raise ValueError("method must be 'sgd', 'adam'")

    mse_func = torch.nn.MSELoss()  # mse function

    if save_folder:
        os.makedirs(save_folder, exist_ok=True)
    checkpoint_path = os.path.join(save_folder, savename + '.pt')

    # Fit the feature scaling once on the whole training set. Refitting it on every mini-batch would
    # scale each batch differently and leave only the statistics of the last (random) batch for
    # validation and for later prediction.
    train_features = torch.cat([features for features, _ in train_dataloader], dim=0)
    X_means, X_stds, scaler = None, None, None
    if standardization:
        _, X_means, X_stds = standardize(train_features)
    else:
        _, scaler = normalize(train_features)

    def scale(features):
        """Applies the fitted training-set scaling to one batch of features."""
        if standardization:
            return (features - X_means) / X_stds
        # scaler.transform returns a numpy array; convert it back to a float tensor
        return torch.tensor(scaler.transform(features)).float()

    # to track the average training loss per epoch as the model trains
    avg_train_losses = []
    # to track the average validation loss per epoch as the model trains
    avg_valid_losses = []

    # initialize the early_stopping object
    early_stopping = EarlyStopping(save_folder, savename, patience=patience, verbose=verbose)
    checkpoint_written = False

    for epoch in range(n_epochs):
        ###################
        # train the model #
        ###################
        model.train()  # prep model for training

        training_loss = 0
        for features, target in train_dataloader:
            # reshaping target to a single column and moving batch to the device
            T = target.reshape(-1, 1).to(device)
            Xs = scale(features).to(device)

            # Reset the gradients, forward pass, MSE loss, backward pass, parameter update
            optimizer.zero_grad()
            Y = model(Xs)
            mse_loss = mse_func(Y, T)
            mse_loss.backward()
            optimizer.step()

            # summing to training loss in an epoch
            training_loss += mse_loss.item()

        ######################
        # validate the model #
        ######################
        model.eval()  # prep model for evaluation
        validation_loss = 0
        with torch.no_grad():  # tells PyTorch not to compute gradients
            for features, target in validation_dataloader:
                T = target.reshape(-1, 1).to(device)
                Xs = scale(features).to(device)

                # forward pass: compute predicted outputs by passing inputs to the model
                prediction = model(Xs)
                # summing to validation loss in an epoch
                validation_loss += mse_func(prediction, T).item()

        # Calculating mean MSE after each epoch
        mean_train_loss = training_loss / len(train_dataloader)
        avg_train_losses.append(mean_train_loss)

        mean_validation_loss = validation_loss / len(validation_dataloader)
        avg_valid_losses.append(mean_validation_loss)

        # Print performance on train and validation dataset
        if verbose:
            print(f'avg. MSE Loss in epoch {epoch + 1}:',
                  f'train loss: {mean_train_loss}', '|', f'validation loss: {mean_validation_loss}')

        # early_stopping needs the validation loss to check if it has decreased,
        # and if it has, it will make a checkpoint of the current model
        if early_stop:
            # tracking starts at the requested epoch and continues on every epoch after it
            if epoch >= start_EarlyStop_count_from_epoch:
                early_stopping(mean_validation_loss, model)
                checkpoint_written = True  # the first tracked epoch always writes a checkpoint

            if early_stopping.early_stop:
                if verbose:
                    print("Early stopping")
                break

        else:  # save model if early_stop=False
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_written = True

    # Make sure a checkpoint exists even if early-stop tracking never started
    # (start_EarlyStop_count_from_epoch >= n_epochs) or no epoch was run.
    if not checkpoint_written:
        torch.save(model.state_dict(), checkpoint_path)

    # saving model training time
    end_time = timeit.default_timer()
    runtime = (end_time - start_time) / 60
    run_str = f'model training time {runtime} mins'
    if verbose:
        print('model training time {:.3f} mins'.format(runtime))

    runtime_save = os.path.join(save_folder, savename + '_training_runtime.txt')
    with open(runtime_save, 'w') as file:
        file.write(run_str)

    def dump_to(obj, suffix):
        """Pickles obj to <save_folder>/<savename><suffix>, closing the file afterwards."""
        with open(os.path.join(save_folder, savename + suffix), 'wb') as pkl_file:
            dump(obj, pkl_file)

    # **** model checkpoint was saved during model training*****
    # saving model results and data preparation tools
    dump_to(avg_train_losses, '_trainloss.pkl')
    dump_to(avg_valid_losses, '_validationloss.pkl')

    if standardization:
        dump_to(X_means, '_trainmean.pkl')
        dump_to(X_stds, '_trainstd.pkl')
        return model, avg_train_losses, avg_valid_losses, X_means, X_stds

    dump_to(scaler, '_trainscaler.pkl')
    return model, avg_train_losses, avg_valid_losses, scaler
    

def load_trained_model(model_initialized, model_path, train_loss_path, validation_loss_path, 
                       train_mean_path=None, train_std_path=None, standardization=True, train_scaler_path=None):
    """
    Load trained neural network model.

    The checkpoint is mapped onto the device the initialized model already lives on, so a model trained
    on 'cuda' can also be loaded on a CPU-only machine.

    NOTE: the loss/mean/std/scaler files are read with pickle. Only load files you created yourself,
    unpickling untrusted files can execute arbitrary code.

    :param model_initialized : Initialized model from NeuralNetwork() class. 
                               Need to have the same n_inputs, n_hiddens_list, n_outputs.
    :param model_path : Filepath of saved trained neural network model.
    :param train_loss_path : Filepath of saved trained neural network training loss.
    :param validation_loss_path : Filepath of saved trained neural network validation loss.
    :param train_mean_path : Filepath of saved trained neural network input feature mean for feature standardization.
                             Required if standardization=True.
    :param train_std_path : Filepath of saved trained neural network input feature std. deviation for feature standardization.
                            Required if standardization=True.
    :param standardization : Set to True if data was standardized during training. 
                             If data was normalized during training, set to False and define a 
                             train_scaler_path (train_mean_path/train_std_path can stay None).
    :param train_scaler_path : If standardization=False, set a train_scaler_path to load it.

    :raises ValueError: if the paths required for the selected scaling mode are missing.

    :returns: If standardization = True - A trained NN model, training loss, validation loss, training mean, and training standard deviation
              If standardization = False - A trained NN model, training loss, validation loss, and normalization scaler.
    """
    # Validate the scaling-related paths before doing any work
    if standardization:
        if train_mean_path is None or train_std_path is None:
            raise ValueError('##### Set train_mean_path & train_std_path when standardization=True #####')
    elif train_scaler_path is None:
        raise ValueError('##### Set a train_scaler_path. train_mean_path & std_mean_path have to be None #####')

    def load_pickle(path):
        """Reads one pickled object and closes the file afterwards."""
        with open(path, 'rb') as pkl_file:
            return load(pkl_file)

    # Loading trained model
    loaded_model = model_initialized
    try:
        target_device = next(loaded_model.parameters()).device
    except StopIteration:  # model without parameters: keep torch.load's default placement
        target_device = None
    loaded_model.load_state_dict(torch.load(model_path, map_location=target_device))
    loaded_model.eval()  # taking loaded model to evaluation mode for making prediction

    print('*****Trained model loaded and evaluation mode activated*****')

    # loading model results and data preparation tools
    avg_train_losses = load_pickle(train_loss_path)
    avg_valid_losses = load_pickle(validation_loss_path)

    if standardization:
        train_mean = load_pickle(train_mean_path)
        train_std = load_pickle(train_std_path)
        return loaded_model, avg_train_losses, avg_valid_losses, train_mean, train_std

    # in case of normalization
    train_scaler = load_pickle(train_scaler_path)
    return loaded_model, avg_train_losses, avg_valid_losses, train_scaler

    
def predict(trained_nn_model, input_df, mean, std, standardize=True, normalize_scaler=None, device='cuda'):
    """
    Make prediction using the trained model.

    The forward pass runs in evaluation mode (dropout disabled) and without gradient tracking; the
    model's previous train/eval mode is restored afterwards.

    params:
    trained_nn_model : Trained Neural Network Model instance.
    input_df : Input dataframe.
    mean : Training mean from the model training process used to scale the data.
    std : Training std from the model training process used to scale the data.
    standardize : Set to True (default) to standardize. Setting False will normalize the data.
    normalize_scaler : Scaler from model training to use in data normalization. 
                       Only needed if standardize=False. Default set to None.
    device : Default set to 'cuda'.

    raises: ValueError if standardize=False and no normalize_scaler is given.

    returns: A tensor of model prediction.
    """
    # converting dataframe to torch tensor
    # (reindex_df is a helper imported from the project's utility notebooks)
    input_df = reindex_df(input_df)
    input_tensor = torch.tensor(np.array(input_df)).float()

    if standardize:  # standardization
        input_tensor = (input_tensor - mean) / std
    else:  # Normalization
        if normalize_scaler is None:
            raise ValueError('normalize_scaler is required when standardize=False')
        input_array = normalize_scaler.transform(input_tensor)  # this is numpy array
        input_tensor = torch.tensor(input_array).float()  # converting normalized array to torch tensor again

    input_tensor = input_tensor.to(device)  # moving to gpu in device='cuda'

    # prediction and moving to cpu again
    was_training = trained_nn_model.training
    trained_nn_model.eval()  # dropout must be inactive while predicting
    try:
        with torch.no_grad():  # no autograd graph needed for inference; saves memory on large inputs
            prediction = trained_nn_model(input_tensor).cpu().detach()
    finally:
        trained_nn_model.train(was_training)  # restore the caller's mode
    print('Model predeiction (type: torch tensor) generated and moved to cpu', '\n')

    return prediction

In [None]:
def run_DL_model_to_generate_prediction(trained_dl_model, x_train, predictor_dataset, ref_raster,
                                        standardization_mean, standardization_std,
                                        output_folder, variable_name_keyword, remove_neg_values=False,
                                        device='cuda'):
    """
    Uses trained DL model to generate predicted daily raster.

    Reads the predictor parquet file, predicts one value per row with the trained model and writes one
    GeoTIFF per unique date named '<variable_name_keyword>_<date>.tif' into output_folder.

    params:
    trained_dl_model : A trained DL model object. This will come from train_model() function.
    x_train : x_train dataframe generated by split_train_val_test_set() function. Only its column
              names are used, to select the predictor columns the model was trained with.
    predictor_dataset : Filepath of parquet file of predictor dataset. Needs 'date', 'lat' and 'lon'
                        columns; the first 8 characters of str(date) are parsed as yyyymmdd.
    ref_raster : Filepath of a reference raster that will be used to rasterize. 
                 Use any TWC raster data as that is the target resolution.
    standardization_mean : Training mean from the model training process used to scale the data.
    standardization_std : Training std from the model training process used to scale the data.
    output_folder : Output folder filepath to save predicted daily raster.
    variable_name_keyword : a keyword (str) that will be used to save predicted raster. For example 'total_precip'. 
                            Date will be added automatically from era5 dataset.
    remove_neg_values : Set to True to replace negative predictions with 0 (e.g. for precipitation).
                        Default set to False.
    device : Device used for prediction. Default set to 'cuda'.

    returns: None.
    """

    makedirs([output_folder])
    columns_trained_with = x_train.columns.tolist()

    era5_df = pd.read_parquet(predictor_dataset)

    era5_df['year'] = era5_df['date'].apply(lambda x: int(str(x)[0:4]))
    era5_df['month'] = era5_df['date'].apply(lambda x: int(str(x)[4:6]))
    era5_df['day'] = era5_df['date'].apply(lambda x: int(str(x)[6:8]))

    # renaming total_precip column to total_precip_era5 as in  x_train its named as total_precip_era5
    if 'total_precip' in list(era5_df.columns):
        era5_df = era5_df.rename(columns={'total_precip': 'total_precip_era5'})

    # selecting columns for which model was trained with using xtrain columns
    selected_era5_df = era5_df[columns_trained_with]
    selected_era5_df = reindex_df(selected_era5_df)

    # generating model prediction for all dates in era5 dataframe
    prediction_arr = predict(trained_nn_model=trained_dl_model, input_df=selected_era5_df, 
                             mean=standardization_mean, std=standardization_std, 
                             standardize=True, normalize_scaler=None, device=device)
    prediction_arr = prediction_arr.numpy()

    # sometimes model can predict values less than zero when it can't happen (for example precipitation)
    # removing that
    if remove_neg_values:
        prediction_arr = np.where(prediction_arr < 0, 0, prediction_arr)

    print('Model prediction converted to numpy array')

    # Attaching date+lat+lon info with the predicted high resolution data.
    # predict() turns the dataframe into a plain array, so predictions line up with the rows of era5_df
    # by POSITION. They are therefore attached positionally instead of joined on index labels, which
    # would misalign whenever the parquet file carries a non-default index.
    # NOTE(review): assumes reindex_df() keeps the row order and a single model output - confirm.
    prediction_df = era5_df[['date', 'lat', 'lon']].reset_index()
    prediction_df['high res. prediction'] = prediction_arr.reshape(-1)

    # creating prediction raster for each day
    unique_dates = list(np.unique(era5_df['date']))
    print('Generating model interpolated daily rasters...')

    for date in unique_dates:
        pred_df_1day = prediction_df[prediction_df['date'] == date]  # prediction for single day

        # converting to geodataframe
        pred_1day_gdf = gpd.GeoDataFrame(pred_df_1day, 
                                         geometry=gpd.points_from_xy(pred_df_1day.lon, pred_df_1day.lat))

        raster_name = f'{variable_name_keyword}_{date}.tif'
        output_raster = os.path.join(output_folder, raster_name)
        rasterize_shapefile(input_file=pred_1day_gdf, output_raster=output_raster, attribute='high res. prediction', 
                            ref_raster=ref_raster, date=None, grid_shapefile=None, 
                            merge_alg=MergeAlg.replace, dtype='float32', no_data_value=-9999, paste_on_ref_raster=True)
    print('All daily rasters generated')