### Author: Md Fahim Hasan
### Work Email: mdfahim.hasan@bayer.com

In [None]:
import os
import joblib
import timeit
import numpy as np
import pandas as pd
import seaborn as sns
from pprint import pprint
import dask.dataframe as ddf
import matplotlib.pyplot as plt
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.inspection import PartialDependenceDisplay
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, KFold, RepeatedKFold

from ipynb.fs.full.general_utils import *

# Read me

__This script consists of functions required for building, tuning, and analyzing machine learning models. Currently, two machine learning models (Random Forest and LGBM) can be implemented with this script. The functions can be easily modified to incorporate more models into the script.__

In [3]:
def reindex_df(df):
    """
    Arrange the columns of a dataframe in sorted (alphabetical) order.

    Parameters:
    df : Pandas dataframe.

    Returns: A dataframe holding the same data with its columns sorted by name.
    """
    ordered_columns = sorted(df.columns)
    return df.reindex(columns=ordered_columns)


def split_train_val_test_set(input_dataset_fp, pred_attr, exclude_columns, test_perc=0.3, validation_perc=0.3, random_state=0,
                             outdir=None, verbose=True):
    """
    Split dataset into train, validation, and test data based on a train/test/validation ratio.

    The test set is split off first (test_perc of all rows); the validation set is then taken
    from the remaining rows (validation_perc of what is left after removing the test set).

    parameters:
    input_dataset_fp : Input csv/parquet file (with filepath) containing all the predictors, or an already
                       loaded pandas dataframe.
    pred_attr : Variable name which will be predicted.
    exclude_columns : List/tuple of columns (or a single column name, or None) that will not be included
                      in training the fitted_model.
    test_perc : The percentage of test dataset. Defaults to 0.3.
    validation_perc : The percentage of validation dataset (taken from the non-test rows). Defaults to 0.3.
    random_state : Seed value. Defaults to 0.
    outdir : Set a output directory if training and test dataset need to be saved. Defaults to None.
    verbose : Set to True if want to print which columns are being dropped and which will be included in the model.

    returns: x_train, x_val, x_test, y_train, y_val, y_test.

    raises: Exception if input_dataset_fp is neither a .csv/.parquet filepath nor a pandas dataframe.
    """
    # Check for a dataframe first; only run substring checks on real path-like inputs
    if isinstance(input_dataset_fp, pd.DataFrame):
        input_df = input_dataset_fp
    elif isinstance(input_dataset_fp, (str, os.PathLike)):
        dataset_path = os.fspath(input_dataset_fp)
        if '.csv' in dataset_path:
            input_df = pd.read_csv(dataset_path)
        elif '.parquet' in dataset_path:
            input_df = pd.read_parquet(dataset_path)
        else:
            raise Exception('input_dataset_fp must be a .csv/.parquet file or pandas dataframe')
    else:
        raise Exception('input_dataset_fp must be a .csv/.parquet file or pandas dataframe')

    # Normalizing exclude_columns so that None, a single name, a tuple, or a list are all accepted
    if exclude_columns is None:
        columns_to_exclude = []
    elif isinstance(exclude_columns, str):
        columns_to_exclude = [exclude_columns]
    else:
        columns_to_exclude = list(exclude_columns)

    # dropping unwanted columns/columns that will not be used in model training
    drop_columns = columns_to_exclude + [pred_attr]
    x = input_df.drop(columns=drop_columns)
    y = input_df[pred_attr]

    # Reindexing for ensuring that columns go into the model in same serial every time
    x = reindex_df(x)

    if verbose:
        print('Dropping Columns-', exclude_columns, '\n')
        print('Predictors:', x.columns)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_perc, random_state=random_state,
                                                        shuffle=True)
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=validation_perc,
                                                      random_state=random_state, shuffle=True)

    if outdir:
        makedirs([outdir])
        splits_to_save = {'x_train': x_train, 'y_train': y_train,
                          'x_val': x_val, 'y_val': y_val,
                          'x_test': x_test, 'y_test': y_test}
        for split_name, split_data in splits_to_save.items():
            pd.DataFrame(split_data).to_csv(os.path.join(outdir, f'{split_name}.csv'), index=False)

    return x_train, x_val, x_test, y_train, y_val, y_test


def split_train_val_test_set_by_date(input_dataset_fp, pred_attr, exclude_columns, test_perc=0.3, 
                                     validation_perc=0, random_state=0,
                                     outdir=None, verbose=True):
    """
    Split dataset into train, validation, and test data based on a ratio of dates.

    All rows sharing a value of the 'date' column end up in the same split.

    parameters:
    input_dataset_fp : Input csv/parquet file (with filepath) containing all the predictors, or an already
                       loaded pandas dataframe. Must contain a 'date' column.
    pred_attr : Variable name which will be predicted.
    exclude_columns : List/tuple of columns that will not be included in training the fitted_model, or None.
                      Must not contain 'date' or pred_attr, as both are needed for the split.
    test_perc : The percentage of test dataset. Defaults to 0.3.
    validation_perc : The percentage of validation dataset. Default set to 0 (for no validation data).
    random_state : Seed value used to permute the dates. Defaults to 0.
    outdir : Set a output directory if training and test dataset need to be saved. Defaults to None.
    verbose : Set to True if want to print which columns are being dropped and which will be included in the model.

    returns: x_train, x_val, x_test, y_train, y_val, y_test and list of train_dates, validation_dates, and test_dates.

    raises: Exception if input_dataset_fp is neither a .csv/.parquet filepath nor a pandas dataframe.
    """
    # Check for a dataframe first; only run substring checks on real path-like inputs
    if isinstance(input_dataset_fp, pd.DataFrame):
        input_df = input_dataset_fp
    elif isinstance(input_dataset_fp, (str, os.PathLike)):
        dataset_path = os.fspath(input_dataset_fp)
        if '.csv' in dataset_path:
            input_df = pd.read_csv(dataset_path)
        elif '.parquet' in dataset_path:
            input_df = pd.read_parquet(dataset_path)
        else:
            raise Exception('input_dataset_fp must be a .csv/.parquet file or pandas dataframe')
    else:
        raise Exception('input_dataset_fp must be a .csv/.parquet file or pandas dataframe')

    if exclude_columns is not None:
        # dropping unwanted columns/columns that will not be used in model training
        input_df = input_df.drop(columns=exclude_columns)  # should not drop date column

    # Permuting dates randomly with a local generator, so that random_state is honored and the
    # global numpy random state is left untouched. random_state=0 reproduces np.random.seed(0).
    unique_dates = input_df['date'].unique()
    rng = np.random.RandomState(random_state)
    unique_dates = list(rng.permutation(unique_dates))

    # calculating train/test/validation length
    # NOTE(review): the three lengths are rounded independently, so their sum can differ from the
    # number of unique dates by one (a date may be left out of every split) - confirm acceptable.
    n_dates = len(unique_dates)
    train_length = round(n_dates * (1-test_perc-validation_perc))
    test_length = round(n_dates * test_perc)
    validation_length = round(n_dates * validation_perc)

    # selecting train/test/validation dataset based on randomly permuted dates
    test_end = train_length + test_length
    train_dates = unique_dates[:train_length]
    test_dates = unique_dates[train_length: test_end]
    validation_dates = unique_dates[test_end: (test_end+validation_length)]

    train_df = input_df[input_df['date'].isin(train_dates)]
    test_df = input_df[input_df['date'].isin(test_dates)]
    validation_df = input_df[input_df['date'].isin(validation_dates)]

    # 'date' is only used for splitting and the target must not be a predictor
    non_predictor_columns = ['date', pred_attr]
    x_train = reindex_df(train_df.drop(columns=non_predictor_columns))
    x_test = reindex_df(test_df.drop(columns=non_predictor_columns))
    x_val = reindex_df(validation_df.drop(columns=non_predictor_columns))

    y_train = train_df[pred_attr]
    y_test = test_df[pred_attr]
    y_val = validation_df[pred_attr]

    if verbose:
        print('Dropping Columns-', exclude_columns, '\n')
        print('Predictors:', x_train.columns)

    if outdir:
        makedirs([outdir])
        splits_to_save = {'x_train': x_train, 'y_train': y_train,
                          'x_val': x_val, 'y_val': y_val,
                          'x_test': x_test, 'y_test': y_test}
        for split_name, split_data in splits_to_save.items():
            pd.DataFrame(split_data).to_csv(os.path.join(outdir, f'{split_name}.csv'), index=False)

    return x_train, x_val, x_test, y_train, y_val, y_test, train_dates, validation_dates, test_dates


def calculate_plot_mutual_information(x_train, y_train, exclude_columns=None):
    """
    Calculate and plot mutual information (MI) between the predictors and the target of the training data.

    params:
    x_train : x_train dataframe, or filepath of a csv/parquet file.
    y_train : y_train dataframe/series, or filepath of a csv/parquet file (same file type as x_train).
    exclude_columns : List of columns to exclude from x_train. Defaults to None.

    returns: a series holding mutual information (MI) score of the predictors, sorted in descending order.
             A bar plot of the MI scores is drawn as a side effect.
    """
    if isinstance(x_train, (str, os.PathLike)):
        x_path = os.fspath(x_train)
        if '.csv' in x_path:
            x_train_df = pd.read_csv(x_train)
            y_train_df = pd.read_csv(y_train)
        elif '.parquet' in x_path:
            x_train_df = pd.read_parquet(x_train)
            y_train_df = pd.read_parquet(y_train)
        else:
            raise Exception('x_train must be a .csv/.parquet file or pandas dataframe')
    else:
        # already loaded data
        x_train_df = x_train
        y_train_df = y_train

    if exclude_columns is not None:
        x_train_df = x_train_df.drop(columns=exclude_columns)

    # flattening the target so that a single-column dataframe (e.g. read from file) is passed as 1-D
    target_values = np.ravel(y_train_df)

    mi_scores = mutual_info_regression(x_train_df, target_values, random_state=0)
    mutual_info = pd.Series(mi_scores, index=x_train_df.columns).sort_values(ascending=False)

    mutual_info.plot.bar()

    return mutual_info


def tune_hyperparameter(x, y, model='rf', n_folds=5, repeated_Kfold=False, n_repeats=5, 
                         random_search=True, n_iter=50, n_jobs=-1):
    """
    Hyperparameter optimization using RandomizedSearchCV/GridSearchCV.

    The search is refit on (and the best candidate is selected by) the
    'neg_root_mean_squared_error' score. R2 and MAE are reported as well.

    *****
    good resources for building LGBM model

    https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
    https://lightgbm.readthedocs.io/en/latest/Parameters.html
    https://neptune.ai/blog/lightgbm-parameters-guide
    *****

    Parameters:
    x, y : predictor and target data on which the cross-validated search is fitted.
    model : Model for which hyperparameters will be tuned. Can be 'rf' (Random Forest regressor) or
            'lgbm' (LGBM regressor). Default set to 'rf'.
    n_folds : Number of folds in K Fold CV. Default set to 5.
    repeated_Kfold : Set to True if want to perform repeated Kfold. If False (default), will run for KFold.
    n_repeats : If repeated_Kfold is True, number of repeats. Default set to 5.
    random_search : Set to False if want to perform GridSearchCV. Default set to True to perform RandomizedSearchCV.
    n_iter : Number of parameter combinations to be tested in RandomizedSearchCV if random_search is True.
    n_jobs (rf/gbdt param): The number of jobs to run in parallel. Defaults to -1(using all processors).

    Returns : Dictionary of optimized hyperparameters (one entry per hyperparameter in the search space).

    Raises : Exception if model is not 'rf'/'lgbm'.
    """
    # kept for backward compatibility: the configured estimator is also exposed as a module-level name
    global regressor

    # search space of each supported model
    param_to_optimize_dict = {'rf': {'n_estimators': [100, 200, 300, 400, 500],
                                     'max_depth': [7, 10, 15, 20],
                                     'max_features': [6, 7, 10, 'log2'],
                                     'min_samples_leaf': [5e-4, 1e-5, 1e-3, 6, 12, 20, 25],
                                     'min_samples_split': [6, 7, 8, 10],
                                     'max_samples': [None, 0.9, 0.8, 0.7]
                                     },
                              'lgbm': {'n_estimators': [100, 200, 250],
                                       'max_depth': [7, 10, 13],
                                       'learning_rate': [0.01, 0.05],
                                       'subsample': [0.8, 0.7, 0.6],
                                       'colsample_bytree': [0.8, 0.7],
                                       'colsample_bynode': [0.8, 0.7],
                                       'path_smooth': [0.1, 0.2, 0.3],
                                       'num_leaves': [30, 50, 70],
                                       'min_child_samples': [20, 25, 40],
                                       }
                              }

    # failing early (before printing/searching) for unsupported models
    if model not in param_to_optimize_dict:
        raise Exception("model should be 'rf'/'lgbm'. Other types are not supported currently")

    search_space = param_to_optimize_dict[model]
    print('Model Name:', model)
    pprint(search_space)

    # creating model structures
    if model == 'rf':
        regressor = RandomForestRegressor(random_state=0, n_jobs=n_jobs, bootstrap=True, oob_score=True)
    else:
        # NOTE(review): no boosting_type/data_sample_strategy is passed here, so the search runs with the
        # LightGBM defaults, while train_model() fits with data_sample_strategy='goss' - confirm intended.
        regressor = LGBMRegressor(tree_learner='serial', random_state=0, 
                                  deterministic=True, force_row_wise=True, n_jobs=n_jobs)

    scoring_metrics = ['r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error']
    refit_metric = scoring_metrics[1]  # best candidate is the one with the lowest RMSE

    # KFold or repeated KFold
    if repeated_Kfold:
        kfold = RepeatedKFold(n_splits=n_folds, n_repeats=n_repeats, random_state=0)
    else:
        kfold = KFold(n_splits=n_folds, shuffle=True, random_state=0)

    # Random search or grid search
    if random_search:
        fitted_model = RandomizedSearchCV(estimator=regressor, param_distributions=search_space, n_iter=n_iter,
                                          cv=kfold, verbose=1, random_state=0, n_jobs=n_jobs,
                                          scoring=scoring_metrics, refit=refit_metric, return_train_score=True)
    else:
        fitted_model = GridSearchCV(estimator=regressor, param_grid=search_space, cv=kfold, verbose=1,
                                    n_jobs=n_jobs, scoring=scoring_metrics, refit=refit_metric,
                                    return_train_score=True)

    fitted_model.fit(x, y)

    cv_results = fitted_model.cv_results_
    best_index = fitted_model.best_index_

    print('\n')
    print('best parameters for RMSE value', '\n')
    pprint(fitted_model.best_params_)

    # 'train' are the in-fold scores, 'test' are the held-out fold scores of the cross-validation
    for heading, split in (('Train Results....', 'train'), ('Test Results....', 'test')):
        # the 'neg_*' scorers return negated errors; flipping the sign to report the actual RMSE/MAE
        best_rmse = -cv_results[f'mean_{split}_neg_root_mean_squared_error'][best_index]
        best_r2 = cv_results[f'mean_{split}_r2'][best_index]
        best_MAE = -cv_results[f'mean_{split}_neg_mean_absolute_error'][best_index]

        print('\n')
        print(heading)
        print('Best tuning-{} RMSE: {:.3f}'.format(split, best_rmse))
        print('Best tuning-{} R2: {:.3f}'.format(split, best_r2))
        print('Best tuning-{} MAE: {:.3f}'.format(split, best_MAE))

    # returning the best value of every hyperparameter of the search space
    param_dict = {param_name: fitted_model.best_params_[param_name] for param_name in search_space}

    return param_dict


def train_model(x_train, y_train, params_dict, model='rf', n_jobs=-1,
                load_model=False, save_model=False, save_folder=None, save_name=None,
                tune_hyperparameters=False, repeated_Kfold=False, n_folds=5, n_iter=10, n_repeats=5):
    """
    Train a Random Forest/LGBM regressor model with given (or tuned) hyperparameters, or load a saved model.


    *******
    # To run the model without saving/loading the trained model, use load_model=False, save_model=False, save_folder=None, 
        save_name=None.
    # To run the model and save it without loading any trained model, use load_model=False, save_model=True, 
        save_folder='give a folder path', save_name='give a name'.
    # To load a pretrained model without running a new model, use load_model=True, save_model=False, 
        save_folder='give the saved folder path', save_name='give the saved name'.
    *******


    params: 
    x_train, y_train : x_train (predictor) and y_train (target) data.
    params_dict : ML model param dictionary. Currently supports 'random forest (RF)' and 'LGBM (lgbm)' Goss.
                  **** when tuning hyperparameters set params_dict=None.
                  For RF the dictionary should be like the following with user defined values- 
                    param_dict = {'n_estimators': 200,
                                  'max_depth': 8,
                                  'max_features': 'log2',
                                  'min_samples_leaf': 6,
                                  'min_samples_split': 4,
                                  'max_samples': None
                                 }
                  For LGBM the dictionary should be like the following with user defined values- 
                    param_dict = {'n_estimators': 250,
                                  'max_depth': 13,
                                  'learning_rate': 0.05,
                                  'subsample': 0.7,
                                  'colsample_bytree': 0.8,
                                  'colsample_bynode': 0.7 ,
                                  'path_smooth': 0.2,
                                  'num_leaves': 70,
                                  'min_child_samples': 40
                                  }
                  For LGBM, data_sample_strategy is always set to 'goss' by this function; a 
                  'data_sample_strategy' key in the dictionary is ignored.
    model : str of type of model, 'rf' or 'lgbm'. Default set to 'rf'.
    n_jobs (rf/lgbm param): The number of jobs to run in parallel. Default set to to -1 (using all processors).
    load_model : Set to True if want to load saved model. Default set to False.
    save_model : Set to True if want to save model. Default set to False.
    save_folder : Filepath of folder to save/load model. Default set to None for save_model=False.
                  If save_folder and save_name are both given, the training runtime is also written there 
                  as a text file.
    save_name : Model's name to save/load with. Default set to None for save_model=False.
    tune_hyperparameters : Set to True to tune hyperparameters with tune_hyperparameter() (random search) 
                           on x_train, y_train before training. Default set to False.
    repeated_Kfold, n_folds, n_iter, n_repeats : Passed to tune_hyperparameter() when tune_hyperparameters=True.

    returns: trained (or loaded) regression model.

    raises: Exception if model is not 'rf'/'lgbm'. 
            ValueError if save_model=True but save_folder/save_name is missing.
    """
    if load_model:
        # NOTE: joblib.load unpickles the file; only load models from a trusted source
        model_file_name = save_name if '.joblib' in save_name else save_name + '.joblib'
        return joblib.load(os.path.join(save_folder, model_file_name))

    # validating inputs before spending time on tuning/training
    if model not in ('rf', 'lgbm'):
        raise Exception("model should be 'rf'/'lgbm'. Other types are not supported currently")
    if save_model and (save_folder is None or save_name is None):
        raise ValueError('save_folder and save_name must be provided when save_model=True')

    start_time = timeit.default_timer()
    if tune_hyperparameters:
        params_dict = tune_hyperparameter(x=x_train, y=y_train, model=model, 
                                          n_folds=n_folds, repeated_Kfold=repeated_Kfold, 
                                          n_repeats=n_repeats, n_iter=n_iter,
                                          random_search=True, n_jobs=n_jobs)

    # Configuring the regressor; only the listed hyperparameters are read from params_dict
    if model == 'rf':
        rf_param_names = ('n_estimators', 'max_features', 'max_depth', 'min_samples_leaf', 
                          'min_samples_split', 'max_samples')
        rf_params = {name: params_dict[name] for name in rf_param_names}
        regressor_model = RandomForestRegressor(random_state=0, n_jobs=n_jobs, bootstrap=True, 
                                                oob_score=True, **rf_params)
    else:
        lgbm_param_names = ('n_estimators', 'max_depth', 'learning_rate', 'subsample', 'colsample_bytree', 
                            'colsample_bynode', 'path_smooth', 'num_leaves', 'min_child_samples')
        lgbm_params = {name: params_dict[name] for name in lgbm_param_names}
        # using 'goss' by default here as we are using 'GOSS' Gradient boosting methods
        regressor_model = LGBMRegressor(data_sample_strategy='goss', tree_learner='serial', random_state=0, 
                                        deterministic=True, force_row_wise=True, n_jobs=n_jobs, **lgbm_params)

    trained_model = regressor_model.fit(x_train, y_train)
    y_pred = trained_model.predict(x_train)

    print('Train RMSE = {:.3f}'.format(calculate_rmse(Y_pred=y_pred, Y_obsv=y_train)))
    print('Train R2 = {:.3f}'.format(calculate_r2(Y_pred=y_pred, Y_obsv=y_train)))

    if save_model:
        makedirs([save_folder])
        # appending the extension only when it is missing
        model_save_name = save_name if '.joblib' in save_name else save_name + '.joblib'
        joblib.dump(trained_model, os.path.join(save_folder, model_save_name), compress=3)

    # printing and saving runtime
    runtime = (timeit.default_timer() - start_time) / 60
    print('model training time {:.3f} mins'.format(runtime))

    # the runtime file can only be written when a location is available (skipped in the no-save mode)
    if save_folder is not None and save_name is not None:
        makedirs([save_folder])
        if tune_hyperparameters:  # hyperparameter tuning + model training time
            runtime_suffix = '_tuning_training_runtime.txt'
        else:  # model training time with given parameters
            runtime_suffix = '_training_runtime.txt'
        with open(os.path.join(save_folder, save_name + runtime_suffix), 'w') as file:
            file.write(f'model training time {runtime} mins')

    return trained_model


def run_model_to_generate_prediction(trained_ml_model, x_train, predictor_era5_dataset, ref_raster, 
                                     output_folder, variable_name_keyword):
    """
    Uses trained ML model to generate predicted daily raster.

    params:
    trained_ml_model : A trained ML model object. This will come from train_model() function.
    x_train : x_train dataframe generated by split_train_val_test_set() function. Only its column names are used.
    predictor_era5_dataset : Filepath of parquet file of ERA5 dataset. Must have 'date', 'lat', 'lon' columns;
                             the first 8 characters of 'date' are parsed as YYYYMMDD.
    ref_raster : Filepath of a reference raster that will be used to rasterize. 
                 Use any TWC raster data as that is the target resolution. 
    output_folder : Output folder filepath to save predicted daily raster.
    variable_name_keyword : a keyword (str) that will be used to save predicted raster. For example 'total_precip'. 
                            Date will be added automatically from era5 dataset.

    returns: None.
    """
    makedirs([output_folder])
    columns_trained_with = x_train.columns.tolist()

    era5_df = pd.read_parquet(predictor_era5_dataset)

    # extracting year/month/day from the YYYYMMDD date
    date_str = era5_df['date'].astype(str)
    era5_df['year'] = date_str.str[0:4].astype('int64')
    era5_df['month'] = date_str.str[4:6].astype('int64')
    era5_df['day'] = date_str.str[6:8].astype('int64')

    era5_rename_dict = {'max_temp': 'max_temp_era5',  
                        'avg_Rhumid': 'avg_Rhumid_era5', 
                        'min_temp': 'min_temp_era5', 
                        'avg_wind_speed': 'avg_wind_speed_era5'}
    era5_df = era5_df.rename(columns=era5_rename_dict)

    # selecting columns for which model was trained with using xtrain columns
    selected_era5_df = reindex_df(era5_df[columns_trained_with])

    prediction_arr = trained_ml_model.predict(selected_era5_df)

    # Attaching date+lat+lon info with the predicted high resolution data.
    # Predictions come back in row order, so they are attached by position; matching on the original
    # index values would misalign (or yield NaN) whenever the parquet index is not 0..n-1.
    prediction_df = era5_df[['date', 'lat', 'lon']].reset_index()
    prediction_df['high res. prediction'] = prediction_arr

    # creating prediction raster for each day
    print('Generating model interpolated daily rasters...')

    # grouping once (dates in sorted order) instead of masking the full dataframe for every date
    for date, pred_df_1day in prediction_df.groupby('date', sort=True):
        # converting to geodataframe
        pred_1day_gdf = gpd.GeoDataFrame(pred_df_1day, 
                                         geometry=gpd.points_from_xy(pred_df_1day.lon, pred_df_1day.lat))

        raster_name = f'{variable_name_keyword}_{date}.tif'
        output_raster = os.path.join(output_folder, raster_name)
        rasterize_shapefile(input_file=pred_1day_gdf, output_raster=output_raster, attribute='high res. prediction', 
                            ref_raster=ref_raster, date=None, grid_shapefile=None, 
                            merge_alg = MergeAlg.replace, dtype='float32', no_data_value=-9999, paste_on_ref_raster=True)
    print('All daily rasters generated')
        
        
def plot_predictor_importance(trained_model, x_train, outdir=None, predictor_imp_keyword='rf'):
    """
    Draw a bar plot of the predictor importance of a trained model, highest importance first.

    params:
    trained_model : Trained model object exposing a feature_importances_ attribute.
    x_train : x_train data; its column labels are used as predictor names.
    outdir : Folder to save the plot in. Default set to None (plot is not saved).
    predictor_imp_keyword : Prefix of the saved file name ('<keyword>_pred_importance.png'). Default set to 'rf'.

    returns: None.
    """
    predictor_names = np.array(pd.DataFrame(x_train).columns)
    importance_scores = np.array(trained_model.feature_importances_)

    # one row per predictor, ranked from most to least important
    ranked_df = pd.DataFrame(
        {'feature_names': predictor_names, 'feature_importance': importance_scores}
    ).sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(8, 6))
    plt.rcParams['font.size'] = 14
    sns.barplot(x=ranked_df['feature_names'], y=ranked_df['feature_importance'], palette='rocket')
    plt.xticks(rotation=90)
    plt.ylabel('Variable Importance')
    plt.xlabel('Variables')
    plt.tight_layout()

    if outdir is None:
        return

    plot_file = predictor_imp_keyword + '_pred_importance.png'
    plt.savefig(os.path.join(outdir, plot_file), dpi=600)
    print('Feature importance plot saved')

## Error Metrics

In [4]:
def calculate_r2(Y_pred, Y_obsv):
    """
    Computes the R2 score of model prediction against observed data.

    :param Y_pred: prediction array or panda series object.
    :param Y_obsv: observed array or panda series object.

    :return: R2 value.
    """
    # numpy predictions are wrapped in a pandas series; anything else is used as it is
    predicted = pd.Series(Y_pred) if isinstance(Y_pred, np.ndarray) else Y_pred
    return r2_score(Y_obsv, predicted)


def calculate_rmse(Y_pred, Y_obsv):
    """
    Calculates RMSE value of model prediction vs observed data.

    :param Y_pred: prediction array or panda series object.
    :param Y_obsv: observed array or panda series object.

    :return: RMSE value.
    """
    if isinstance(Y_pred, np.ndarray):
        Y_pred = pd.Series(Y_pred)

    # The 'squared' argument of mean_squared_error is deprecated/removed in recent scikit-learn
    # versions, so the root is taken explicitly. Taking the root per output before averaging 
    # gives the same value as squared=False did.
    mse_per_output = mean_squared_error(y_true=Y_obsv, y_pred=Y_pred, multioutput='raw_values')
    rmse_val = np.mean(np.sqrt(mse_per_output))
    return rmse_val


def scatter_plot(Y_pred, Y_obsv, xlabel, ylabel, title=None, savedir=None, plot_name=None, alpha=0.5, color_format='o',
                 axis_lim=None):
    """
    Draws a scatter plot of model prediction vs observed data with a diagonal reference line and the R2 value.

    :param Y_pred: flattened prediction array.
    :param Y_obsv: flattened observed array.
    :param xlabel: label of the x axis (observed data).
    :param ylabel: label of the y axis (predicted data).
    :param title: plot title. Default set to None (no title).
    :param savedir: filepath to save the plot. Default set to None (plot is not saved).
    :param plot_name: plot name to save with. Default set to None.
    :param alpha: plot/scatter dots transparency level.
    :param color_format: Color and plot type format. For example, for 'bo' 'b' means blue color and 'o' means dot plot.
    :param axis_lim: A list of minimum and maximum values of x and y axis. 
                     Default set to None (will calculate and set xlim, ylim itself)

    :return: None. The plot is drawn and, if savedir is given, saved.
    """
    predicted = pd.Series(Y_pred) if isinstance(Y_pred, np.ndarray) else Y_pred

    # common data range of predicted and observed values, used when no axis limit is given
    lower_bound = min(predicted.min(), Y_obsv.min())
    upper_bound = max(predicted.max(), Y_obsv.max())

    plt.rcParams.update({'font.size': 16})
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(Y_obsv, predicted, color_format, alpha=alpha)
    # diagonal from corner to corner of the axes
    ax.plot([0, 1], [0, 1], '-r', transform=ax.transAxes)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # both axes share the same limits
    limits = axis_lim if axis_lim else [lower_bound, upper_bound]
    ax.set_xlim(limits)
    ax.set_ylim(limits)

    if title is not None:
        ax.set_title(title)

    r2_label = 'R2: {:.3f}'.format(calculate_r2(predicted, Y_obsv))
    ax.text(0.1, 0.9, s=r2_label, transform=ax.transAxes)

    if savedir is None:
        return

    makedirs([savedir])
    fig.savefig(os.path.join(savedir, plot_name), dpi=300)
        
        
def result_calc_test_dataset(trained_ml_model, list_test_dates, x_train, combined_dataset, era5_target_variable, 
                             twc_target_variable, output_csv):
    """
    Evaluate ML model performance on the test dataset, one date at a time.

    For every test date, R2 is computed between the ERA5 target and the TWC target (before ML) 
    and between the model prediction and the TWC target (after ML).

    params:
    trained_ml_model : Trained ml model.
    list_test_dates : List of test dates.
    x_train : x_train dataframe. Its columns select the predictors passed to the model.
    combined_dataset : Filepath (parquet) of TWC and ERA5 combined dataset or the loaded dataframe.
    era5_target_variable : target variable name from ERA5 dataset.
    twc_target_variable : target variable name from TWC dataset.
    output_csv : Filepath of output csv.

    returns: Test results dataframe with before and after ML R2 values.
    """
    if not isinstance(combined_dataset, pd.DataFrame):
        combined_df = pd.read_parquet(combined_dataset)
    else:
        combined_df = combined_dataset

    r2_before_list, r2_after_list = [], []
    for test_date in list_test_dates:
        daily_df = combined_df[combined_df.date == test_date]

        # before ML: ERA5 target compared directly with TWC target
        era5_values = daily_df[era5_target_variable]
        twc_values = daily_df[[twc_target_variable]].values
        r2_before_list.append(calculate_r2(Y_pred=era5_values, Y_obsv=twc_values))

        # after ML: model prediction compared with TWC target
        predictors_df = reindex_df(daily_df[x_train.columns])
        daily_prediction = trained_ml_model.predict(predictors_df)
        r2_after_list.append(calculate_r2(Y_pred=daily_prediction, Y_obsv=twc_values))

    test_results = pd.DataFrame({'date': list_test_dates, 
                                 'before_ML_R2': r2_before_list, 
                                 'after_ML_R2': r2_after_list})
    test_results.to_csv(output_csv)

    # printing results
    print(f'{len(list_test_dates)} number of total dates in test dataset')
    for r2_threshold in (0.6, 0.5, 0.3):
        n_days = int((test_results.after_ML_R2 > r2_threshold).sum())
        print(f'{n_days} days have R2 > {r2_threshold}')

    return test_results