# 0. Libraries and importing the data
In this case, we will only import the training data, as this step in the machine learning pipeline involves feature selection and hyperparameter tuning.

In [1]:
import pandas as pd
import numpy as np

# Graphical tools
import plotly.express as px

# Hyperparameter tuning and cross-validation
from sklearn.model_selection import RandomizedSearchCV

# For creating a pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline

# For preprocessing
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier

# Classes for dealing with imbalanced datasets
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Utilities
from scipy.stats import uniform, randint, loguniform
import pickle
import os

## Preprocessed dataset version 3

In [None]:
csv_path = os.path.join("preprocessed_datasets", "06_train_dataset_v3.csv")
dfv3 = pd.read_csv(csv_path)

dfv3.head()

Unnamed: 0,HOSPITAL_EXPIRE_FLAG,subject_id,hadm_id,icustay_id,HeartRate_Min,HeartRate_Max,HeartRate_Mean,SysBP_Min,SysBP_Max,SysBP_Mean,...,MARITAL_STATUS_MARRIED,MARITAL_STATUS_SEPARATED,MARITAL_STATUS_SINGLE,MARITAL_STATUS_UNKNOWN (DEFAULT),MARITAL_STATUS_WIDOWED,FIRST_CAREUNIT_CCU,FIRST_CAREUNIT_CSRU,FIRST_CAREUNIT_MICU,FIRST_CAREUNIT_SICU,FIRST_CAREUNIT_TSICU
0,0.0,77502,151200,299699,89.0,116.0,102.677419,97.0,150.0,126.0,...,False,False,False,False,False,False,False,True,False,False
1,0.0,44346,140114,250021,74.0,114.0,92.204082,87.0,160.0,122.0,...,False,False,False,False,False,False,False,False,False,True
2,0.0,92438,118589,288511,59.0,89.0,70.581395,88.0,160.0,120.933333,...,True,False,False,False,False,False,True,False,False,False
3,1.0,83663,125553,278204,75.0,86.0,80.4,74.0,102.0,85.227273,...,True,False,False,False,False,False,False,True,False,False
4,0.0,85941,181409,292581,77.0,107.0,91.020408,95.0,150.0,108.625,...,False,False,False,False,False,False,False,True,False,False


In [3]:
dfv3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 56 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   HOSPITAL_EXPIRE_FLAG              8000 non-null   float64
 1   subject_id                        8000 non-null   int64  
 2   hadm_id                           8000 non-null   int64  
 3   icustay_id                        8000 non-null   int64  
 4   HeartRate_Min                     7167 non-null   float64
 5   HeartRate_Max                     7167 non-null   float64
 6   HeartRate_Mean                    7167 non-null   float64
 7   SysBP_Min                         7160 non-null   float64
 8   SysBP_Max                         7160 non-null   float64
 9   SysBP_Mean                        7160 non-null   float64
 10  DiasBP_Min                        7160 non-null   float64
 11  DiasBP_Max                        7160 non-null   float64
 12  DiasBP

## Lists of features

In [None]:
# Categorical features

categorical_features = dfv3.select_dtypes(include=['bool']).columns.tolist()

# Numerical features

all_numerical_features= dfv3.select_dtypes(include=['number']).columns.tolist()

not_numerical = ['HOSPITAL_EXPIRE_FLAG', 'subject_id', 'hadm_id', 'icustay_id', 'train'] 

numerical_features = [col for col in all_numerical_features if col not in not_numerical + categorical_features]

print(f'Number of numerical features: {len(numerical_features)}\n', numerical_features, '\n')

print(f'Number of categorical features: {len(categorical_features)}\n', categorical_features, '\n')

print(f'Total number of features: {len(numerical_features) + len(categorical_features)}')

Number of numerical features: 29
 ['HeartRate_Min', 'HeartRate_Max', 'HeartRate_Mean', 'SysBP_Min', 'SysBP_Max', 'SysBP_Mean', 'DiasBP_Min', 'DiasBP_Max', 'DiasBP_Mean', 'MeanBP_Min', 'MeanBP_Max', 'MeanBP_Mean', 'RespRate_Min', 'RespRate_Max', 'RespRate_Mean', 'TempC_Min', 'TempC_Max', 'TempC_Mean', 'SpO2_Min', 'SpO2_Max', 'SpO2_Mean', 'Glucose_Min', 'Glucose_Max', 'Glucose_Mean', 'cci_index', 'icustays_per_hospstay', 'icustays_per_subject', 'number_comorbidities', 'age_admission'] 

Number of categorical features: 22
 ['GENDER_F', 'GENDER_M', 'ADMISSION_TYPE_ELECTIVE', 'ADMISSION_TYPE_EMERGENCY', 'ADMISSION_TYPE_URGENT', 'INSURANCE_Government', 'INSURANCE_Medicaid', 'INSURANCE_Medicare', 'INSURANCE_Private', 'INSURANCE_Self Pay', 'MARITAL_STATUS_DIVORCED', 'MARITAL_STATUS_LIFE PARTNER', 'MARITAL_STATUS_MARRIED', 'MARITAL_STATUS_SEPARATED', 'MARITAL_STATUS_SINGLE', 'MARITAL_STATUS_UNKNOWN (DEFAULT)', 'MARITAL_STATUS_WIDOWED', 'FIRST_CAREUNIT_CCU', 'FIRST_CAREUNIT_CSRU', 'FIRST_CAREUNI

Remember: here I have included all of the dummies of the categorical variables, but for models with linearities (e.g., logistic regression) one of these should be dropped. 

## Preliminary considerations: class imbalance
First of all, for evaluating the model it is worth noticing, again, that the dataset is highly imbalanced. The plots below show this characteristic of the dataset, where the first plot shows the relative frequencies of the outcome.

### Imbalancing in the training dataset

In [5]:
# We save the relative frequencies of the 'position_grouped' column
relative_freq = dfv3['HOSPITAL_EXPIRE_FLAG'].value_counts(normalize=True).reset_index()

# We change the naming of the columns
relative_freq.columns = ['Outcome', 'Relative Frequency']

# Create a bar plot with Plotly
fig = px.bar(relative_freq, x='Outcome', y='Relative Frequency',
                title=f'Relative Frequency Barplot of the outcome variable')

# Show the plot
fig.show()

As already shown in the EDA, compared to deaths during the ICU stay, survival (0) is more prevalent. This can be problematic if the relative frequency of the outcome in the test dataset is significantly different.

# 4. Feature and model selection and evaluation
Tasks to do in this section:
1. **Split** the data.
2. Impute **missing values** (if there is no listwise deletion).
3. Clean **outliers** (imputation or deletion).
4. **Standardization** of numerical variables (in case a model needs it, like linear SVM). Apply same standardization to validation data. No further transformations will be needed if categorical variables have been OHE or encoded in some other way.
5. Deal with **class imbalancing** for the training data (from `imblearn` - `over_sampling`, `under_sampling`, import `RandomUnderSampler`, `RandomOverSampler` and `SMOTE`).
6. **Hyperparameter tuning** (grid search or randomized search - `RandomizedSearchCV`).
7. Evaluate model through **cross-validation** (considering class imbalancing, stratified cross-validation can be a good option).
8. Rank models with **evaluation metrics** (mainly: AUC, but could also use precision, recall and F1-score).
9. After finding the best-performing model (in this case, guided by the best AUC), train the best model with the whole training dataset and the optimal hyperparameters.

## Some tips

How and when to apply resampling methodologies:
- Resampling methodologies should be applied after the train-validation split.
- Explanation:
    - Avoid information leakage: Resampling before splitting can cause synthetic data (oversampling) or missing data (undersampling) from the same original observations to appear in both train and validation sets. This results in overly optimistic validation performance.
    - True model evaluation: Resampling alters the class distribution, which should only affect the training data. The validation set should reflect the original distribution for an accurate evaluation of how the model would perform in the real world.
- Process:
    - Split the data into training and validation sets.
    - Preprocessing and feature engineering (so cleaning missing values, outliers and standardizing come BEFORE the application of resampling techniques, which can be affected if these transformations are not applied before). Concretely, the application of SMOTE would be biased if numerical features are not scaled beforehand.
    - Apply resampling techniques (oversampling, undersampling, or SMOTE) only on the training set (as a reference, see https://imbalanced-learn.org/stable/common_pitfalls.html).
    - Train the model on the resampled training set.
    - Evaluate the model on the original, untouched validation set.

Stratified cross-validation and resampling. Does it make sense to apply both of them?
- Without Resampling: Stratified cross-validation is ideal for imbalanced datasets because it ensures that each fold maintains the same class distribution as the original dataset.
- With Resampling: Stratification is less relevant because the class distribution in the training folds will be altered by resampling. However, you can still use stratified cross-validation on the unresampled validation set to evaluate the model. The goal here is to maintain the real-world class distribution in the validation set while resampling the training data within each fold.
- Suggested Strategy:
    - Use stratified splits for train-validation splitting.
    - Apply resampling within the training folds during each iteration of cross-validation.

## General training function

In [47]:
def baseline_models(
        df: pd.DataFrame, numerical_features: list[str], categorical_features: list[str], 
        target: str, model, ada_boost: bool = False, n_hyper_comb: int = 10, num_imputer: str = "median",
        refit_score: str = 'roc_auc', resampling_method: str = 'none'
):
    
    """
    Function to evaluate baseline machine learning models for a classification task, 
    with hyperparameter tuning using RandomizedSearchCV. It preprocesses numerical and 
    categorical features, performs model fitting, and evaluates performance on the entire 
    dataset. Additionally, it supports handling of class imbalance through various resampling 
    methods (oversampling, undersampling, SMOTE).

    Parameters:
    -----------
    df : pd.DataFrame
        The input dataset containing both numerical and categorical features as well as the target variable.
        
    numerical_features : list[str]
        List of column names representing numerical features in the dataset.

    categorical_features : list[str]
        List of column names representing categorical features in the dataset.
        
    target : str
        The name of the target variable (i.e., the variable to be predicted).
    
    model : class
        The model class to be used for training and evaluation (DecisionTreeClassifier, 
        SVC, RandomForestClassifier, GradientBoostingClassifier, LGBMClassifier).
    
    ada_boost : bool, default = False
        Implement adaptive boosting (can only be applied to decision trees).  

    n_hyper_comb : int, default=10
        The number of hyperparameter combinations to sample for RandomizedSearchCV.
        
    num_imputer : str, default="median"
        The strategy to use for imputing missing values in numerical features (e.g., "mean", "median").
    
    refit_score : str, default="roc_auc"
        Score based on which the RandomizedSearchCV selects the optimal hyperparameters.
        Allowed scores are: ('f1_weighted', 'f1_micro', 'f1_macro', 'roc_auc').
        
    resampling_method : str, default='none'
        Method used to address class imbalance:
        - 'oversample' for RandomOverSampler
        - 'undersample' for RandomUnderSampler
        - 'smote' for SMOTE (Synthetic Minority Over-sampling Technique)
        - 'none' for no resampling.

    Returns:
    --------

    optimal_hyperparameters : dict
        A dictionary of the optimal hyperparameters found by RandomizedSearchCV.

    cv_best_scores : dict
        A dictionary of best cross-validation scores for the model. Includes:
        - 'f1_micro': Micro-average F1 score (global metric with total TP, FN and FP)
        - 'f1_macro': Macro-average F1 score (metric for each class, find unweighted mean - 
        doesn't take label imbalance into account)
        - 'roc_auc': non-weighted ROC AUC score
    """

    ###########################################################################

    # First, we create the pipeline for preprocessing numerical and categorical transformers

    # For numerical variables, we do simple imputation
    num_prepr = [
        ("imputer", SimpleImputer(strategy = num_imputer))
    ]

    # We add standardization for certain models that benefit from it
    if model == SVC or resampling_method == 'smote':
        num_prepr.append(("standardize", StandardScaler()))

    # For categorical variables, for now, we don't do anything (pass through)
    cat_prepr = [
        ("passthrough", "passthrough")
    ]

    # Create the ColumnTransformer to apply different transformations to numerical
    # and categorical variables
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline(num_prepr), numerical_features),
            ('cat', Pipeline(cat_prepr), categorical_features),
        ]
    )

    # Create steps of the pipeline, which begins with the preprocessor. 
    steps = [('preprocessor', preprocessor)]

    ###########################################################################

    # Second, we add the resampling step based on the chosen method (resampling after 
    # preprocessing and feature engineering)
    if resampling_method == 'oversample':
        resampling = RandomOverSampler(sampling_strategy = 'auto', random_state=17)
    elif resampling_method == 'undersample':
        resampling = RandomUnderSampler(sampling_strategy = 'auto', random_state=17)
    elif resampling_method == 'smote':
        resampling = SMOTE(sampling_strategy = 'auto', random_state=17)
    elif resampling_method == 'none':
        resampling = None  # No resampling
    else:
        raise ValueError('Invalid resampling type. Insert resampling method "oversample", "undersample", "smote" or "none" for no resampling.')

    if resampling:
        steps.append(('resampling', resampling))

    ###########################################################################

    # Third, depending on the model that has been chosen, we append one model
    # or another, and define the set of hyperparameters accordingly

    # We check if the model is one of the allowed model classes
    allowed_models = (DecisionTreeClassifier, SVC,
                      RandomForestClassifier, GradientBoostingClassifier, LGBMClassifier)

    # We consider the possibility of applying adaptive boosting to the chosen model

    if ada_boost == False:
        
        if model == SVC:
            steps.append(('svc', 
                        model(
                            random_state = 17,
                            class_weight = None,
                            probability = False  # In the hyperparameter tuning stage, we won't predict probabilities (as it significantly slows down training)
                        )))
            
            distributions = {
                'svc__C': loguniform(1e-3, 1e3),  # Regularization parameter: smaller values allow more margin violations (simpler, potentially underfitting model), larger values enforce stricter separation.
                'svc__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # Choice of kernel function: it is used to transform data into a higher-dimensional space
                'svc__gamma': loguniform(1e-4, 1e1),  # Defines how far the influence of a single training example reaches. A lower value considers more distant points, while a higher value focuses on near points (only for ‘rbf’, ‘poly’, and ‘sigmoid’)
                'svc__degree': randint(2, 5),  # Degree of polynomial kernel (only used if kernel='poly')
                'svc__coef0': uniform(loc=0.0, scale=1.0),  # Used in 'poly' and 'sigmoid' kernels, controls the influence of higher-order terms
                'svc__tol': loguniform(1e-5, 1e-1),  # Tolerance for stopping criterion: lower values make the optimization stricter.
            }

        elif model == DecisionTreeClassifier:
            steps.append(('dtc', 
                                    model(
                                    random_state = 17,
                                    class_weight = None
                                    )))
            distributions = {
            'dtc__max_depth': randint(low = 1, high = 100),
            'dtc__max_features': randint(low = 1, high = (len(df[numerical_features + categorical_features].columns))),
            'dtc__max_leaf_nodes': randint(low = 1, high = 5000),
            'dtc__min_samples_leaf': uniform(loc=0.001, scale=0.199),
            'dtc__min_samples_split': uniform(loc=0.001, scale=0.199),            
            }
        
        elif model == RandomForestClassifier:
            steps.append(('rfc', 
                                    model(
                                    n_estimators = 1000,
                                    random_state = 17,
                                    oob_score = False,
                                    class_weight = None,
                                    n_jobs = -1
                                    )))
            distributions = {
            'rfc__max_depth': randint(low = 1, high = 300),
            'rfc__max_features': randint(low = 1, high = (len(df[numerical_features + categorical_features].columns))),
            'rfc__max_leaf_nodes': randint(low = 1, high = 5000),
            'rfc__min_samples_leaf': uniform(loc=0.001, scale=0.199),
            'rfc__min_samples_split': uniform(loc=0.001, scale=0.199),
            }
        
        elif model == GradientBoostingClassifier:
            steps.append(('gbc', 
                        model(
                            loss = 'log_loss',
                            random_state=17
                        )))
            distributions = {
                'gbc__learning_rate': [0.1, 0.5, 1.0],
                'gbc__max_depth': randint(low = 1, high = 300),
                'gbc__max_features': randint(low = 1, high = (len(df[numerical_features + categorical_features].columns))),
                'gbc__max_leaf_nodes': randint(low = 1, high = 5000),
                'gbc__min_samples_leaf': uniform(loc=0.001, scale=0.199),
                'gbc__min_samples_split': uniform(loc=0.001, scale=0.199),
                'gbc__n_estimators': [50, 100, 500],
                'gbc__subsample': uniform(loc=0.7, scale=0.3) # Subsample ratio (proportion of training samples to be used for training each estimator)
            }

        elif model == LGBMClassifier:
            steps.append(('lgb',
                        model(
                            objective = 'binary',
                            n_estimators = 100, # Number of boosting iterations (since LightGBM uses decision trees as the learners, this can also be thought of as “number of trees”. Same as n_estimators)
                            max_depth = -1, # No limit max_depth (default), set to this value as LightGBM focuses on leaf-wise tree growth algorithm
                            subsample = 1.0,  # In each iteration, train with all the training instances (since we already do CV)
                            device_type = 'cpu', # Change this parameter if prefer CPU to run the algorithm over GPU (though, since the dataset is small, in this case CPU might perform better)
                            verbosity = -1 # We disable all the information except for fatal messages
                    )))
            distributions = {
                'lgb__boosting_type': ['gbdt', 'rf'], # 'gbdt': Gradient Boosting Decision Tree. For boosting with Random Forests: 'rf' instead.
                'lgb__feature_fraction': uniform(loc = 0.5, scale = 0.5), # LightGBM will randomly select a subset of features on each iteration (tree) if feature_fraction is smaller than 1.0. Speeds up training and deals with overfitting
                'lgb__learning_rate': [0.01, 0.15, 0.3], # Lower values (e.g., 0.01) often yield better performance but require more iterations.
                'lgb__min_child_samples': randint(low = 5, high = 30), # Higher values prevent overfitting by ensuring leaves have enough data.
                'lgb__min_split_gain': uniform(loc = 0.0, scale = 0.1), # Minimum loss reduction required to make a further partition on a leaf node of the tree.
                'lgb__num_leaves': [15, 31, 63, 127], # Larger values increase model complexity and the risk of overfitting.
                'lgb__reg_lambda': uniform(loc = 0.0, scale = 1.0) # Ridge regularization parameter (alpha, default is 0)
            }

        else:
            raise ValueError(f"Invalid model type. Allowed models are: {allowed_models}")
    
    else:  # Apply adaptive boosting (only to DecisionTreeClassifier)

        if model == DecisionTreeClassifier:
            base_estimator = model(
                random_state = 17,
                class_weight = None
            )
            ada_boost = AdaBoostClassifier(
                estimator = base_estimator,
                algorithm = 'SAMME',
                random_state = 17
            )
            steps.append(('adaboost', ada_boost))
            distributions = {
            'adaboost__estimator__max_depth': randint(low = 1, high = 100),
            'adaboost__estimator__max_features': randint(low = 1, high = (len(df[numerical_features + categorical_features].columns))),
            'adaboost__estimator__max_leaf_nodes': randint(low = 1, high = 5000),
            'adaboost__estimator__min_samples_leaf': uniform(loc=0.001, scale=0.199),
            'adaboost__estimator__min_samples_split': uniform(loc=0.001, scale=0.199),
            'adaboost__learning_rate': [0.1, 0.5, 1.0],
            'adaboost__n_estimators': [50, 100, 500]
            }

        else:
            raise ValueError("Only Decision Trees are allowed for AdaBoost (Random Forests are not due to use of computing resources).")
    
    ###########################################################################

    # Fourth, we create the pipeline with the preprocessing, resampling and the model
    pipeline = ImbPipeline(steps) # We use Pipeline from imblearn instead of sklearn's

    ###########################################################################

    # Fifth, we split the dataset into features and outcome

    # We divide the data frames depending on the variables

    X = df[numerical_features + categorical_features]
    y = df[target]

    ###########################################################################

    # Sixth, we do the randomized search with the model
    
    scoring_metrics = ['f1_micro', 'f1_macro', 'roc_auc']

    # Raise an exception if the refit_score is not one of the scoring_metrics
    if refit_score not in scoring_metrics:
        raise ValueError(f"Invalid score for finding optimal hyperparameters. Allowed scores are: {scoring_metrics}")

    clf = RandomizedSearchCV(
        estimator = pipeline, 
        param_distributions = distributions,
        n_iter = n_hyper_comb, # Default is 10
        scoring = scoring_metrics,
        n_jobs = -1, # We use all available processors
        cv = 5, # Use 5 stratified cross-validation folds
        verbose = 3, # Show computation time and score for each fold and parameter candidate
        refit = refit_score, # Refit an estimator using the best found parameters on the whole dataset with the best score found (by default, roc_auc)
        random_state = 17, 
        error_score = np.nan, 
        return_train_score = False)

    search = clf.fit(X, y)

    ###########################################################################

    # Seventh, we save the main results from the search

    # Best scores obtained with best hyperparameters through stratified cross-val
    cv_best_scores = {
        'f1_micro': search.cv_results_['mean_test_f1_micro'][search.best_index_], # Calculates metrics globally by counting total true positives, false negatives, and false positives
        'f1_macro': search.cv_results_['mean_test_f1_macro'][search.best_index_], # Calculates metrics for each class independently and averages them without considering class imbalance
        'roc_auc': search.cv_results_['mean_test_roc_auc'][search.best_index_]
    }

    # Best hyperparameters from the search
    optimal_hyperparameters = search.best_params_

    ###########################################################################

    # Eigth, we return the results (metrics for classes computed for the training
    # set), including the best hyperparameters found

    return optimal_hyperparameters, cv_best_scores

Useful documentation:
- Randomized Search: https://scikit-learn.org/1.5/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
- Scoring: https://scikit-learn.org/1.5/modules/model_evaluation.html#scoring-parameter. See also different types of f1 scores in: https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
- Pipeline: 
    - sklearn pipeline: https://scikit-learn.org/1.5/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline
    - imblearn pipeline: https://imbalanced-learn.org/stable/references/generated/imblearn.pipeline.Pipeline.html#imblearn.pipeline.Pipeline. This is the pipeline that has been used to integrate the preprocessing with the imbalancing classes from imblearn and the model fitting of sklearn, to guarantee compatibility and correct application of resampling techniques. In the function above:
        1. Resampling is applied to the training data (before fitting the model).
        2. Preprocessing (imputation, scaling, etc.) comes after resampling, and it should be applied to the training data after resampling and to the test data without resampling.
        3. Model fitting occurs after the data is both resampled and preprocessed.
- Models:
    - Support Vector Classifier: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
    - Decision trees in sklearn: https://scikit-learn.org/1.5/modules/tree.html. Also, see https://scikit-learn.org/1.5/modules/generated/sklearn.tree.DecisionTreeClassifier.html
    - Boosting:
        - AdaBoostClassifier: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
        - GradientBoostingClassifier: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier. No need for standardization as it is based on decision trees.
        - LightGBM: https://lightgbm.readthedocs.io/en/stable/. No need for standardization as it is based on decision trees. Some interesting features:
            - It can work directly with categorical features w/o OHE (higher speed of training).
            - Parameters for LGBMClassifier: https://lightgbm.readthedocs.io/en/stable/pythonapi/lightgbm.LGBMClassifier.html#lightgbm.LGBMClassifier
            - Additional parameters: https://lightgbm.readthedocs.io/en/stable/Parameters.html
            - Tips for parameter-tuning: https://lightgbm.readthedocs.io/en/stable/Parameters-Tuning.html
- Dealing with class imbalancing: https://imbalanced-learn.org/stable/index.html. See [RandomOverSampler](https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.RandomOverSampler.html), [RandomUnderSampler](https://imbalanced-learn.org/stable/references/generated/imblearn.under_sampling.RandomUnderSampler.html#randomundersampler) and [SMOTE](https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html).

## Support Vector Machines

Note that in this case we cannot use linear SVMs, which don't allow for probability prediction. If we want a close-to-linear SVM, we should use the `'linear'` kernel from the `SVC` class. Also, consider that the SVC model scales very  badly as the number of training instances increases, which makes the hyperparameter tuning below significantly slow.

#### A. No resampling (without considering class imbalance)

In [None]:
# best_params_svc, best_scores_svc = baseline_models(
#         df=dfv3, numerical_features=numerical_features, categorical_features=categorical_features,
#         target='HOSPITAL_EXPIRE_FLAG', model=SVC, ada_boost = False, 
#         n_hyper_comb = 50, num_imputer = "median", refit_score = 'roc_auc', resampling_method = 'none'
# )

# # Combine dictionaries into a list
# combined_svc = [best_params_svc, best_scores_svc]

# # Save into a pickle (name of file states model, if resampling technique was
# # applied and the number of hyperparameter combinations tried)
# with open('hyperparameter_tuning/svc/svc_noresampling_50comb.pkl', 'wb') as f:
#     pickle.dump(combined_svc, f)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END svc__C=2.0441285868166177, svc__coef0=0.597513253239051, svc__degree=2, svc__gamma=0.007749439531247432, svc__kernel=poly, svc__tol=0.00015520398687464502; f1_macro: (test=0.576) f1_micro: (test=0.896) roc_auc: (test=0.813) total time=   2.3s
[CV 2/5] END svc__C=8.669641328354105, svc__coef0=0.6375208960436358, svc__degree=3, svc__gamma=0.00015678863564170365, svc__kernel=rbf, svc__tol=1.4624810965454257e-05; f1_macro: (test=0.481) f1_micro: (test=0.887) roc_auc: (test=0.826) total time=   2.4s
[CV 2/5] END svc__C=28.536277882830053, svc__coef0=0.5704173584583558, svc__degree=4, svc__gamma=0.001195545404013359, svc__kernel=rbf, svc__tol=0.0040707502841458585; f1_macro: (test=0.577) f1_micro: (test=0.899) roc_auc: (test=0.831) total time=   2.4s
[CV 3/5] END svc__C=8.669641328354105, svc__coef0=0.6375208960436358, svc__degree=3, svc__gamma=0.00015678863564170365, svc__kernel=rbf, svc__tol=1.4624810965454257e-05; 

With 10 hyperparameter combinations, it takes more than 30 minutes. With 50, it takes approximately 1h and 30 mins.

In [31]:
# Load saved pickle
with open('hyperparameter_tuning/svc/svc_noresampling_10comb.pkl', 'rb') as f:
    loaded = pickle.load(f)
    best_params_svc, best_scores_svc = loaded

print('Best parameters of SVC:', best_params_svc, '\n')

print('Best scores of SVC:', best_scores_svc, '\n')

Best parameters of SVC: {'svc__C': 28.536277882830053, 'svc__coef0': 0.5704173584583558, 'svc__degree': 4, 'svc__gamma': 0.001195545404013359, 'svc__kernel': 'rbf', 'svc__tol': 0.0040707502841458585} 

Best scores of SVC: {'f1_micro': 0.896875, 'f1_macro': 0.573116856686093, 'roc_auc': 0.8209834794050668} 



In [56]:
# Load saved pickle
with open('hyperparameter_tuning/svc/svc_noresampling_50comb.pkl', 'rb') as f:
    loaded = pickle.load(f)
    best_params_svc, best_scores_svc = loaded

print('Best parameters of SVC:', best_params_svc, '\n')

print('Best scores of SVC:', best_scores_svc, '\n')

Best parameters of SVC: {'svc__C': 28.536277882830053, 'svc__coef0': 0.5704173584583558, 'svc__degree': 4, 'svc__gamma': 0.001195545404013359, 'svc__kernel': 'rbf', 'svc__tol': 0.0040707502841458585} 

Best scores of SVC: {'f1_micro': 0.896875, 'f1_macro': 0.573116856686093, 'roc_auc': 0.8209834794050668} 



## Random Forest

#### A. No resampling (without considering class imbalance)

In [32]:
best_params_rf, best_scores_rf = baseline_models(
        df=dfv3, numerical_features=numerical_features, categorical_features=categorical_features,
        target='HOSPITAL_EXPIRE_FLAG', model=RandomForestClassifier, ada_boost = False, 
        n_hyper_comb = 60, num_imputer = "median", refit_score = 'roc_auc', resampling_method = 'none'
)

# Combine dictionaries into a list
combined_rf = [best_params_rf, best_scores_rf]

# Save into a pickle (name of file states model, if resampling technique was
# applied and the number of hyperparameter combinations tried)
with open('hyperparameter_tuning/random_forest/rf_noresampling_60comb.pkl', 'wb') as f:
    pickle.dump(combined_rf, f)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[CV 4/5] END rfc__max_depth=279, rfc__max_features=32, rfc__max_leaf_nodes=3624, rfc__min_samples_leaf=0.12786665831268354, rfc__min_samples_split=0.11554497585685376; f1_macro: (test=0.470) f1_micro: (test=0.886) roc_auc: (test=0.750) total time=  10.8s
[CV 4/5] END rfc__max_depth=112, rfc__max_features=50, rfc__max_leaf_nodes=2192, rfc__min_samples_leaf=0.16900253458405015, rfc__min_samples_split=0.1778840998151315; f1_macro: (test=0.470) f1_micro: (test=0.886) roc_auc: (test=0.727) total time=  12.1s
[CV 5/5] END rfc__max_depth=112, rfc__max_features=50, rfc__max_leaf_nodes=2192, rfc__min_samples_leaf=0.16900253458405015, rfc__min_samples_split=0.1778840998151315; f1_macro: (test=0.470) f1_micro: (test=0.886) roc_auc: (test=0.705) total time=  12.2s
[CV 5/5] END rfc__max_depth=279, rfc__max_features=32, rfc__max_leaf_nodes=3624, rfc__min_samples_leaf=0.12786665831268354, rfc__min_samples_split=0.11554497585685376; f1_macro: (test=0.470) f1_micro: (test=0.886) roc_auc: (test=0.717) t

With 60 hyperparameter combinations, it takes approximately 4 minutes.

In [33]:
# Load saved pickle
with open('hyperparameter_tuning/random_forest/rf_noresampling_60comb.pkl', 'rb') as f:
    loaded = pickle.load(f)
    best_params_rf, best_scores_rf = loaded

print('Best parameters of RF:', best_params_rf, '\n')

print('Best scores of RF:', best_scores_rf, '\n')

Best parameters of RF: {'rfc__max_depth': 48, 'rfc__max_features': 7, 'rfc__max_leaf_nodes': 3096, 'rfc__min_samples_leaf': 0.0022275055277347616, 'rfc__min_samples_split': 0.039801263866007214} 

Best scores of RF: {'f1_micro': 0.890375, 'f1_macro': 0.5097113890181431, 'roc_auc': 0.8229868151826787} 



#### B. Oversampling

In [40]:
best_params_rf_over, best_scores_rf_over = baseline_models(
        df=dfv3, numerical_features=numerical_features, categorical_features=categorical_features,
        target='HOSPITAL_EXPIRE_FLAG', model=RandomForestClassifier, ada_boost = False, 
        n_hyper_comb = 50, num_imputer = "median", refit_score = 'roc_auc', resampling_method = 'oversample'
)

# Combine dictionaries into a list
combined_rf_over = [best_params_rf_over, best_scores_rf_over]

# Save into a pickle (name of file states model, if resampling technique was
# applied and the number of hyperparameter combinations tried)
with open('hyperparameter_tuning/random_forest/rf_over_50comb.pkl', 'wb') as f:
    pickle.dump(combined_rf_over, f)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 3/5] END rfc__max_depth=239, rfc__max_features=18, rfc__max_leaf_nodes=3114, rfc__min_samples_leaf=0.009213240498236963, rfc__min_samples_split=0.1487706779610283; f1_macro: (test=0.575) f1_micro: (test=0.691) roc_auc: (test=0.802) total time=  16.4s
[CV 1/5] END rfc__max_depth=279, rfc__max_features=32, rfc__max_leaf_nodes=3624, rfc__min_samples_leaf=0.12786665831268354, rfc__min_samples_split=0.11554497585685376; f1_macro: (test=0.556) f1_micro: (test=0.677) roc_auc: (test=0.752) total time=  16.7s
[CV 3/5] END rfc__max_depth=279, rfc__max_features=32, rfc__max_leaf_nodes=3624, rfc__min_samples_leaf=0.12786665831268354, rfc__min_samples_split=0.11554497585685376; f1_macro: (test=0.522) f1_micro: (test=0.614) roc_auc: (test=0.751) total time=  16.9s
[CV 1/5] END rfc__max_depth=112, rfc__max_features=50, rfc__max_leaf_nodes=2192, rfc__min_samples_leaf=0.16900253458405015, rfc__min_samples_split=0.1778840998151315; f1_mac

With 50 hyperparameter combinations, it takes approximately 4 minutes.

In [43]:
# Load saved pickle
with open('hyperparameter_tuning/random_forest/rf_over_50comb.pkl', 'rb') as f:
    loaded = pickle.load(f)
    best_params_rf_over, best_scores_rf_over = loaded

print('Best parameters of RF:', best_params_rf_over, '\n')

print('Best scores of RF:', best_scores_rf_over, '\n')

Best parameters of RF: {'rfc__max_depth': 48, 'rfc__max_features': 7, 'rfc__max_leaf_nodes': 3096, 'rfc__min_samples_leaf': 0.0022275055277347616, 'rfc__min_samples_split': 0.039801263866007214} 

Best scores of RF: {'f1_micro': 0.7652499999999999, 'f1_macro': 0.6324690971563267, 'roc_auc': 0.8287546650043126} 



Note how the F1-score for the minority class has increased significantly! (see F1-macro).

#### C. SMOTE

In [None]:
# best_params_rf_smote, best_scores_rf_smote = baseline_models(
#         df=dfv3, numerical_features=numerical_features, categorical_features=categorical_features,
#         target='HOSPITAL_EXPIRE_FLAG', model=RandomForestClassifier, ada_boost = False, 
#         n_hyper_comb = 50, num_imputer = "median", refit_score = 'roc_auc', resampling_method = 'smote'
# )

# # Combine dictionaries into a list
# combined_rf_smote = [best_params_rf_smote, best_scores_rf_smote]

# # Save into a pickle (name of file states model, if resampling technique was
# # applied and the number of hyperparameter combinations tried)
# with open('hyperparameter_tuning/random_forest/rf_smote_50comb.pkl', 'wb') as f:
#     pickle.dump(combined_rf_smote, f)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 2/5] END rfc__max_depth=112, rfc__max_features=50, rfc__max_leaf_nodes=2192, rfc__min_samples_leaf=0.16900253458405015, rfc__min_samples_split=0.1778840998151315; f1_macro: (test=0.494) f1_micro: (test=0.631) roc_auc: (test=0.625) total time=  23.9s
[CV 1/5] END rfc__max_depth=279, rfc__max_features=32, rfc__max_leaf_nodes=3624, rfc__min_samples_leaf=0.12786665831268354, rfc__min_samples_split=0.11554497585685376; f1_macro: (test=0.515) f1_micro: (test=0.669) roc_auc: (test=0.635) total time=  26.1s
[CV 4/5] END rfc__max_depth=279, rfc__max_features=32, rfc__max_leaf_nodes=3624, rfc__min_samples_leaf=0.12786665831268354, rfc__min_samples_split=0.11554497585685376; f1_macro: (test=0.505) f1_micro: (test=0.641) roc_auc: (test=0.642) total time=  26.9s
[CV 1/5] END rfc__max_depth=112, rfc__max_features=50, rfc__max_leaf_nodes=2192, rfc__min_samples_leaf=0.16900253458405015, rfc__min_samples_split=0.1778840998151315; f1_macr

With 50 hyperparameter combinations, it takes approximately 6 minutes.

In [None]:
# Load saved pickle
with open('hyperparameter_tuning/random_forest/rf_smote_50comb.pkl', 'rb') as f:
    loaded = pickle.load(f)
    best_params_rf_smote, best_scores_rf_smote = loaded

print('Best parameters of RF:', best_params_rf_smote, '\n')

print('Best scores of RF:', best_scores_rf_smote, '\n')

Best parameters of RF: {'rfc__max_depth': 48, 'rfc__max_features': 7, 'rfc__max_leaf_nodes': 3096, 'rfc__min_samples_leaf': 0.0022275055277347616, 'rfc__min_samples_split': 0.039801263866007214} 

Best scores of RF: {'f1_micro': 0.8383749999999999, 'f1_macro': 0.6304874715421158, 'roc_auc': 0.7898840135568947} 



#### D. Undersampling

In [None]:
# best_params_rf_under, best_scores_rf_under = baseline_models(
#         df=dfv3, numerical_features=numerical_features, categorical_features=categorical_features,
#         target='HOSPITAL_EXPIRE_FLAG', model=RandomForestClassifier, ada_boost = False, 
#         n_hyper_comb = 50, num_imputer = "median", refit_score = 'roc_auc', resampling_method = 'undersample'
# )

# # Combine dictionaries into a list
# combined_rf_under = [best_params_rf_under, best_scores_rf_under]

# # Save into a pickle (name of file states model, if resampling technique was
# # applied and the number of hyperparameter combinations tried)
# with open('hyperparameter_tuning/random_forest/rf_under_50comb.pkl', 'wb') as f:
#     pickle.dump(combined_rf_under, f)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[CV 4/5] END rfc__max_depth=279, rfc__max_features=32, rfc__max_leaf_nodes=3624, rfc__min_samples_leaf=0.12786665831268354, rfc__min_samples_split=0.11554497585685376; f1_macro: (test=0.548) f1_micro: (test=0.664) roc_auc: (test=0.760) total time=   3.3s
[CV 5/5] END rfc__max_depth=279, rfc__max_features=32, rfc__max_leaf_nodes=3624, rfc__min_samples_leaf=0.12786665831268354, rfc__min_samples_split=0.11554497585685376; f1_macro: (test=0.524) f1_micro: (test=0.631) roc_auc: (test=0.736) total time=   3.5s
[CV 2/5] END rfc__max_depth=279, rfc__max_features=32, rfc__max_leaf_nodes=3624, rfc__min_samples_leaf=0.12786665831268354, rfc__min_samples_split=0.11554497585685376; f1_macro: (test=0.516) f1_micro: (test=0.625) roc_auc: (test=0.728) total time=   3.6s
[CV 3/5] END rfc__max_depth=279, rfc__max_features=32, rfc__max_leaf_nodes=3624, rfc__min_samples_leaf=0.12786665831268354, rfc__min_samples_split=0.11554497585685376; f1_macro: (test=0.560) f1_micro: (test=0.678) roc_auc: (test=0.750)

With 50 hyperparameter combinations, it takes approximately 1 minute.

In [45]:
# Load saved pickle
with open('hyperparameter_tuning/random_forest/rf_under_50comb.pkl', 'rb') as f:
    loaded = pickle.load(f)
    best_params_rf_under, best_scores_rf_under = loaded

print('Best parameters of RF:', best_params_rf_under, '\n')

print('Best scores of RF:', best_scores_rf_under, '\n')

Best parameters of RF: {'rfc__max_depth': 48, 'rfc__max_features': 7, 'rfc__max_leaf_nodes': 3096, 'rfc__min_samples_leaf': 0.0022275055277347616, 'rfc__min_samples_split': 0.039801263866007214} 

Best scores of RF: {'f1_micro': 0.72525, 'f1_macro': 0.6051164956848396, 'roc_auc': 0.8231712548770524} 



## GradientBoostingClassifier

#### A. No resampling (without considering class imbalance)

In [None]:
best_params_gboost, best_scores_gboost = baseline_models(
        df=dfv3, numerical_features=numerical_features, categorical_features=categorical_features,
        target='HOSPITAL_EXPIRE_FLAG', model=GradientBoostingClassifier, ada_boost = False, 
        n_hyper_comb = 100, num_imputer = "median", refit_score = 'roc_auc', resampling_method = 'none'
)

# Combine dictionaries into a list
combined_gboost = [best_params_gboost, best_scores_gboost]

# Save into a pickle (name of file states model, if resampling technique was
# applied and the number of hyperparameter combinations tried)
with open('hyperparameter_tuning/gboost/gboost_noresampling_100comb.pkl', 'wb') as f:
    pickle.dump(combined_gboost, f)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[CV 5/5] END gbc__learning_rate=0.5, gbc__max_depth=144, gbc__max_features=7, gbc__max_leaf_nodes=407, gbc__min_samples_leaf=0.1778840998151315, gbc__min_samples_split=0.16155744462914845, gbc__n_estimators=50, gbc__subsample=0.7308981574420352; f1_macro: (test=0.537) f1_micro: (test=0.887) roc_auc: (test=0.764) total time=   0.2s
[CV 1/5] END gbc__learning_rate=0.5, gbc__max_depth=144, gbc__max_features=7, gbc__max_leaf_nodes=407, gbc__min_samples_leaf=0.1778840998151315, gbc__min_samples_split=0.16155744462914845, gbc__n_estimators=50, gbc__subsample=0.7308981574420352; f1_macro: (test=0.544) f1_micro: (test=0.891) roc_auc: (test=0.773) total time=   0.3s
[CV 3/5] END gbc__learning_rate=0.5, gbc__max_depth=144, gbc__max_features=7, gbc__max_leaf_nodes=407, gbc__min_samples_leaf=0.1778840998151315, gbc__min_samples_split=0.16155744462914845, gbc__n_estimators=50, gbc__subsample=0.7308981574420352; f1_macro: (test=0.526) f1_micro: (test=0.889) roc_auc: (test=0.799) total time=   0.2s
[

With 40 hyperparameter combinations, it takes approximately 2 minutes.

In [51]:
# Load saved pickle
with open('hyperparameter_tuning/gboost/gboost_noresampling_50comb.pkl', 'rb') as f:
    loaded = pickle.load(f)
    best_params_gboost, best_scores_gboost = loaded

print('Best parameters of Gradient Boosting:', best_params_gboost, '\n')

print('Best scores of Gradient Boosting:', best_scores_gboost, '\n')

Best parameters of Gradient Boosting: {'gbc__learning_rate': 0.1, 'gbc__max_depth': 290, 'gbc__max_features': 10, 'gbc__max_leaf_nodes': 1552, 'gbc__min_samples_leaf': 0.0498967289710591, 'gbc__min_samples_split': 0.1494714531944152, 'gbc__n_estimators': 100, 'gbc__subsample': 0.8225988703947174} 

Best scores of Gradient Boosting: {'f1_micro': 0.89925, 'f1_macro': 0.6308805699227621, 'roc_auc': 0.8512078940716947} 



## LGBMClassifier

#### A. No resampling (without considering class imbalance)

In [52]:
best_params_lgbm, best_scores_lgbm = baseline_models(
        df=dfv3, numerical_features=numerical_features, categorical_features=categorical_features,
        target='HOSPITAL_EXPIRE_FLAG', model=LGBMClassifier, ada_boost = False, 
        n_hyper_comb = 100, num_imputer = "median", refit_score = 'roc_auc', resampling_method = 'none'
)

# Combine dictionaries into a list
combined_lgbm = [best_params_lgbm, best_scores_lgbm]

# Save into a pickle (name of file states model, if resampling technique was
# applied and the number of hyperparameter combinations tried)
with open('hyperparameter_tuning/lgbm/lgbm_noresampling_100comb.pkl', 'wb') as f:
    pickle.dump(combined_lgbm, f)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 5/5] END lgb__boosting_type=gbdt, lgb__feature_fraction=0.5514969290700589, lgb__learning_rate=0.01, lgb__min_child_samples=12, lgb__min_split_gain=0.06297576132346906, lgb__num_leaves=31, lgb__reg_lambda=0.35781360448354893; f1_macro: (test=0.497) f1_micro: (test=0.889) roc_auc: (test=0.817) total time=   5.8s
[CV 4/5] END lgb__boosting_type=rf, lgb__feature_fraction=0.5908888858951135, lgb__learning_rate=0.3, lgb__min_child_samples=27, lgb__min_split_gain=0.08888648231916156, lgb__num_leaves=31, lgb__reg_lambda=0.7869854599999133; f1_macro: (test=0.642) f1_micro: (test=0.897) roc_auc: (test=0.822) total time=   5.9s
[CV 5/5] END lgb__boosting_type=gbdt, lgb__feature_fraction=0.871283110454845, lgb__learning_rate=0.3, lgb__min_child_samples=11, lgb__min_split_gain=0.021551321485037845, lgb__num_leaves=31, lgb__reg_lambda=0.6524186154656548; f1_macro: (test=0.640) f1_micro: (test=0.894) roc_auc: (test=0.820) total time=

With 100 hyperparameter combinations, it takes approximately 6.5 minutes.

In [54]:
# Load saved pickle
with open('hyperparameter_tuning/lgbm/lgbm_noresampling_100comb.pkl', 'rb') as f:
    loaded = pickle.load(f)
    best_params_lgbm, best_scores_lgbm = loaded

print('Best parameters of LightGBM:', best_params_lgbm, '\n')

print('Best scores of LightGBM:', best_scores_lgbm, '\n')

Best parameters of LightGBM: {'lgb__boosting_type': 'gbdt', 'lgb__feature_fraction': 0.5786047712169723, 'lgb__learning_rate': 0.15, 'lgb__min_child_samples': 6, 'lgb__min_split_gain': 0.03704075485505709, 'lgb__num_leaves': 15, 'lgb__reg_lambda': 0.3606084225042038} 

Best scores of LightGBM: {'f1_micro': 0.9019999999999999, 'f1_macro': 0.659453339505749, 'roc_auc': 0.8512450135095173} 



#### B. Oversampling

In [None]:
# best_params_lgbm_over, best_scores_lgbm_over = baseline_models(
#         df=dfv3, numerical_features=numerical_features, categorical_features=categorical_features,
#         target='HOSPITAL_EXPIRE_FLAG', model=LGBMClassifier, ada_boost = False, 
#         n_hyper_comb = 100, num_imputer = "median", refit_score = 'roc_auc', resampling_method = 'oversample'
# )

# # Combine dictionaries into a list
# combined_lgbm_over = [best_params_lgbm, best_scores_lgbm]

# # Save into a pickle (name of file states model, if resampling technique was
# # applied and the number of hyperparameter combinations tried)
# with open('hyperparameter_tuning/lgbm/lgbm_over_100comb.pkl', 'wb') as f:
#     pickle.dump(combined_lgbm_over, f)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[CV 1/5] END lgb__boosting_type=rf, lgb__feature_fraction=0.5908888858951135, lgb__learning_rate=0.3, lgb__min_child_samples=27, lgb__min_split_gain=0.08888648231916156, lgb__num_leaves=31, lgb__reg_lambda=0.7869854599999133; f1_macro: (test=0.623) f1_micro: (test=0.772) roc_auc: (test=0.804) total time=   0.7s
[CV 4/5] END lgb__boosting_type=rf, lgb__feature_fraction=0.5908888858951135, lgb__learning_rate=0.3, lgb__min_child_samples=27, lgb__min_split_gain=0.08888648231916156, lgb__num_leaves=31, lgb__reg_lambda=0.7869854599999133; f1_macro: (test=0.648) f1_micro: (test=0.782) roc_auc: (test=0.838) total time=   6.1s
[CV 5/5] END lgb__boosting_type=rf, lgb__feature_fraction=0.5908888858951135, lgb__learning_rate=0.3, lgb__min_child_samples=27, lgb__min_split_gain=0.08888648231916156, lgb__num_leaves=31, lgb__reg_lambda=0.7869854599999133; f1_macro: (test=0.625) f1_micro: (test=0.764) roc_auc: (test=0.806) total time=   6.9s
[CV 2/5] END lgb__boosting_type=gbdt, lgb__feature_fraction=0

With 100 hyperparameter combinations, it takes approximately 6.5 minutes.

In [58]:
# Load saved pickle
with open('hyperparameter_tuning/lgbm/lgbm_over_100comb.pkl', 'rb') as f:
    loaded = pickle.load(f)
    best_params_lgbm_over, best_scores_lgbm_over = loaded

print('Best parameters of LightGBM:', best_params_lgbm_over, '\n')

print('Best scores of LightGBM:', best_scores_lgbm_over, '\n')

Best parameters of LightGBM: {'lgb__boosting_type': 'gbdt', 'lgb__feature_fraction': 0.5786047712169723, 'lgb__learning_rate': 0.15, 'lgb__min_child_samples': 6, 'lgb__min_split_gain': 0.03704075485505709, 'lgb__num_leaves': 15, 'lgb__reg_lambda': 0.3606084225042038} 

Best scores of LightGBM: {'f1_micro': 0.9019999999999999, 'f1_macro': 0.659453339505749, 'roc_auc': 0.8512450135095173} 



# Conclusions
- Overall, *LightGBM* is the best performer in this case, followed closely by *Gradient Boosting*.
- In terms of resampling, either no resampling or oversampling seem to be giving the best results (in terms of the highest ROC-AUC score). For the random forest model, it can be clearly observed that oversampling increases the F1-macro (which is an F1 measure which computes the average, unweighted F1-score for all of the classes in the outcome), but reduces the overall performance (see F1-micro and ROC-AUC).