<a href="https://colab.research.google.com/github/njgeorge000158/Credit-Risk-Classification-Using-Scikit-Learn/blob/main/credit_risk_hyperparameters_optimization_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#*******************************************************************************************
 #
 #  File Name:  credit_risk_hyperparameters_optimization_colab.ipynb
 #
 #  File Description:
 #      This interactive Python notebook, credit_risk_hyperparameters_optimization_colab.ipynb,
 #      readsa csv file, spam-data.csv, and uses Python and scikit-learn module to find
 #      the best hyperparameters for supervised learning models (binary classification)
 #      that detects credit defaults. Here is a list of the models:
 #
 #      logistic regression
 #      decision tree
 #      random forest
 #      k-nearest neighbor
 #
 #
 #  Date            Description                             Programmer
 #  ----------      ------------------------------------    ------------------
 #  11/27/2023      Initial Development                     Nicholas J. George
 #
 #******************************************************************************************/

from google.colab import drive
drive.mount('/content/gdrive/')

import sys
sys.path.insert(0,'./gdrive/MyDrive/credit_risk_classification')

import os
os.environ['HV_DOC_HTML'] = 'true'

!apt-get update
!apt install firefox
!pip install -U geckodriver
!pip install -U dataframe_image
!pip install -U selenium
!pip install -U kaleido
!pip install -U hvplot
!pip install -U plotly
!pip install -U panel
!pip install -U bokeh
!pip install -U imblearn

import hvplot
import hvplot.pandas

import pandas as pd

import holoviews as hv
hv.extension('bokeh')

import logx
logx.set_logs_directory_path('./gdrive/MyDrive/credit_risk_classification/logs')
logx.set_images_directory_path('./gdrive/MyDrive/credit_risk_classification/images')
logx.set_resources_directory_path('./gdrive/MyDrive/credit_risk_classification/resources')
logx.set_models_directory_path('./gdrive/MyDrive/credit_risk_classification/models')
logx.create_directory(logx.MODELS_DIRECTORY_PATH)


import pandasx
pandasx.set_google_colab(True)

import classificationsx
import credit_risk_constants

import copy
import pickle

import numpy as np
import pandas as pd

from IPython.display import clear_output

from imblearn.combine import SMOTEENN
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import RandomUnderSampler

from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

pd.options.mode.chained_assignment = None

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).
Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
firef

The script created directory, ./gdrive/MyDrive/credit_risk_classification/models.



In [2]:
CONSTANT_LOCAL_FILE_NAME = 'credit_risk_hyperparameters_optimization_colab.ipynb'


logx.set_log_mode(False)

logx.set_image_mode(False)


logx.begin_program('credit_risk_hyperparameters_optimization_colab')

Program execution begins...



# <br> **Section 1: Extraction and Transformation**

## **1.1: Read the CSV data into a Pandas DataFrame**

In [3]:
data_type_dictionary \
    = {'loan_size': float,
       'interest_rate': float,
       'borrower_income': int,
       'debt_to_income': float,
       'num_of_accounts': int,
       'derogatory_marks': int,
       'total_debt': int,
       'loan_status': int}

lending_dataframe \
    = pd.read_csv(credit_risk_constants.CONSTANT_INPUT_FILE_PATH, dtype = data_type_dictionary)

logx.log_write_object(lending_dataframe)

## **1.2: Display Spam DataFrame**

In [4]:
pandasx.return_formatted_table(lending_dataframe, 'Table 1.2: Lending Data Table')

loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
10700.0,7.67,52800,0.43,5,1,22800,0
8400.0,6.69,43600,0.31,3,0,13600,0
9000.0,6.96,46100,0.35,3,0,16100,0
10700.0,7.66,52700,0.43,5,1,22700,0
10800.0,7.7,53000,0.43,5,1,23000,0
10100.0,7.44,50600,0.41,4,1,20600,0
10300.0,7.49,51100,0.41,4,1,21100,0
8800.0,6.86,45100,0.33,3,0,15100,0
9300.0,7.1,47400,0.37,3,0,17400,0
9700.0,7.25,48800,0.39,4,0,18800,0


## **1.3: Create the labels series (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.**

### **Separate the Y Variable, The Labels**

In [5]:
y_series = lending_dataframe['loan_status']

logx.log_write_object(y_series)

### **Review the Y Series**

In [6]:
pandasx.return_formatted_table(y_series.to_frame(), 'Table 1.3.1: Lending Target Series')

loan_status
0
0
0
0
0
0
0
0
0
0


### **Check the Balance of the Labels Variable (`y`) by Using the `value_counts` Function.**

In [7]:
y_series.value_counts()

loan_status
0    75036
1     2500
Name: count, dtype: int64

### **Separate the X Variable, the Features**

In [8]:
x_dataframe = lending_dataframe.drop(columns = 'loan_status', axis = 1)

logx.log_write_object(x_dataframe)

### **Review the X DataFrame**

In [9]:
pandasx.return_formatted_table(x_dataframe, 'Table 1.3.2: Lending Features DataFrame')

loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
10700.0,7.67,52800,0.43,5,1,22800
8400.0,6.69,43600,0.31,3,0,13600
9000.0,6.96,46100,0.35,3,0,16100
10700.0,7.66,52700,0.43,5,1,22700
10800.0,7.7,53000,0.43,5,1,23000
10100.0,7.44,50600,0.41,4,1,20600
10300.0,7.49,51100,0.41,4,1,21100
8800.0,6.86,45100,0.33,3,0,15100
9300.0,7.1,47400,0.37,3,0,17400
9700.0,7.25,48800,0.39,4,0,18800


## **1.4: Split the Data into Training and Testing Datasets by Using `train_test_split`.**

In [10]:
x_train_dataframe, x_test_dataframe, \
y_train_series, y_test_series \
    = train_test_split \
        (x_dataframe, y_series,
         random_state = credit_risk_constants.CONSTANT_ML_RANDOM_STATE_1)

In [11]:
logx.log_write_object(x_train_dataframe)

logx.log_write_object(x_test_dataframe)

logx.log_write_object(y_train_series)

logx.log_write_object(y_test_series)

## **1.5: Use the StandardScaler to Scale the X Variables**

### **Scale Training and Test Data as Numpy Arrays**

In [12]:
x_train_scaled_nparray = StandardScaler().fit_transform(x_train_dataframe)

logx.log_write_object(x_train_scaled_nparray)

In [13]:
x_test_scaled_nparray = StandardScaler().fit_transform(x_test_dataframe)

logx.log_write_object(x_test_scaled_nparray)

### **Create Scaled X Variable DataFrames**

In [14]:
x_train_scaled_dataframe \
    = pd.DataFrame \
        (x_train_scaled_nparray,
         columns = x_train_dataframe.columns,
         index = x_train_dataframe.index)

logx.log_write_object(x_train_scaled_dataframe)

In [15]:
x_test_scaled_dataframe \
    = pd.DataFrame \
        (x_test_scaled_nparray,
         columns = x_test_dataframe.columns,
         index = x_test_dataframe.index)

logx.log_write_object(x_test_scaled_dataframe)

### **Display Scaled Training and Testing Data**

In [16]:
pandasx.return_formatted_table \
    (x_train_scaled_dataframe,
     'Table 1.5.1: Lending Scaled Features Training Data')

loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
-0.34,-0.33,-0.34,-0.29,-0.43,-0.67,-0.34
-0.57,-0.6,-0.6,-0.68,-0.43,-0.67,-0.6
0.38,0.39,0.39,0.63,0.61,1.04,0.39
-0.57,-0.57,-0.56,-0.63,-0.43,-0.67,-0.56
-1.0,-0.98,-0.98,-1.33,-0.96,-0.67,-0.98
-0.1,-0.11,-0.11,0.02,0.09,-0.67,-0.11
-0.77,-0.78,-0.78,-0.98,-0.96,-0.67,-0.78
-1.05,-1.06,-1.06,-1.49,-0.96,-0.67,-1.06
-0.96,-0.97,-0.97,-1.31,-0.96,-0.67,-0.97
0.19,0.17,0.18,0.38,0.09,1.04,0.18


In [17]:
pandasx.return_formatted_table \
    (x_test_scaled_dataframe,
     'Table 1.5.2: Lending Scaled Features Test Data')

loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
-0.15,-0.13,-0.14,-0.02,0.09,-0.68,-0.14
0.04,0.05,0.04,0.22,0.09,-0.68,0.04
-0.68,-0.66,-0.65,-0.77,-0.44,-0.68,-0.65
-0.97,-0.96,-0.95,-1.29,-0.97,-0.68,-0.95
0.33,0.35,0.36,0.59,0.09,1.04,0.36
0.24,0.25,0.25,0.47,0.09,1.04,0.25
-0.05,-0.05,-0.04,0.11,0.09,-0.68,-0.04
-0.15,-0.17,-0.17,-0.06,0.09,-0.68,-0.17
0.04,0.03,0.03,0.2,0.09,-0.68,0.03
0.33,0.35,0.35,0.58,0.09,1.04,0.35


# <br> **Section 2: Undersampled and OverSampled Spam Data**

## **2.1: Instantiate the Random Undersampler Instance**

In [18]:
x_train_scaled_undersampled_dataframe, y_train_undersampled_series \
    = RandomUnderSampler(random_state = credit_risk_constants.CONSTANT_ML_RANDOM_STATE_1) \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [19]:
logx.log_write_object(x_train_scaled_undersampled_dataframe)

logx.log_write_object(y_train_undersampled_series)

## **2.2: Instantiate the Random Oversampler Instance**

In [20]:
x_train_scaled_oversampled_dataframe, y_train_oversampled_series \
    = RandomOverSampler(random_state = credit_risk_constants.CONSTANT_ML_RANDOM_STATE_1) \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [21]:
logx.log_write_object(x_train_scaled_undersampled_dataframe)

logx.log_write_object(y_train_undersampled_series)

## **2.3: Instantiate the Cluster Centroids Instance**

In [22]:
x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series \
    = ClusterCentroids \
        (estimator = KMeans(n_init = 'auto', random_state = credit_risk_constants.CONSTANT_ML_RANDOM_STATE_2),
         random_state = credit_risk_constants.CONSTANT_ML_RANDOM_STATE_1) \
            .fit_resample(x_train_scaled_dataframe, y_train_series)

In [23]:
logx.log_write_object(x_train_scaled_cluster_centroids_dataframe)

logx.log_write_object(y_train_cluster_centroids_series)

## **2.4: Instantiate the SMOTE Instance**

In [24]:
x_train_scaled_SMOTE_dataframe, y_train_SMOTE_series \
    = SMOTE(random_state = credit_risk_constants.CONSTANT_ML_RANDOM_STATE_1, sampling_strategy = 'auto') \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [25]:
logx.log_write_object(x_train_scaled_SMOTE_dataframe)

logx.log_write_object(y_train_SMOTE_series)

## **2.5: Instantiate the SMOTEENN Instance**

In [26]:
x_train_scaled_SMOTEENN_dataframe, y_train_SMOTEENN_series \
    = SMOTEENN(random_state = credit_risk_constants.CONSTANT_ML_RANDOM_STATE_1) \
        .fit_resample(x_train_scaled_dataframe, y_train_series)

In [27]:
logx.log_write_object(x_train_scaled_SMOTEENN_dataframe)

logx.log_write_object(y_train_SMOTEENN_series)

## **2.6: Check the Balance of the Labels Variable (`y`) by Using the `value_counts` Function.**

In [28]:
y_train_undersampled_series.value_counts()

loan_status
0    1890
1    1890
Name: count, dtype: int64

In [29]:
y_train_oversampled_series.value_counts()

loan_status
0    56262
1    56262
Name: count, dtype: int64

In [30]:
y_train_cluster_centroids_series.value_counts()

loan_status
0    1890
1    1890
Name: count, dtype: int64

In [31]:
y_train_SMOTE_series.value_counts()

loan_status
0    56262
1    56262
Name: count, dtype: int64

In [32]:
y_train_SMOTEENN_series.value_counts()

loan_status
0    55873
1    54849
Name: count, dtype: int64

## **2.7: Display Normalized Resampled Training and Testing Data**

In [33]:
pandasx.return_formatted_table \
    (x_train_scaled_undersampled_dataframe,
     'Table 2.7.1: Scaled Features Training Undersampled Data')

loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
-0.62,-0.63,-0.63,-0.74,-0.43,-0.67,-0.63
1.05,1.05,1.06,1.3,1.14,1.04,1.06
0.14,0.12,0.12,0.31,0.09,1.04,0.12
0.19,0.21,0.2,0.41,0.09,1.04,0.2
-0.1,-0.1,-0.1,0.04,0.09,-0.67,-0.1
0.71,0.71,0.71,0.97,0.61,1.04,0.71
-0.81,-0.81,-0.81,-1.04,-0.96,-0.67,-0.81
-0.48,-0.49,-0.49,-0.52,-0.43,-0.67,-0.49
-0.62,-0.64,-0.63,-0.74,-0.43,-0.67,-0.63
0.62,0.6,0.61,0.86,0.61,1.04,0.61


In [34]:
pandasx.return_formatted_table \
    (x_train_scaled_oversampled_dataframe,
     'Table 2.7.2: Scaled Features Training Oversampled Data')

loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
-0.34,-0.33,-0.34,-0.29,-0.43,-0.67,-0.34
-0.57,-0.6,-0.6,-0.68,-0.43,-0.67,-0.6
0.38,0.39,0.39,0.63,0.61,1.04,0.39
-0.57,-0.57,-0.56,-0.63,-0.43,-0.67,-0.56
-1.0,-0.98,-0.98,-1.33,-0.96,-0.67,-0.98
-0.1,-0.11,-0.11,0.02,0.09,-0.67,-0.11
-0.77,-0.78,-0.78,-0.98,-0.96,-0.67,-0.78
-1.05,-1.06,-1.06,-1.49,-0.96,-0.67,-1.06
-0.96,-0.97,-0.97,-1.31,-0.96,-0.67,-0.97
0.19,0.17,0.18,0.38,0.09,1.04,0.18


In [35]:
pandasx.return_formatted_table \
    (x_train_scaled_cluster_centroids_dataframe,
     'Table 2.7.3: Scaled Features Training Cluster Centroids Data')

loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0.81,0.8,0.8,1.05,0.61,1.04,0.8
-0.48,-0.48,-0.48,-0.5,-0.43,-0.67,-0.48
-1.1,-1.08,-1.09,-1.53,-0.96,-0.67,-1.09
4.81,4.8,4.8,3.52,4.81,2.76,4.8
0.28,0.3,0.3,0.52,0.09,1.04,0.3
-0.05,-0.05,-0.05,0.1,0.09,-0.67,-0.05
-0.77,-0.75,-0.74,-0.92,-0.96,-0.67,-0.74
-0.29,-0.27,-0.28,-0.21,-0.43,-0.67,-0.28
-1.62,-1.63,-1.62,-2.69,-1.48,-0.67,-1.62
0.38,0.37,0.37,0.6,0.61,1.04,0.37


In [36]:
pandasx.return_formatted_table \
    (x_train_scaled_SMOTE_dataframe,
     'Table 2.7.4: Scaled Features Training SMOTE Data')

loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
-0.34,-0.33,-0.34,-0.29,-0.43,-0.67,-0.34
-0.57,-0.6,-0.6,-0.68,-0.43,-0.67,-0.6
0.38,0.39,0.39,0.63,0.61,1.04,0.39
-0.57,-0.57,-0.56,-0.63,-0.43,-0.67,-0.56
-1.0,-0.98,-0.98,-1.33,-0.96,-0.67,-0.98
-0.1,-0.11,-0.11,0.02,0.09,-0.67,-0.11
-0.77,-0.78,-0.78,-0.98,-0.96,-0.67,-0.78
-1.05,-1.06,-1.06,-1.49,-0.96,-0.67,-1.06
-0.96,-0.97,-0.97,-1.31,-0.96,-0.67,-0.97
0.19,0.17,0.18,0.38,0.09,1.04,0.18


In [37]:
pandasx.return_formatted_table \
    (x_train_scaled_SMOTEENN_dataframe,
     'Table 2.7.5: Scaled Features Training SMOTEENN Data')

loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
-0.34,-0.33,-0.34,-0.29,-0.43,-0.67,-0.34
-0.57,-0.6,-0.6,-0.68,-0.43,-0.67,-0.6
0.38,0.39,0.39,0.63,0.61,1.04,0.39
-0.57,-0.57,-0.56,-0.63,-0.43,-0.67,-0.56
-1.0,-0.98,-0.98,-1.33,-0.96,-0.67,-0.98
-0.1,-0.11,-0.11,0.02,0.09,-0.67,-0.11
-0.77,-0.78,-0.78,-0.98,-0.96,-0.67,-0.78
-1.05,-1.06,-1.06,-1.49,-0.96,-0.67,-1.06
-0.96,-0.97,-0.97,-1.31,-0.96,-0.67,-0.97
0.19,0.17,0.18,0.38,0.09,1.04,0.18


# <br> **Section 3: Model Optimization**

## **3.1: Logistic Regression**

### **Original**

In [38]:
parameters_grid_dictionary \
    = {'class_weight': ['balanced', None],
       'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']}

lr_grid_search_model \
    = GridSearchCV \
        (LogisticRegression \
             (random_state = credit_risk_constants.CONSTANT_ML_RANDOM_STATE_1,
              max_iter = credit_risk_constants.CONSTANT_ML_LR_MAX_ITERATIONS),
         parameters_grid_dictionary)

lr_undersampled_grid_search_model = copy.copy(lr_grid_search_model)

lr_oversampled_grid_search_model = copy.copy(lr_grid_search_model)

lr_cluster_centroids_grid_search_model = copy.copy(lr_grid_search_model)

lr_SMOTE_grid_search_model = copy.copy(lr_grid_search_model)

lr_SMOTEENN_grid_search_model = copy.copy(lr_grid_search_model)

In [39]:
lr_grid_search_model.fit(x_train_scaled_dataframe, y_train_series)

clear_output()

In [40]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model best accuracy score is {:.2f}%' \
         .format(lr_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(lr_grid_search_model.best_params_)
     + '\033[0m')

[1mThe logistic regression model best accuracy score is 99.44%

The optimal model hyperparameters are:
{'class_weight': None, 'solver': 'lbfgs'}[0m


### **Random Undersampling**

In [41]:
lr_undersampled_grid_search_model.fit \
    (x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

clear_output()

In [42]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model with random undersampling best accuracy score is {:.2f}%' \
         .format(lr_undersampled_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(lr_undersampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe logistic regression model with random undersampling best accuracy score is 99.47%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'solver': 'lbfgs'}[0m


### **Random Oversampling**

In [43]:
lr_oversampled_grid_search_model.fit \
    (x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

clear_output()

In [44]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model with random oversampling best accuracy score is {:.2f}%' \
         .format(lr_oversampled_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(lr_oversampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe logistic regression model with random oversampling best accuracy score is 99.43%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'solver': 'liblinear'}[0m


### **Cluster Centroids**

In [45]:
lr_cluster_centroids_grid_search_model.fit \
    (x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

clear_output()

In [46]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model with cluster centroid best accuracy score is {:.2f}%' \
         .format(lr_cluster_centroids_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(lr_cluster_centroids_grid_search_model.best_params_)
     + '\033[0m')

[1mThe logistic regression model with cluster centroid best accuracy score is 94.31%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'solver': 'liblinear'}[0m


### **SMOTE**

In [47]:
lr_SMOTE_grid_search_model.fit(x_train_scaled_SMOTE_dataframe, y_train_SMOTE_series)

clear_output()

In [48]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model with SMOTE best accuracy score is {:.2f}%' \
         .format(lr_SMOTE_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(lr_SMOTE_grid_search_model.best_params_)
     + '\033[0m')

[1mThe logistic regression model with SMOTE best accuracy score is 99.44%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'solver': 'liblinear'}[0m


### **SMOTEENN**

In [49]:
lr_SMOTEENN_grid_search_model.fit \
    (x_train_scaled_SMOTEENN_dataframe, y_train_SMOTEENN_series)

clear_output()

In [50]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The logistic regression model with SMOTEENN best accuracy score is {:.2f}%' \
         .format(lr_SMOTEENN_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(lr_SMOTEENN_grid_search_model.best_params_)
     + '\033[0m')

[1mThe logistic regression model with SMOTEENN best accuracy score is 99.80%

The optimal model hyperparameters are:
{'class_weight': None, 'solver': 'liblinear'}[0m


## **3.2: Decision Tree**

### **Original**

In [51]:
parameters_grid_dictionary \
    = {'criterion': ['gini', 'entropy', 'log_loss'],
       'splitter': ['best', 'random'],
       'class_weight': ['balanced', None]}

dt_grid_search_model \
    = GridSearchCV \
        (DecisionTreeClassifier \
            (random_state = credit_risk_constants.CONSTANT_ML_RANDOM_STATE_1),
         parameters_grid_dictionary)

dt_undersampled_grid_search_model = copy.copy(dt_grid_search_model)

dt_oversampled_grid_search_model = copy.copy(dt_grid_search_model)

dt_cluster_centroids_grid_search_model = copy.copy(dt_grid_search_model)

dt_SMOTE_grid_search_model = copy.copy(dt_grid_search_model)

dt_SMOTEENN_grid_search_model = copy.copy(dt_grid_search_model)

In [52]:
dt_grid_search_model.fit(x_train_scaled_dataframe, y_train_series)

clear_output()

In [53]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model best accuracy score is {:.2f}%' \
         .format(dt_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(dt_grid_search_model.best_params_)
     + '\033[0m')

[1mThe decision tree model best accuracy score is 99.09%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'criterion': 'gini', 'splitter': 'best'}[0m


### **Random Undersampling**

In [54]:
dt_undersampled_grid_search_model.fit \
    (x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

clear_output()

In [55]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model with random undersampling best accuracy score is {:.2f}%' \
         .format(dt_undersampled_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(dt_undersampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe decision tree model with random undersampling best accuracy score is 99.15%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'criterion': 'gini', 'splitter': 'random'}[0m


### **Random Oversampling**

In [56]:
dt_oversampled_grid_search_model.fit \
    (x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

clear_output()

In [57]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model with random oversampling best accuracy score is {:.2f}%' \
         .format(dt_oversampled_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(dt_oversampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe decision tree model with random oversampling best accuracy score is 99.53%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'criterion': 'entropy', 'splitter': 'best'}[0m


### **Cluster Centroids**

In [58]:
dt_cluster_centroids_grid_search_model.fit \
    (x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

clear_output()

In [59]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model with cluster centroid best accuracy score is {:.2f}%' \
         .format(dt_cluster_centroids_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(dt_cluster_centroids_grid_search_model.best_params_)
     + '\033[0m')

[1mThe decision tree model with cluster centroid best accuracy score is 89.05%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'criterion': 'entropy', 'splitter': 'random'}[0m


### **SMOTE**

In [60]:
dt_SMOTE_grid_search_model.fit(x_train_scaled_SMOTE_dataframe, y_train_SMOTE_series)

clear_output()

In [61]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model with SMOTE best accuracy score is {:.2f}%' \
         .format(dt_SMOTE_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(dt_SMOTE_grid_search_model.best_params_)
     + '\033[0m')

[1mThe decision tree model with SMOTE best accuracy score is 99.58%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'criterion': 'gini', 'splitter': 'best'}[0m


### **SMOTEENN**

In [62]:
dt_SMOTEENN_grid_search_model.fit \
    (x_train_scaled_SMOTEENN_dataframe, y_train_SMOTEENN_series)

clear_output()

In [63]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The decision tree model with SMOTEENN best accuracy score is {:.2f}%' \
         .format(dt_SMOTEENN_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(dt_SMOTEENN_grid_search_model.best_params_)
     + '\033[0m')

[1mThe decision tree model with SMOTEENN best accuracy score is 99.99%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'criterion': 'gini', 'splitter': 'best'}[0m


## **3.3: Random Forest**

### **Original**

In [64]:
parameters_grid_dictionary \
    = {'criterion': ['gini', 'entropy', 'log_loss'],
       'class_weight': ['balanced', 'balanced_subsample', None]}

rf_grid_search_model \
    = GridSearchCV \
        (RandomForestClassifier \
             (n_estimators = credit_risk_constants.CONSTANT_ML_RF_N_ESTIMATORS,
              random_state = credit_risk_constants.CONSTANT_ML_RANDOM_STATE_1),
         parameters_grid_dictionary)

rf_undersampled_grid_search_model = copy.copy(rf_grid_search_model)

rf_oversampled_grid_search_model = copy.copy(rf_grid_search_model)

rf_cluster_centroids_grid_search_model = copy.copy(rf_grid_search_model)

rf_SMOTE_grid_search_model = copy.copy(rf_grid_search_model)

rf_SMOTEENN_grid_search_model = copy.copy(rf_grid_search_model)

In [65]:
rf_grid_search_model.fit(x_train_scaled_dataframe, y_train_series)

clear_output()

In [66]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model best accuracy score is {:.2f}%' \
         .format(rf_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(rf_grid_search_model.best_params_)
     + '\033[0m')

[1mThe random forest model best accuracy score is 99.25%

The optimal model hyperparameters are:
{'class_weight': 'balanced_subsample', 'criterion': 'entropy'}[0m


### **Random Undersampling**

In [67]:
rf_undersampled_grid_search_model.fit \
    (x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

clear_output()

In [68]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model with random undersampling best accuracy score is {:.2f}%' \
         .format(rf_undersampled_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(rf_undersampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe random forest model with random undersampling best accuracy score is 99.18%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'criterion': 'gini'}[0m


### **Random Oversampling**

In [69]:
rf_oversampled_grid_search_model.fit \
    (x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

clear_output()

In [70]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model with random oversampling best accuracy score is {:.2f}%' \
         .format(rf_oversampled_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(rf_oversampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe random forest model with random oversampling best accuracy score is 99.53%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'criterion': 'gini'}[0m


### **Cluster Centroids**

In [71]:
rf_cluster_centroids_grid_search_model.fit \
    (x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

clear_output()

In [72]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model with cluster centroid best accuracy score is {:.2f}%' \
         .format(rf_cluster_centroids_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(rf_cluster_centroids_grid_search_model.best_params_)
     + '\033[0m')

[1mThe random forest model with cluster centroid best accuracy score is 90.42%

The optimal model hyperparameters are:
{'class_weight': 'balanced', 'criterion': 'gini'}[0m


### **SMOTE**

In [73]:
rf_SMOTE_grid_search_model.fit(x_train_scaled_SMOTE_dataframe, y_train_SMOTE_series)

clear_output()

In [74]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model with SMOTE best accuracy score is {:.2f}%' \
         .format(rf_SMOTE_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(rf_SMOTE_grid_search_model.best_params_)
     + '\033[0m')

[1mThe random forest model with SMOTE best accuracy score is 99.58%

The optimal model hyperparameters are:
{'class_weight': 'balanced_subsample', 'criterion': 'gini'}[0m


### **SMOTEENN**

In [75]:
rf_SMOTEENN_grid_search_model.fit \
    (x_train_scaled_SMOTEENN_dataframe, y_train_SMOTEENN_series)

clear_output()

In [76]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The random forest model with SMOTEENN best accuracy score is {:.2f}%' \
         .format(rf_SMOTEENN_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(rf_SMOTEENN_grid_search_model.best_params_)
     + '\033[0m')

[1mThe random forest model with SMOTEENN best accuracy score is 99.98%

The optimal model hyperparameters are:
{'class_weight': 'balanced_subsample', 'criterion': 'entropy'}[0m


## **3.4: K-Nearest Neighbor (KNN)**

### **Original**

In [77]:
parameters_grid_dictionary \
    = {'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

knn_grid_search_model \
    = GridSearchCV \
        (KNeighborsClassifier(leaf_size = credit_risk_constants.CONSTANT_ML_KNN_LEAF_SIZE),
         parameters_grid_dictionary)

knn_undersampled_grid_search_model = copy.copy(knn_grid_search_model)

knn_oversampled_grid_search_model = copy.copy(knn_grid_search_model)

knn_cluster_centroids_grid_search_model = copy.copy(knn_grid_search_model)

knn_SMOTE_grid_search_model = copy.copy(knn_grid_search_model)

knn_SMOTEENN_grid_search_model = copy.copy(knn_grid_search_model)

In [78]:
knn_grid_search_model.fit(x_train_scaled_dataframe, y_train_series)

clear_output()

In [79]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model best accuracy score is {:.2f}%' \
         .format(knn_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(knn_grid_search_model.best_params_)
     + '\033[0m')

[1mThe knn model best accuracy score is 99.41%

The optimal model hyperparameters are:
{'algorithm': 'auto'}[0m


### **Random Undersampling**

In [80]:
knn_undersampled_grid_search_model.fit \
    (x_train_scaled_undersampled_dataframe, y_train_undersampled_series)

clear_output()

In [81]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model with random undersampling best accuracy score is {:.2f}%' \
         .format(knn_undersampled_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(knn_undersampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe knn model with random undersampling best accuracy score is 99.47%

The optimal model hyperparameters are:
{'algorithm': 'auto'}[0m


### **Random Oversampling**

In [82]:
knn_oversampled_grid_search_model.fit \
    (x_train_scaled_oversampled_dataframe, y_train_oversampled_series)

clear_output()

In [83]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model with random oversampling best accuracy score is {:.2f}%' \
         .format(knn_oversampled_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(knn_oversampled_grid_search_model.best_params_)
     + '\033[0m')

[1mThe knn model with random oversampling best accuracy score is 99.44%

The optimal model hyperparameters are:
{'algorithm': 'auto'}[0m


### **Cluster Centroids**

In [84]:
knn_cluster_centroids_grid_search_model.fit \
    (x_train_scaled_cluster_centroids_dataframe, y_train_cluster_centroids_series)

clear_output()

In [85]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model with cluster centroid best accuracy score is {:.2f}%' \
         .format(knn_cluster_centroids_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(knn_cluster_centroids_grid_search_model.best_params_)
     + '\033[0m')

[1mThe knn model with cluster centroid best accuracy score is 94.37%

The optimal model hyperparameters are:
{'algorithm': 'auto'}[0m


### **SMOTE**

In [86]:
knn_SMOTE_grid_search_model.fit \
    (x_train_scaled_SMOTE_dataframe, y_train_SMOTE_series)

clear_output()

In [87]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model with SMOTE best accuracy score is {:.2f}%' \
         .format(knn_SMOTE_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(knn_SMOTE_grid_search_model.best_params_)
     + '\033[0m')

[1mThe knn model with SMOTE best accuracy score is 99.61%

The optimal model hyperparameters are:
{'algorithm': 'auto'}[0m


### **SMOTEENN**

In [88]:
knn_SMOTEENN_grid_search_model.fit \
    (x_train_scaled_SMOTEENN_dataframe, y_train_SMOTEENN_series)

clear_output()

In [89]:
logx.print_and_log_text \
    ('\033[1m'
     + 'The knn model with SMOTEENN best accuracy score is {:.2f}%' \
         .format(knn_SMOTEENN_grid_search_model.best_score_ * 100)
     + '\n\nThe optimal model hyperparameters are:\n'
     + str(knn_SMOTEENN_grid_search_model.best_params_)
     + '\033[0m')

[1mThe knn model with SMOTEENN best accuracy score is 99.96%

The optimal model hyperparameters are:
{'algorithm': 'auto'}[0m


# <br> **Section 4: Save Grid Search Models To Files**

## **4.1: Logistic Regression**

### **Original**

In [90]:
pickle.dump \
    (lr_grid_search_model,
     open(credit_risk_constants.CONSTANT_LR_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [91]:
pickle.dump \
    (lr_undersampled_grid_search_model,
     open(credit_risk_constants.CONSTANT_LR_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [92]:
pickle.dump \
    (lr_oversampled_grid_search_model,
     open(credit_risk_constants.CONSTANT_LR_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [93]:
pickle.dump \
    (lr_cluster_centroids_grid_search_model,
     open(credit_risk_constants.CONSTANT_LR_CLUSTER_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **SMOTE**

In [94]:
pickle.dump \
    (lr_SMOTE_grid_search_model,
     open(credit_risk_constants.CONSTANT_LR_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **SMOTEENN**

In [95]:
pickle.dump \
    (lr_SMOTEENN_grid_search_model,
     open(credit_risk_constants.CONSTANT_LR_SMOTEENN_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

## **4.2: Decision Tree**

### **Original**

In [96]:
pickle.dump \
    (dt_grid_search_model,
     open(credit_risk_constants.CONSTANT_DT_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [97]:
pickle.dump \
    (dt_undersampled_grid_search_model,
     open(credit_risk_constants.CONSTANT_DT_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [98]:
pickle.dump \
    (dt_oversampled_grid_search_model,
     open(credit_risk_constants.CONSTANT_DT_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [99]:
pickle.dump \
    (dt_cluster_centroids_grid_search_model,
     open(credit_risk_constants.CONSTANT_DT_CLUSTER_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **SMOTE**

In [100]:
pickle.dump \
    (dt_SMOTE_grid_search_model,
     open(credit_risk_constants.CONSTANT_DT_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **SMOTEENN**

In [101]:
pickle.dump \
    (dt_SMOTEENN_grid_search_model,
     open(credit_risk_constants.CONSTANT_DT_SMOTEENN_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

## **4.3: Random Forest**

### **Original**

In [102]:
pickle.dump \
    (rf_grid_search_model,
     open(credit_risk_constants.CONSTANT_RF_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [103]:
pickle.dump \
    (rf_undersampled_grid_search_model,
     open(credit_risk_constants.CONSTANT_RF_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [104]:
pickle.dump \
    (rf_oversampled_grid_search_model,
     open(credit_risk_constants.CONSTANT_RF_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [105]:
pickle.dump \
    (rf_cluster_centroids_grid_search_model,
     open(credit_risk_constants.CONSTANT_RF_CLUSTER_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **SMOTE**

In [106]:
pickle.dump \
    (rf_SMOTE_grid_search_model,
     open(credit_risk_constants.CONSTANT_RF_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **SMOTEENN**

In [107]:
pickle.dump \
    (rf_SMOTEENN_grid_search_model,
     open(credit_risk_constants.CONSTANT_RF_SMOTEENN_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

## **4.4: K-Nearest Neighbor (KNN)**

### **Original**

In [108]:
pickle.dump \
    (knn_grid_search_model,
     open(credit_risk_constants.CONSTANT_KNN_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Undersampling**

In [109]:
pickle.dump \
    (knn_undersampled_grid_search_model,
     open(credit_risk_constants.CONSTANT_KNN_UNDERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Random Oversampling**

In [110]:
pickle.dump \
    (knn_oversampled_grid_search_model,
     open(credit_risk_constants.CONSTANT_KNN_OVERSAMPLED_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **Cluster Centroids**

In [111]:
pickle.dump \
    (knn_cluster_centroids_grid_search_model,
     open(credit_risk_constants.CONSTANT_KNN_CLUSTER_CENTROIDS_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **SMOTE**

In [112]:
pickle.dump \
    (knn_SMOTE_grid_search_model,
     open(credit_risk_constants.CONSTANT_KNN_SMOTE_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

### **SMOTEENN**

In [113]:
pickle.dump \
    (knn_SMOTEENN_grid_search_model,
     open(credit_risk_constants.CONSTANT_KNN_SMOTEENN_GRID_SEARCH_MODEL_FILE_PATH, 'wb'))

In [114]:
# logx.end_program()