In [1]:
#*******************************************************************************************
 #
 #  File Name:  CreditRiskClassification.ipynb
 #
 #  File Description:
 #      This interactive Python notebook, CreditRiskClassification.ipynb, reads a csv file,
 #      LendingData.csv, in the Resources folder, and uses Python and supervised learning
 #      methods to use a dataset of historical lending activity from a peer-to-peer
 #      lending services company to build a model that can identify the creditworthiness
 #      of borrowers. 
 #
 #      I reinstalled the scikit-learn module using the following commands to allow the
 #      RandomOverSampler function to work:
 #
 #      pip3 uninstall scikit-learn
 #      pip3 install scikit-learn==1.2.2
 #
 #
 #  Date            Description                             Programmer
 #  ----------      ------------------------------------    ------------------
 #  11/25/2023      Initial Development                     N. James George
 #
 #******************************************************************************************/

import CreditRiskClassificationFunctions as local_function

import PyConstants as constant
import PyFunctions as function
import PyLogConstants as log_constant
import PyLogFunctions as log_function
import PyLogSubRoutines as log_subroutine
import PySubRoutines as subroutine

import numpy as np
import pandas as pd

from pathlib import Path

from imblearn.over_sampling import RandomOverSampler

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [2]:
# Notebook-level constants: this notebook's file name and the input CSV path.
CONSTANT_LOCAL_FILE_NAME = 'CreditRiskClassification.ipynb'
CONSTANT_LENDING_DATA_CSV_FILE_PATH = './Resources/LendingData.csv'

# Pass False to the log, debug, and image mode setters (presumably switching
# each mode off -- confirm in PyLogSubRoutines), then announce the start of
# program execution under this program's name.
log_subroutine.SetLogMode(False)
log_subroutine.SetDebugMode(False)
log_subroutine.SetImageMode(False)
log_subroutine.BeginProgramExecution('CreditRiskClassification')

# <br> **Section 1: Split the Data into Training and Testing Sets**

## **1.1: Read the CSV data from the `Resources` folder into a Pandas DataFrame**

### **Read the CSV data from the `Resources` folder**

In [3]:
# Load the lending CSV through the project's CSV-to-DataFrame helper.
lendingDataFrame = function.ReturnCSVFileAsDataFrame(
    CONSTANT_LENDING_DATA_CSV_FILE_PATH
)

# Hand the loaded frame to the debug logging helper.
log_function.DebugReturnObjectWriteObject(lendingDataFrame)

### **Display Lending DataFrame**

In [4]:
captionString = 'Table 1.2.1: Lending Data Table'

# Style the first 12 rows with the project's standard format, then render the
# dollar-valued columns as currency and the interest rate as a percentage.
currentStylerObject = (
    function
    .ReturnStylerObjectStandardFormat(lendingDataFrame.head(12), captionString)
    .format(
        {
            'loan_size': '${:,.0f}',
            'interest_rate': '{:.2f}%',
            'borrower_income': '${:,.0f}',
            'total_debt': '${:,.0f}',
        }
    )
)

# Final expression of the cell: its return value is what the notebook displays.
log_function.ReturnStylerObjectSavePNGImage(currentStylerObject, captionString)

loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
"$10,700",7.67%,"$52,800",0.431818,5,1,"$22,800",0
"$8,400",6.69%,"$43,600",0.311927,3,0,"$13,600",0
"$9,000",6.96%,"$46,100",0.349241,3,0,"$16,100",0
"$10,700",7.66%,"$52,700",0.43074,5,1,"$22,700",0
"$10,800",7.70%,"$53,000",0.433962,5,1,"$23,000",0
"$10,100",7.44%,"$50,600",0.407115,4,1,"$20,600",0
"$10,300",7.49%,"$51,100",0.412916,4,1,"$21,100",0
"$8,800",6.86%,"$45,100",0.334812,3,0,"$15,100",0
"$9,300",7.10%,"$47,400",0.367089,3,0,"$17,400",0
"$9,700",7.25%,"$48,800",0.385246,4,0,"$18,800",0


## **1.2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.**

### **Separate the y variable, the labels**

In [5]:
# Labels (y): the 'loan_status' column of the lending data.
ySeries = lendingDataFrame['loan_status']

log_function.DebugReturnObjectWriteObject(ySeries)

### **Separate the X variable, the features**

In [6]:
# Features (X): every column except the 'loan_status' label.
# `columns=` already targets the column axis, so the redundant `axis = 1`
# argument (ignored by pandas when `columns` is supplied) has been removed.
xDataFrame = lendingDataFrame.drop(columns='loan_status')

log_function.DebugReturnObjectWriteObject(xDataFrame)

### **Review the Y Variable Series**

In [7]:
captionString = 'Table 1.3.1: Y Variable Series'

# Convert the label Series to a one-column frame and style its first 12 rows.
currentStylerObject = function.ReturnStylerObjectStandardFormat(
    ySeries.to_frame().head(12),
    captionString,
)

# Final expression of the cell: its return value is what the notebook displays.
log_function.ReturnStylerObjectSavePNGImage(currentStylerObject, captionString)

loan_status
0
0
0
0
0
0
0
0
0
0


### **Review the X Variable DataFrame**

In [8]:
captionString = 'Table 1.3.2: X Variable DataFrame'

# Style the first 12 feature rows, rendering the dollar-valued columns as
# currency and the interest rate as a percentage.
currentStylerObject = (
    function
    .ReturnStylerObjectStandardFormat(xDataFrame.head(12), captionString)
    .format(
        {
            'loan_size': '${:,.0f}',
            'interest_rate': '{:.2f}%',
            'borrower_income': '${:,.0f}',
            'total_debt': '${:,.0f}',
        }
    )
)

# Final expression of the cell: its return value is what the notebook displays.
log_function.ReturnStylerObjectSavePNGImage(currentStylerObject, captionString)

loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
"$10,700",7.67%,"$52,800",0.431818,5,1,"$22,800"
"$8,400",6.69%,"$43,600",0.311927,3,0,"$13,600"
"$9,000",6.96%,"$46,100",0.349241,3,0,"$16,100"
"$10,700",7.66%,"$52,700",0.43074,5,1,"$22,700"
"$10,800",7.70%,"$53,000",0.433962,5,1,"$23,000"
"$10,100",7.44%,"$50,600",0.407115,4,1,"$20,600"
"$10,300",7.49%,"$51,100",0.412916,4,1,"$21,100"
"$8,800",6.86%,"$45,100",0.334812,3,0,"$15,100"
"$9,300",7.10%,"$47,400",0.367089,3,0,"$17,400"
"$9,700",7.25%,"$48,800",0.385246,4,0,"$18,800"


## **1.3: Check the balance of the labels variable (`y`) by using the `value_counts` function.**

In [9]:
# Count the rows per label to check class balance before splitting
# (0 = healthy loan, 1 = high-risk loan, per the question text in this notebook).
ySeries.value_counts()

loan_status
0    75036
1     2500
Name: count, dtype: int64

## **1.4: Split the data into training and testing datasets by using `train_test_split`.**

In [10]:
# Split features and labels into training and testing sets;
# random_state = 1 keeps the shuffle reproducible across runs.
(xTrainDataFrame,
 xTestDataFrame,
 yTrainSeries,
 yTestSeries) = train_test_split(xDataFrame, ySeries, random_state=1)

In [11]:
# Pass the training features to the debug logging helper.
log_function.DebugReturnObjectWriteObject(xTrainDataFrame)

In [12]:
# Pass the testing features to the debug logging helper.
log_function.DebugReturnObjectWriteObject(xTestDataFrame)

In [13]:
# Pass the training labels to the debug logging helper.
log_function.DebugReturnObjectWriteObject(yTrainSeries)

In [14]:
# Pass the testing labels to the debug logging helper.
log_function.DebugReturnObjectWriteObject(yTestSeries)

# <br> **Section 2: Create a Logistic Regression Model with the Original Data**

## **2.1: Fit a logistic regression model by using the training data.**

In [15]:
# Create a logistic regression classifier with random_state = 1 and fit it on
# the original (non-resampled) training split. The value returned by fit() is
# kept as the model used for prediction below.
modelLogisticRegression = LogisticRegression(random_state=1).fit(
    xTrainDataFrame,
    yTrainSeries,
)

log_function.DebugReturnObjectWriteObject(modelLogisticRegression)

## **2.2: Save the predictions on the testing data labels by using the testing feature data and the fitted model.**

In [16]:
# Predict labels for the held-out test features with the original-data model.
predictionsNumpyArray = modelLogisticRegression.predict(xTestDataFrame)

log_function.DebugReturnObjectWriteObject(predictionsNumpyArray)

In [17]:
# Pair each prediction with the true test label.
predictionDictionary = {
    'Prediction': predictionsNumpyArray,
    'Actual': yTestSeries,
}

# The frame inherits yTestSeries' shuffled row labels from the train/test
# split; reset them to 0..n-1 so this table is built the same way as the
# resampled-model prediction table later in the notebook.
predictionDataFrame = (
    pd.DataFrame(predictionDictionary)
    .reset_index(drop=True)
)

log_function.DebugReturnObjectWriteObject(predictionDataFrame)

In [18]:
captionString = 'Table 2.2.1: Testing Data Label Predictions (Original)'

# Style the first 12 prediction/actual pairs for display.
currentStylerObject = function.ReturnStylerObjectStandardFormat(
    predictionDataFrame.head(12),
    captionString,
)

# Final expression of the cell: its return value is what the notebook displays.
log_function.ReturnStylerObjectSavePNGImage(currentStylerObject, captionString)

Prediction,Actual
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0


## **2.3: Evaluate the model’s performance**

In [19]:
# Evaluate the original-data model: the project helper receives the true test
# labels and the predictions and returns three values, unpacked here as the
# accuracy score, confusion matrix, and classification report.
(accuracyScoreOriginalFloat,
 confusionMatrixOriginalDataFrame,
 classificationReportOriginalString) = local_function.ModelPerformanceEvaluatorFunction(
    yTestSeries,
    predictionsNumpyArray,
)

[1mLOGISTIC REGRESSION MODEL (Original)
[0m
1) [1mAccuracy Score: [0m95.2%

2) [1mConfusion Matrix:
[0m
                  Predicted Healthy  Predicted High-Risk
Actual Healthy                18663                  102
Actual High-Risk                 56                  563

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

     healthy       1.00      0.99      1.00     18765
   high risk       0.85      0.91      0.88       619

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384




## **Question:** 
### How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

## **Answer:**
### This logistic regression model does an excellent job predicting healthy loans with a small number of false positives and negatives leading to a precision score of 100%, a recall score of 99%, and an f1-score of 100%.  Nevertheless, this model less accurately predicts high-risk loans with a precision of 85%, a recall of 91%, and an f1-score of 88%. The balanced accuracy, 95%, is lower than the overall accuracy, 99%, because of the significant discrepancy in label value counts, 75,036 vs. 2,500: the overall accuracy is dominated by the majority healthy-loan class, whereas the balanced accuracy averages the recall of both classes. The model's potential for an increase in accuracy and the comparatively inadequate performance in predicting high-risk loans vs. healthy loans are concerning. Thus, the model warrants further optimization either by closing the value count gap with additional data or random oversampling.

# <br> **Section 3: Predict a Logistic Regression Model with Resampled Training Data**

## **3.1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points.**

### **Instantiate the Random Oversampler Model**

In [20]:
# Randomly oversample the training split only (the test split is left as-is);
# random_state = 1 keeps the resampling reproducible.
xResampledDataFrame, yResampledSeries = RandomOverSampler(random_state=1).fit_resample(
    xTrainDataFrame,
    yTrainSeries,
)

In [21]:
# Pass the resampled training features to the debug logging helper.
log_function.DebugReturnObjectWriteObject(xResampledDataFrame)

In [22]:
# Pass the resampled training labels to the debug logging helper.
log_function.DebugReturnObjectWriteObject(yResampledSeries)

### **Count the distinct values of the resampled labels data**

In [23]:
# Confirm that, after oversampling, both labels have the same number of rows.
yResampledSeries.value_counts()

loan_status
0    56271
1    56271
Name: count, dtype: int64

## **3.2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.**

### **Fit a logistic regression model by using the resampled training data.**

In [24]:
# Create a second logistic regression classifier with random_state = 1 and
# fit it on the resampled training data. The value returned by fit() is kept
# as the model used for prediction below.
modelResampledLogisticRegression = LogisticRegression(random_state=1).fit(
    xResampledDataFrame,
    yResampledSeries,
)

log_function.DebugReturnObjectWriteObject(modelResampledLogisticRegression)

### **Save the predictions on the testing data labels by using the testing feature data and the fitted model.**

In [25]:
# Predict labels for the same held-out test features with the resampled-data model.
predictionsResampledNumpyArray = modelResampledLogisticRegression.predict(xTestDataFrame)

log_function.DebugReturnObjectWriteObject(predictionsResampledNumpyArray)

In [26]:
# Pair each resampled-model prediction with the true test label.
predictionResampledDictionary = {
    'Prediction': predictionsResampledNumpyArray,
    'Actual': yTestSeries,
}

# Build the comparison frame and replace the row labels inherited from
# yTestSeries with a fresh 0..n-1 index.
predictionResampledDataFrame = (
    pd.DataFrame(predictionResampledDictionary)
    .reset_index(drop=True)
)

log_function.DebugReturnObjectWriteObject(predictionResampledDataFrame)

In [27]:
# This table belongs to Section 3.2; the caption previously reused the
# 'Table 2.2.1' number of the original-model table. The caption is also passed
# to the PNG-saving helper, so it is kept distinct per table.
captionString = 'Table 3.2.1: Testing Data Label Predictions (Resampled)'

# Style the first 12 prediction/actual pairs for display.
currentStylerObject = function.ReturnStylerObjectStandardFormat(
    predictionResampledDataFrame.head(12),
    captionString,
)

# Final expression of the cell: its return value is what the notebook displays.
log_function.ReturnStylerObjectSavePNGImage(currentStylerObject, captionString)

Prediction,Actual
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0
0,0


## **3.3: Evaluate the model’s performance with resampled data.**

In [28]:
# Evaluate the resampled-data model against the same test labels.
# The results are stored under their own "Resampled" names; previously they
# were assigned to the "...Original..." variables, silently overwriting the
# metrics computed for the original-data model.
# The third positional argument, False, is passed to the helper as in the
# original call; the printed report for this call is headed
# "Random Oversampling" -- presumably this flag selects that label
# (confirm in CreditRiskClassificationFunctions).
(accuracyScoreResampledFloat,
 confusionMatrixResampledDataFrame,
 classificationReportResampledString) = local_function.ModelPerformanceEvaluatorFunction(
    yTestSeries,
    predictionsResampledNumpyArray,
    False,
)

[1mLOGISTIC REGRESSION MODEL (Random Oversampling)
[0m
1) [1mAccuracy Score: [0m99.4%

2) [1mConfusion Matrix:
[0m
                  Predicted Healthy  Predicted High-Risk
Actual Healthy                18649                  116
Actual High-Risk                  4                  615

3) [1mClassification Report:
[0m
              precision    recall  f1-score   support

     healthy       1.00      0.99      1.00     18765
   high risk       0.84      0.99      0.91       619

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384




## **Question:** 
### How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

## **Answer:**

### In terms of accuracy, this logistic regression model with random oversampling matches the first model for predicting healthy loans and outperforms it for high-risk loans. For instance, the number of accepted healthy loans falls (18,663 to 18,649); the number of rejected high-risk loans expands (563 to 615); the number of false positives increases slightly (102 to 116); and the number of false negatives significantly drops (56 to 4). Moreover, using random oversampling, which randomly duplicates existing minority-class samples in the training data, eliminates the label value count discrepancy leading to, among other things, the balanced accuracy score matching the overall accuracy score, 99%. For healthy loans, both models have 100% precision, 99% recall, and 100% f1-scores; for high-risk loans, although the precision, 85%, declines by 1% to 84%, the recall, 91%, increases by 8% to 99%, and the f1-score, 88%, increases by 3% to 91%. Consequently, using random oversampling with the logistic regression model maintains its identification of healthy loans while improving its identification of high-risk loans.

In [29]:
#log_subroutine \
#    .EndProgramExecution()