In [1]:
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
"""
outputted:

Loaded 81460 rows and 35 columns from accepted_loans_ml_training
Label distribution (is_default):
is_default
0    71365
1    10095
Name: count, dtype: int64 

Numeric features: ['loan_amnt', 'funded_amnt', 'term_months', 'int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'open_acc', 'total_acc', 'revol_bal', 'revol_util', 'pub_rec_bankruptcies']
Categorical features: ['purpose', 'application_type', 'activity_year', 'action_taken', 'preapproval', 'loan_to_value_ratio', 'total_loan_costs', 'derived_loan_product_type', 'loan_purpose', 'home_ownership', 'verification_status', 'income', 'debt_to_income_ratio', 'applicant_credit_score_type', 'co_applicant_credit_score_type']
Train size: (65168, 31)
Test size: (16292, 31)

=== Training model ===
Training complete.

=== Evaluation on TEST set ===

Classification report:
              precision    recall  f1-score   support

           0       0.88      1.00      0.93     14273
           1       0.57      0.00      0.00      2019

    accuracy                           0.88     16292
   macro avg       0.72      0.50      0.47     16292
weighted avg       0.84      0.88      0.82     16292

Confusion matrix (rows=true, cols=pred):
[[14270     3]
 [ 2015     4]]

y = df["is_default"] -> Dependent variable
X = df.drop(columns=["is_default", "loan_id", "borrower_id", "loan_status"]) -> Independent variables
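    -> Note: loan_id and borrower_id are dropped because they only identify rows and carry no predictive signal; loan_status is dropped as well (presumably because is_default is derived from it, so keeping it would leak the label -- TODO confirm)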

"""

In [2]:
from sqlalchemy import create_engine

def load_training_data(db_url: str = "postgresql+psycopg2:///credit_risk", NAME: str = "") -> pd.DataFrame:
    """Read every row of one database table into a DataFrame.

    Parameters
    ----------
    db_url : str
        SQLAlchemy connection URL. Defaults to the ``credit_risk`` database
        reached through the ``postgresql+psycopg2`` dialect/driver.
    NAME : str
        Name of the table (or view) to read. It must be supplied: the empty
        default exists only so the parameter can follow ``db_url``.
        NOTE: the value is interpolated verbatim into the SQL text, so pass
        only trusted, hard-coded names -- never user-supplied input.

    Returns
    -------
    pd.DataFrame
        The result of ``SELECT * FROM NAME``.

    Raises
    ------
    ValueError
        If ``NAME`` is missing, blank, or not a string.
    """
    # The previous signature used ``NAME = str``, which made the *type* ``str``
    # the default value; a call without NAME then executed
    # "SELECT * FROM <class 'str'>". Reject that case up front with a clear error.
    if not isinstance(NAME, str) or not NAME.strip():
        raise ValueError(
            "NAME must be a non-empty table name, e.g. load_training_data(NAME='borrowers')"
        )

    engine = create_engine(db_url)
    try:
        df = pd.read_sql(f"SELECT * FROM {NAME}", engine)
    finally:
        # A new engine is built on every call; dispose of it so its connection
        # pool does not linger in the kernel after the query finishes or fails.
        engine.dispose()

    print(f"Loaded {df.shape[0]} rows and {df.shape[1]} columns from {NAME}")
    return df


In [None]:
# Pull the three base tables into memory; each call prints the loaded shape.
borrowers = load_training_data(NAME="borrowers")
accepted_loans = load_training_data(NAME="accepted_loans")
rejected = load_training_data(NAME="rejected")

In [32]:
# Show the column dtypes of `borrowers`; swap in borrowers.head() to preview rows instead.
borrowers.dtypes

borrower_id                                     int64
annual_inc                                    float64
dti                                           float64
delinq_2yrs                                   float64
fico_range_low                                float64
fico_range_high                               float64
inq_last_6mths                                float64
open_acc                                      float64
total_acc                                     float64
revol_bal                                     float64
revol_util                                    float64
pub_rec_bankruptcies                          float64
home_ownership                                 object
verification_status                            object
income                                        float64
debt_to_income_ratio                          float64
applicant_credit_score_type                    object
co_applicant_credit_score_type                 object
created_at                  

In [43]:
# Show the column dtypes of `accepted_loans`; swap in accepted_loans.head(10) to preview rows instead.
accepted_loans.dtypes

loan_id                                    int64
borrower_id                                int64
loan_amnt                                float64
funded_amnt                              float64
term_months                                int64
int_rate                                 float64
installment                              float64
loan_status                               object
purpose                                   object
application_type                          object
activity_year                            float64
action_taken                             float64
preapproval                              float64
loan_to_value_ratio                      float64
total_loan_costs                          object
derived_loan_product_type                 object
loan_purpose                              object
created_at                   datetime64[ns, UTC]
dtype: object

In [36]:
# Load the valid_accepted_hdma table (the loader prints its shape).
valid_accepted_hdma = load_training_data(NAME="valid_accepted_hdma")
# Show column dtypes; swap in valid_accepted_hdma.head(10) to preview rows instead.
valid_accepted_hdma.dtypes

Loaded 199803 rows and 14 columns from valid_accepted_hdma


loan_amount                       float64
loan_term                           int64
interest_rate                      object
income                            float64
debt_to_income_ratio               object
applicant_credit_score_type        object
co_applicant_credit_score_type     object
activity_year                       int64
action_taken                        int64
preapproval                         int64
loan_to_value_ratio               float64
total_loan_costs                   object
derived_loan_product_type          object
loan_purpose                       object
dtype: object

In [37]:
# Load the valid_accepted_kaggle table (the loader prints its shape).
valid_accepted_kaggle = load_training_data(NAME="valid_accepted_kaggle")
# Show column dtypes; swap in valid_accepted_kaggle.head(10) to preview rows instead.
valid_accepted_kaggle.dtypes

Loaded 82074 rows and 22 columns from valid_accepted_kaggle


id                        int64
loan_amnt               float64
funded_amnt             float64
term_months              object
int_rate                float64
installment             float64
annual_inc              float64
dti                     float64
delinq_2yrs               int64
fico_range_low            int64
fico_range_high           int64
inq_last_6mths            int64
open_acc                  int64
total_acc                 int64
revol_bal               float64
revol_util              float64
pub_rec_bankruptcies      int64
home_ownership           object
verification_status      object
loan_status              object
purpose                  object
application_type         object
dtype: object

In [42]:
# Show the column dtypes of the `rejected` table.
rejected.dtypes

application_id                                  int64
dataset_source                                 object
amount_requested                              float64
application_date                               object
loan_title                                     object
dti                                           float64
activity_year                                 float64
action_taken                                  float64
preapproval                                   float64
loan_purpose                                   object
loan_amount                                   float64
loan_term                                     float64
loan_to_value_ratio                           float64
income                                        float64
derived_loan_product_type                      object
applicant_credit_score_type                    object
co_applicant_credit_score_type                 object
denial_reason_1                                object
created_at                  

In [41]:
# Load the valid_rejected_hdma table (the loader prints its shape).
valid_rejected_hdma = load_training_data(NAME="valid_rejected_hdma")
# Only the last expression of a cell is rendered, so a bare `.dtypes` here was
# silently discarded. `display` (provided by the IPython kernel) shows it explicitly.
display(valid_rejected_hdma.dtypes)
# Preview the first ten rows as the cell's output.
valid_rejected_hdma.head(10)

Loaded 186596 rows and 13 columns from valid_rejected_hdma


Unnamed: 0,activity_year,action_taken,preapproval,loan_purpose,loan_amount,loan_term,loan_to_value_ratio,income,debt_to_income_ratio,derived_loan_product_type,applicant_credit_score_type,co_applicant_credit_score_type,denial_reason_1
0,2023,3,2,31,325000.0,144,116.14,73.0,60.0,Conventional:Subordinate Lien,Equifax,No co-applicant,Debt-to-income ratio
1,2023,3,2,32,195000.0,360,80.0,25.0,60.0,FHA:First Lien,Equifax,No co-applicant,Debt-to-income ratio
2,2023,3,2,32,75000.0,276,96.5,0.0,60.0,Conventional:First Lien,Equifax,No co-applicant,Credit history
3,2023,3,2,32,165000.0,360,45.84,26.0,55.0,FHA:First Lien,Equifax,No co-applicant,Debt-to-income ratio
4,2023,3,2,1,135000.0,300,96.5,31.0,55.0,Conventional:First Lien,Equifax,No co-applicant,Debt-to-income ratio
5,2023,3,2,1,375000.0,360,96.5,37.0,60.0,FHA:First Lien,Equifax,No co-applicant,Debt-to-income ratio
6,2023,3,2,32,965000.0,360,112.09,285.0,55.0,VA:First Lien,Equifax,No co-applicant,Collateral
7,2023,3,2,32,175000.0,360,65.96,76.0,60.0,Conventional:First Lien,FICO,Equifax,Credit history
8,2023,3,2,4,55000.0,240,46.06,96.0,24.5,Conventional:Subordinate Lien,Other Model,No co-applicant,Credit application incomplete
9,2023,3,2,2,165000.0,240,75.0,137.0,32.5,Conventional:First Lien,Equifax,No co-applicant,Other


In [40]:
# Load the valid_rejected_kaggle table (the loader prints its shape).
valid_rejected_kaggle = load_training_data(NAME="valid_rejected_kaggle")
# Only the last expression of a cell is rendered, so a bare `.dtypes` here was
# silently discarded. `display` (provided by the IPython kernel) shows it explicitly.
display(valid_rejected_kaggle.dtypes)
# Preview the first ten rows as the cell's output.
valid_rejected_kaggle.head(10)

Loaded 138132 rows and 4 columns from valid_rejected_kaggle


Unnamed: 0,Amount Requested,Application Date,Loan Title,Debt-To-Income Ratio
0,5000.0,2017-01-24,other,25.83
1,12000.0,2017-10-10,debt_consolidation,10.08
2,5000.0,2018-07-15,other,21.48
3,12000.0,2015-11-15,other,21.57
4,1000.0,2014-08-30,other,5.58
5,5000.0,2017-12-16,other,0.0
6,15000.0,2016-10-11,debt_consolidation,30.15
7,22000.0,2013-07-05,credit_card,20.37
8,2000.0,2014-10-24,debt_consolidation,0.0
9,26000.0,2018-04-05,debt_consolidation,34.7
