In [1]:
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("postgresql+psycopg2:///credit_risk")

df = pd.read_sql("SELECT * FROM accepted_loans_ml_training;", engine)

df.head()

Unnamed: 0,loan_id,borrower_id,is_default,loan_amnt,installment,dti,income,term_months,purpose
0,96668860,25130,0,3975.0,143.69,23.67,43000.0,36,major_purchase
1,118207599,61983,0,4800.0,168.81,11.37,80000.0,36,other
2,127053376,33038,0,8400.0,268.85,8.76,50000.0,36,credit_card
3,69307155,80398,0,16000.0,531.36,16.89,179000.0,36,credit_card
4,21700773,10317,0,10000.0,315.17,22.38,31000.0,36,debt_consolidation


In [None]:
income_cap = df["income"].quantile(0.99)
#apply income cap
df["income"] = df["income"].clip(upper=income_cap)
df["income"].describe()
(df["income"] == income_cap).sum() # 866 outliers

np.int64(866)

In [None]:
#drop id cols
ids = ["loan_id", "borrower_id"]

target = "is_default"
num = ["loan_amnt", "installment", "dti", "income", "term_months"]
cat = ["purpose"]

#define x and y
X = df[num + cat]
y = df[target]

X.head(), y.head()

(   loan_amnt  installment    dti    income  term_months             purpose
 0     3975.0       143.69  23.67   43000.0           36      major_purchase
 1     4800.0       168.81  11.37   80000.0           36               other
 2     8400.0       268.85   8.76   50000.0           36         credit_card
 3    16000.0       531.36  16.89  179000.0           36         credit_card
 4    10000.0       315.17  22.38   31000.0           36  debt_consolidation,
 0    0
 1    0
 2    0
 3    0
 4    0
 Name: is_default, dtype: int64)

In [None]:
#split data intro training and test sets -- w/ 20% of data to test and the rest to training

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42,stratify=y)

X_train.shape, X_test.shape

((65168, 6), (16292, 6))

In [None]:
#scale num features, one-hot-encode cats
# applies transformations

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    remainder="drop"
)

In [None]:
#baseline model - logistic regression

#feed data into log reg model

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

log_reg_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("clf", LogisticRegression(max_iter=1000, class_weight="balanced")),
    ]
)

In [None]:
# YOU HAVE TO RUN THIS CELL

log_reg_model.fit(X_train, y_train) #check if it works!!

0,1,2
,steps,"[('preprocess', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [None]:
#eval log reg model

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

y_pred = log_reg_model.predict(X_test)
y_proba = log_reg_model.predict_proba(X_test)[:, 1]

print("Accuracy:     ", accuracy_score(y_test, y_pred)) #correctly predicts about 66% of loans overall
print("Precision (1):", precision_score(y_test, y_pred)) #low precision when predicting default (1) -- only right 18% of the time
print("Recall (1):   ", recall_score(y_test, y_pred)) #catches about 50% of actual defaults -- not that great
print("F1 (1):       ", f1_score(y_test, y_pred)) #not great -- misses a lot of defaults + low-precision
print("ROC-AUC:      ", roc_auc_score(y_test, y_proba)) #0.64 -- okayish

Accuracy:      0.6581144119813406
Precision (1): 0.1818670489159649
Recall (1):    0.5027241208519069
F1 (1):        0.26710526315789473
ROC-AUC:       0.6419456555561791


In [37]:
#try random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

rf_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("clf", RandomForestClassifier(
            n_estimators=300,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            random_state=42,
            class_weight="balanced_subsample"
        )),
    ]
)

rf_model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
#eval rf

y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]

print("RF Accuracy:     ", accuracy_score(y_test, y_pred_rf)) # high accuracy, but 12% are acc defaults in our df so it could be guessing
print("RF Precision (1):", precision_score(y_test, y_pred_rf)) # right abt predicting defaults 24% of the time
print("RF Recall (1):   ", recall_score(y_test, y_pred_rf)) # only find 1% of defaulst -- not good!! -- rf is conservative bc we have imbalanced classes
print("RF F1 (1):       ", f1_score(y_test, y_pred_rf)) # same thing -- bad at finding defaults
print("RF ROC-AUC:      ", roc_auc_score(y_test, y_proba_rf)) # 0.6 is still not good

RF Accuracy:      0.8733120549963173
RF Precision (1): 0.24719101123595505
RF Recall (1):    0.010896483407627538
RF F1 (1):        0.020872865275142316
RF ROC-AUC:       0.5978372212388393


In [None]:
# lr vs rf -- add to report
lr_vs_rf = pd.DataFrame({
    "model": ["Logistic Regression", "Random Forest"],
    "accuracy": [0.6581, 0.8733],
    "precision_1": [0.1819, 0.2472],
    "recall_1": [0.5027, 0.0109],
    "f1_1": [0.2671, 0.0209],
    "roc_auc": [0.6419, 0.5979],
})
lr_vs_rf

Unnamed: 0,model,accuracy,precision_1,recall_1,f1_1,roc_auc
0,Logistic Regression,0.6581,0.1819,0.5027,0.2671,0.6419
1,Random Forest,0.8733,0.2472,0.0109,0.0209,0.5979


In [58]:
# try gradient boosting 

from sklearn.ensemble import HistGradientBoostingClassifier

gb_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("clf", HistGradientBoostingClassifier(
            max_depth=6,
            learning_rate=0.1,
            max_iter=300,
            random_state=42,
        )),
    ]
)

gb_model.fit(X_train, y_train)

y_pred_gb = gb_model.predict(X_test)
y_proba_gb = gb_model.predict_proba(X_test)[:, 1]

print("GB Accuracy:     ", accuracy_score(y_test, y_pred_gb))
print("GB Precision (1):", precision_score(y_test, y_pred_gb)) 
print("GB Recall (1):   ", recall_score(y_test, y_pred_gb)) 
print("GB F1 (1):       ", f1_score(y_test, y_pred_gb)) 
print("GB ROC-AUC:      ", roc_auc_score(y_test, y_proba_gb)) 

GB Accuracy:      0.8758900073655782
GB Precision (1): 0.2
GB Recall (1):    0.0004952947003467063
GB F1 (1):        0.0009881422924901185
GB ROC-AUC:       0.6568049997385241


In [None]:
# try gradient boosting - weighted

from sklearn.ensemble import HistGradientBoostingClassifier

gb_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("clf", HistGradientBoostingClassifier(
            max_depth=6,
            learning_rate=0.1,
            max_iter=300,
            random_state=42,
            class_weight={0:1, 1:10}   # added to make it less imbalanced
        )),
    ]
)

gb_model.fit(X_train, y_train)

y_pred_gb = gb_model.predict(X_test)
y_proba_gb = gb_model.predict_proba(X_test)[:, 1]

print("GB Accuracy:     ", accuracy_score(y_test, y_pred_gb)) #not useful
print("GB Precision (1):", precision_score(y_test, y_pred_gb)) #correct 15.96% of the time, flags many loans as risky, but only some default
print("GB Recall (1):   ", recall_score(y_test, y_pred_gb)) #finds 78.85% of all real defaults -- way better than other models -- most important metric
print("GB F1 (1):       ", f1_score(y_test, y_pred_gb)) # catches defaults, but also over-flags many non-default loans
print("GB ROC-AUC:      ", roc_auc_score(y_test, y_proba_gb)) # better than the otehrs

GB Accuracy:      0.4593665602749816
GB Precision (1): 0.15963100371001704
GB Recall (1):    0.7885091629519564
GB F1 (1):        0.26551034022681785
GB ROC-AUC:       0.6568134148555167


In [57]:
# gb fix threshold

#try
thres = [0.5, 0.4, 0.3, 0.2, 0.1]

rows = []
for t in thres:
    y_pred_thr = (y_proba_gb >= t).astype(int)
    rows.append({
        "threshold": t,
        "f1_1": f1_score(y_test, y_pred_thr),
    })

res = pd.DataFrame(rows)
res

# 0.5 is best so dont change

Unnamed: 0,threshold,f1_1
0,0.5,0.26551
1,0.4,0.246116
2,0.3,0.22959
3,0.2,0.222112
4,0.1,0.220776


In [59]:
# lr v.s rf v.s gb

versus = pd.DataFrame({
    "model": ["Logistic Regression","Random Forest","Gradient Boosting (unweighted)","Gradient Boosting (weighted)"],
    "accuracy": [0.6581,0.8733,0.8758,0.4593],
    "precision_1": [0.1819,0.2472,0.2,0.1596],
    "recall_1": [0.5027,0.0109,0.0004952,0.7885],
    "f1_1": [0.2671,0.0209,0.0009881,0.2655],
    "roc_auc": [0.6419,0.5979,0.6568,0.6568]
})

versus


Unnamed: 0,model,accuracy,precision_1,recall_1,f1_1,roc_auc
0,Logistic Regression,0.6581,0.1819,0.5027,0.2671,0.6419
1,Random Forest,0.8733,0.2472,0.0109,0.0209,0.5979
2,Gradient Boosting (unweighted),0.8758,0.2,0.000495,0.000988,0.6568
3,Gradient Boosting (weighted),0.4593,0.1596,0.7885,0.2655,0.6568


In [None]:
# df["is_default"].value_counts()

is_default
0    71365
1    10095
Name: count, dtype: int64

In [None]:
# id_cols = ["loan_id", "borrower_id"]
# target_col = "is_default"

# numeric_features = [
#     "loan_amnt",
#     "installment",
#     "dti",
#     "income",
#     "term_months"
# ]

# categorical_features = [
#     "purpose"
# ]

In [None]:
#df[target_col].isna().sum()

np.int64(0)

In [None]:
#df[numeric_features + categorical_features].isna().sum() # check missing vals
# NO MISSING VALS!!

loan_amnt      0
installment    0
dti            0
income         0
term_months    0
purpose        0
dtype: int64

In [None]:
#check income
# df["income"].describe()

# df[df["income"] > 500000].head()

#(df["income"] > 200000).sum()
#(df["income"] > 300000).sum()
#(df["income"] > 400000).sum()


np.int64(1005)

In [None]:
#df[df["loan_to_value_ratio"].notna()].head()
#df["loan_to_value_ratio"].notna().sum()
#df[df["loan_purpose"].notna()].head()
#df["loan_purpose"].notna().sum()

Unnamed: 0,loan_id,borrower_id,is_default,loan_amnt,installment,dti,loan_to_value_ratio,income,term_months,purpose,loan_purpose,derived_loan_product_type


In [None]:
#pd.read_sql("SELECT COUNT(*) FROM Accepted_Loans;", engine)
#pd.read_sql("SELECT COUNT(*) FROM Accepted_Loans WHERE loan_to_value_ratio IS NOT NULL;",engine)

Unnamed: 0,count
0,281877


In [2]:
df.head()

Unnamed: 0,loan_id,borrower_id,is_default,loan_amnt,installment,dti,loan_to_value_ratio,income,term_months,purpose,loan_purpose,derived_loan_product_type
0,21700773,10317,0,10000.0,315.17,22.38,,,36,debt_consolidation,,
1,20358845,18380,1,5200.0,193.79,14.28,,,36,debt_consolidation,,
2,72706075,2942,0,6500.0,199.19,10.69,,,36,debt_consolidation,,
3,42494503,46587,0,20000.0,447.83,21.82,,,60,credit_card,,
4,133947865,37247,0,2100.0,64.07,19.41,,,36,major_purchase,,
