In [6]:
#we first import tools that will be required for this project
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    confusion_matrix
)

In [37]:
data= pd.read_csv("credit_risk_dataset.csv") #this is to rename the dataset 

In [38]:
data_new = data.dropna().reset_index(drop=True) #to drop the data with na/missing values 


In [51]:
data_new.isna().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [57]:
#in this line of code i was defining the loan_status  column and features
X = data_new.drop(columns=["loan_status"])
y = data_new["loan_status"]

In [58]:
# to handle categorical or qualitative variables, we convert those variables to numbers to make the work easier
# However, in our data, it is not necessary to encode the column "loan_status" since it is already a quantitative variable; this step is for the flexibility  of the code so that it can be reproducible 
X= pd.get_dummies(X , drop_first =True)

In [59]:
#this is where  we split the train and test data
X_train, X_test , y_train, y_test = train_test_split( X, y, test_size =0.2 , random_state=42 , stratify=y)
# I have decided to allocate 20% of the data to testing, 80% to training 
#stratify parameter was used to balance the data set and obtain a properly distributed data.

In [60]:
#scaling the numerical values since logistic regression works better with scaling

X_train_scaled = StandardScaler().fit_transform(X_train)
X_test_scaled = StandardScaler().fit_transform(X_test)


In [61]:
#This is where I train the model using logistic regression 
model = LogisticRegression(max_iter = 1000)
model.fit(X_train_scaled, y_train)

In [62]:
print(X.shape, y.shape) 

(28638, 22) (28638,)


In [63]:
#predictions of the model 
y_pred= model.predict(X_test_scaled)
y_probability = model.predict_proba(X_test_scaled)[:, 1]

In [64]:
#This is where i produce the ROC AUC report
# This part determines if my model separates defaulters from non-defaulters well enough 
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_probability))

#High recall = you catch more risky borrowers

              precision    recall  f1-score   support

           0       0.88      0.95      0.91      4487
           1       0.74      0.54      0.62      1241

    accuracy                           0.86      5728
   macro avg       0.81      0.74      0.77      5728
weighted avg       0.85      0.86      0.85      5728

ROC AUC: 0.8617057747810085


In [65]:
#Producing a confusion matrix to see how well the model performs 
confusion_matrix(y_test, y_pred)

array([[4258,  229],
       [ 577,  664]])

In [68]:
#Risk score 
data_test = X_test.copy()
data_test["default_probability"] = y_probability
data_test[["default_probability"]].head()

#we see that Higher interest rates and higher loan_to_income ratios significantly increase default risk, while longer credit history reduces risk.

Unnamed: 0,default_probability
17294,0.256043
18491,0.021039
4734,0.673339
23099,0.024579
7584,0.01154
