# **Importing Libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Credit Risk Analysis

In [2]:
loan = pd.read_csv('Task 3 and 4_Loan_Data.csv')
loan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               10000 non-null  int64  
 1   credit_lines_outstanding  10000 non-null  int64  
 2   loan_amt_outstanding      10000 non-null  float64
 3   total_debt_outstanding    10000 non-null  float64
 4   income                    10000 non-null  float64
 5   years_employed            10000 non-null  int64  
 6   fico_score                10000 non-null  int64  
 7   default                   10000 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 625.1 KB


**Train Test split**

*The task assumes the Recovery rate at 10%.
Split the data into train, test sets. The data contains 10000 entries with 8* *columns, for such size of data, there are two validation method:*
1. Train/validation/test split
2. K-fold cross validation
*I choose to use k-fold cross validation instead of a validation set in order to lose data to a validation set.*

In [3]:
recoverRate = 0.1

In [4]:
#1. spliting the data into train and test data: using 60/40 -- 60% training & 40%
X = loan.drop(['default'], axis=1)
y = loan['default'] # Target variable: default (1 if defaulted, 0 otherwise)

#spliting data into training and testing sets
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size= 0.4, random_state=34)

**Standardization**

*Purpose of standardization:*

1. Feature standardization:

    mean centering

    scaling to unit variance
2. Improving algo performance
3. Equal contribution with each feature

In [5]:
scaler = StandardScaler()
xtrain_scaled = scaler.fit_transform(Xtrain)
xtest_scaled = scaler.transform(Xtest)

In [6]:
print(xtrain_scaled) #scaled training data

[[ 0.99575889 -0.25665542 -0.45989617 ... -0.09892904  0.29460452
   0.42742093]
 [ 1.34208885 -0.82976019  0.45617466 ...  0.68214917 -0.3439891
   1.10305064]
 [-0.4741853  -0.25665542  0.81942656 ...  0.73755359  0.29460452
   0.92178413]
 ...
 [-1.42944407 -0.82976019 -1.36912535 ... -1.75157641 -0.3439891
  -0.74257198]
 [-1.5324172  -0.25665542  1.15293777 ...  1.40989687  0.93319814
   0.32854829]
 [-1.3045757  -0.82976019  0.06813382 ...  0.39881893  0.93319814
   1.69628648]]


**Random Forest**

In [7]:
# try random forest classifier to predict default
rfClassifier = RandomForestClassifier(random_state=34)
rfClassifier.fit(xtrain_scaled, ytrain)

# predicting default probabilities on the test set
yPredictP_rf = rfClassifier.predict_proba(xtest_scaled)[:,1]

# Evaluating the model's performance using ROC-AUC score
roc_auc_rf = roc_auc_score(ytest, yPredictP_rf)

In [8]:
roc_auc_rf

0.9997592212127956

*ROC_AUC score is exceptional for the Random Forest Classifier, where the ROC_AUC ranges from 0 to 1.*

**Decision Tree**

In [9]:
# try a different method: Decision Tree
from sklearn.tree import DecisionTreeClassifier

In [10]:
#training the decision tree classifier
dtClassifier = DecisionTreeClassifier(random_state=34)
dtClassifier.fit(xtrain_scaled, ytrain)

# predicting default prob on test set
yPredictP_dt = dtClassifier.predict_proba(xtest_scaled)[:,1]

#evaluating the model's performance using ROC_AUC socre
roc_auc_dt = roc_auc_score(ytest, yPredictP_dt)
roc_auc_dt

0.9900602418279679

**Gradient Boosting**

In [11]:
# trying gradient boosting
from sklearn.ensemble import GradientBoostingClassifier

# training
gbClassifier = GradientBoostingClassifier(random_state=34)
gbClassifier.fit(xtrain_scaled, ytrain)

# predicting
yPredictP_gb = gbClassifier.predict_proba(xtest_scaled)[:,1]

#evaluation
roc_auc_gb = roc_auc_score(ytest, yPredictP_gb)
roc_auc_gb

0.9997618851483137

**XGBoost**

In [12]:
import xgboost as xgb
xgbClassifier = xgb.XGBClassifier(
    objective = 'binary:logistic',
    eval_metric = 'auc',
    use_label_encoder = False, # To avoid a deprecation warning
    random_state = 34
 )

xgbClassifier.fit(xtrain_scaled, ytrain)

Parameters: { "use_label_encoder" } are not used.



In [13]:
roc_auc_xgb = roc_auc_score(ytest, xgbClassifier.predict_proba(xtest_scaled)[:, 1])

roc_auc_xgb

0.9999213114431604

**Cross Validation**

In [14]:
# cross-validation to detect the overfitting and a more robust estimate of a model's performance
from sklearn.model_selection import cross_val_score

# cross validation for Random Forest
cv_rf = cross_val_score(rfClassifier, xtrain_scaled, ytrain, cv=5, scoring='roc_auc')
print(f"Random Forest - CV Mean: {cv_rf.mean()}, CV Std: {cv_rf.std()}")

Random Forest - CV Mean: 0.9996887755102042, CV Std: 0.00022858185679652617


In [15]:
# cross validation for Decision Tree
cv_dt = cross_val_score(dtClassifier, xtrain_scaled, ytrain, cv=5, scoring='roc_auc')
print(f"Decision Tree - DT Mean: {cv_dt.mean()}, DT Std: {cv_dt.std()}")

Decision Tree - DT Mean: 0.9879591836734694, DT Std: 0.005336217958534341


In [16]:
# cross validation for Gradient Boosting
cv_gb = cross_val_score(gbClassifier, xtrain_scaled, ytrain, cv=5, scoring='roc_auc')
print(f"Gradient Boosting - GB Mean: {cv_gb.mean()}, GB Std: {cv_gb.std()}")

Gradient Boosting - GB Mean: 0.9996971243042673, GB Std: 0.0002576679857541509


In [17]:
# cross validation for XGBoost
cv_xgb = cross_val_score(xgbClassifier, xtrain_scaled, ytrain, cv=5, scoring='roc_auc')
print(f"XGBoosting - XGB Mean: {cv_xgb.mean()}, XGB Std: {cv_xgb.std()}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



XGBoosting - XGB Mean: 0.9998293135435992, XGB Std: 0.0001677830418796913


Parameters: { "use_label_encoder" } are not used.



*The cross validation shows that both random forest and gradient boosting hold pretty good result, whose results are very close to 1. Although the random* *forest would give a faster prediction speed, and gradient boosting for the better performance than random forest, the XGBoost would provide a better* *performance and a faster speed because of its optimiztion and parallel processing.*

**Expected Loss**

In [18]:
def predictDefaultP(loanFestures, model, scaler):
    featureScaled = scaler.transform(loanFestures)

    defaultP = model.predict_proba(featureScaled)[:,1]

    return defaultP

In [19]:
prob_default_test = predictDefaultP(Xtest, xgbClassifier, scaler)

Xtest['Prob of default'] = prob_default_test
prob_default_train = predictDefaultP(Xtrain, xgbClassifier,scaler)
Xtrain['Prob of default'] = prob_default_train

def Eloss(DefaultP, loanAmt, recoveryRate = 0.1):

    return DefaultP * loanAmt * (1-recoveryRate)

expected_loss_train = Eloss(prob_default_train, Xtrain['loan_amt_outstanding'].values[0])
Xtrain['expected_loss'] = expected_loss_train
Xtrain

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,Prob of default,expected_loss
7803,7291395,1,3514.373898,5534.786569,68134.16335,5,663,0.000004,0.012883
2912,8086988,0,4808.997289,846.620712,83878.82050,4,704,0.000002,0.005588
8244,3914623,1,5322.357599,10089.678410,84995.64043,5,693,0.000002,0.007766
3695,3469901,1,3808.427293,6372.428393,63076.46622,7,656,0.000002,0.005415
2629,1337395,5,4271.314690,22756.281030,83475.30929,4,438,0.999985,3162.888428
...,...,...,...,...,...,...,...,...,...
5667,4437459,0,1712.912777,1209.698471,29131.62730,4,605,0.000062,0.196178
324,8099678,2,3973.657805,9369.089493,63687.96336,5,725,0.000007,0.023369
3157,1720192,0,2229.419378,3043.967766,34820.76913,4,592,0.000079,0.249983
5993,1483641,1,5793.687316,9207.734040,98548.46301,6,657,0.000002,0.006388


In [21]:
expected_loss_test = Eloss(prob_default_test, Xtest['loan_amt_outstanding'].values[0])
Xtest['expected_loss'] = expected_loss_test
Xtest

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,Prob of default,expected_loss
2390,4254020,1,3033.502993,5459.760802,58145.40500,5,598,0.000012,0.033632
6803,4153925,1,5131.838746,10273.686330,86735.09997,4,575,0.000004,0.011986
3226,7240749,0,1599.932207,1077.329553,26191.38070,2,697,0.000355,0.968765
8630,1771847,0,3939.469863,3022.037280,59835.22980,3,625,0.000015,0.041654
4898,6631761,0,3857.340486,3290.320625,77903.38542,3,634,0.000006,0.017426
...,...,...,...,...,...,...,...,...,...
3506,2397436,3,3669.635064,14239.425930,74670.59076,4,655,0.000358,0.978386
99,1638898,1,3707.461716,4634.160556,57597.45492,5,633,0.000010,0.028008
4431,3679461,0,3286.939510,7242.783127,64700.55843,3,613,0.000008,0.022651
2212,8660570,2,5606.289442,13883.832690,91723.19680,7,657,0.000006,0.015714


**Predicting Probability of Default**

In [22]:
# quantization of FICO score: split FICO scores into bins
def quantizeFICO(fico):
    if 300 <= fico <= 579:
        return 'Poor'
    elif 580 <= fico <= 669:
        return 'Fair'
    elif 670 <= fico <= 739:
        return 'Good'
    elif 740 <= fico <= 799:
        return 'Very good'
    elif 800 <= fico <= 850:
        return 'Excellent'
    else:
        return 'invalid'

In [23]:
# apply quantization to the data:
loan['fico_category'] = loan['fico_score'].apply(quantizeFICO)

In [24]:
loan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               10000 non-null  int64  
 1   credit_lines_outstanding  10000 non-null  int64  
 2   loan_amt_outstanding      10000 non-null  float64
 3   total_debt_outstanding    10000 non-null  float64
 4   income                    10000 non-null  float64
 5   years_employed            10000 non-null  int64  
 6   fico_score                10000 non-null  int64  
 7   default                   10000 non-null  int64  
 8   fico_category             10000 non-null  object 
dtypes: float64(3), int64(5), object(1)
memory usage: 703.3+ KB


In [25]:
Xtest['fico_category'] = Xtest['fico_score'].apply(quantizeFICO)
Xtest

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,Prob of default,expected_loss,fico_category
2390,4254020,1,3033.502993,5459.760802,58145.40500,5,598,0.000012,0.033632,Fair
6803,4153925,1,5131.838746,10273.686330,86735.09997,4,575,0.000004,0.011986,Poor
3226,7240749,0,1599.932207,1077.329553,26191.38070,2,697,0.000355,0.968765,Good
8630,1771847,0,3939.469863,3022.037280,59835.22980,3,625,0.000015,0.041654,Fair
4898,6631761,0,3857.340486,3290.320625,77903.38542,3,634,0.000006,0.017426,Fair
...,...,...,...,...,...,...,...,...,...,...
3506,2397436,3,3669.635064,14239.425930,74670.59076,4,655,0.000358,0.978386,Fair
99,1638898,1,3707.461716,4634.160556,57597.45492,5,633,0.000010,0.028008,Fair
4431,3679461,0,3286.939510,7242.783127,64700.55843,3,613,0.000008,0.022651,Fair
2212,8660570,2,5606.289442,13883.832690,91723.19680,7,657,0.000006,0.015714,Fair


In [26]:
Xtrain['fico_category'] = Xtrain['fico_score'].apply(quantizeFICO)
Xtrain

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,Prob of default,expected_loss,fico_category
7803,7291395,1,3514.373898,5534.786569,68134.16335,5,663,0.000004,0.012883,Fair
2912,8086988,0,4808.997289,846.620712,83878.82050,4,704,0.000002,0.005588,Good
8244,3914623,1,5322.357599,10089.678410,84995.64043,5,693,0.000002,0.007766,Good
3695,3469901,1,3808.427293,6372.428393,63076.46622,7,656,0.000002,0.005415,Fair
2629,1337395,5,4271.314690,22756.281030,83475.30929,4,438,0.999985,3162.888428,Poor
...,...,...,...,...,...,...,...,...,...,...
5667,4437459,0,1712.912777,1209.698471,29131.62730,4,605,0.000062,0.196178,Fair
324,8099678,2,3973.657805,9369.089493,63687.96336,5,725,0.000007,0.023369,Good
3157,1720192,0,2229.419378,3043.967766,34820.76913,4,592,0.000079,0.249983,Fair
5993,1483641,1,5793.687316,9207.734040,98548.46301,6,657,0.000002,0.006388,Fair


In [27]:
from scipy.optimize import minimize

#log likelihood function
def neg_LogLikelihood(rep, bucket):
    mu = rep
    sigma = bucket['Prob of default'].std()
    logLikelihood = -0.5 * np.sum(((bucket['Prob of default'] - mu) ** 2 )/(sigma ** 2)) - len(bucket) * np.log(sigma * np.sqrt(2 * np.pi))
    return -logLikelihood

In [28]:
# Labels of FICO socres
labels = ['Poor', 'Fair', 'Good', 'Very good', 'Excellent']
repValues = {}
for dataset in [Xtest, Xtrain]:
    for label in labels:
        bucket = dataset[dataset['fico_category'] == label]

        if bucket.empty:
            continue

        result = minimize(neg_LogLikelihood, x0= np.mean(bucket['Prob of default']), args=(bucket))
        repValues[label] = result.x[0]

In [29]:
#dynamic programming for rating

ratings = ['A', 'B', 'C', 'D', 'E']

sortedlabels = sorted(repValues.keys(), key=lambda x: repValues[x])

ratingMap = {label: rating for label, rating in zip(sortedlabels, ratings)}

print(ratingMap)

{'Very good': 'A', 'Excellent': 'B', 'Good': 'C', 'Fair': 'D', 'Poor': 'E'}


In [30]:
for label, value in repValues.items():
    print(f"{label}: {value:.4f}")

Poor: 0.4296
Fair: 0.1698
Good: 0.0747
Very good: 0.0261
Excellent: 0.0500
