In [9]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# 1. Data Download, Cleaning & Exploration

In [10]:
df = pd.read_csv(r"C:\Users\MATTEO IENTILE\Desktop\PK\Forage JS\JP Quantitative Research\Task 3 and 4_Loan_Data.csv")

We have to predict the **probability of default**, therefore the problem lays into a **classification problem**. Since we have around 10k rows, we can adopt several Machine Learning algorithms:
- logistic regression
- decision tree/random forest
- SVM
- XGBoost

In [41]:
df.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [11]:
df.describe()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4974577.0,1.4612,4159.677034,8718.916797,70039.901401,4.5528,637.5577,0.1851
std,2293890.0,1.743846,1421.399078,6627.164762,20072.214143,1.566862,60.657906,0.388398
min,1000324.0,0.0,46.783973,31.652732,1000.0,0.0,408.0,0.0
25%,2977661.0,0.0,3154.235371,4199.83602,56539.867903,3.0,597.0,0.0
50%,4989502.0,1.0,4052.377228,6732.407217,70085.82633,5.0,638.0,0.0
75%,6967210.0,2.0,5052.898103,11272.26374,83429.166133,6.0,679.0,0.0
max,8999789.0,5.0,10750.67781,43688.7841,148412.1805,10.0,850.0,1.0


In [12]:
df.corr(numeric_only=True)

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
customer_id,1.0,0.006729,-0.013857,0.003541,-0.008064,-0.008098,0.008044,0.006927
credit_lines_outstanding,0.006729,1.0,0.080249,0.85221,0.022272,-0.0879,-0.258177,0.862815
loan_amt_outstanding,-0.013857,0.080249,1.0,0.397403,0.835815,-0.158416,-0.031373,0.098978
total_debt_outstanding,0.003541,0.85221,0.397403,1.0,0.394397,-0.174353,-0.232246,0.758868
income,-0.008064,0.022272,0.835815,0.394397,1.0,0.001814,-0.010528,0.016309
years_employed,-0.008098,-0.0879,-0.158416,-0.174353,0.001814,1.0,0.255873,-0.284506
fico_score,0.008044,-0.258177,-0.031373,-0.232246,-0.010528,0.255873,1.0,-0.324515
default,0.006927,0.862815,0.098978,0.758868,0.016309,-0.284506,-0.324515,1.0


In [13]:
df.dtypes

customer_id                   int64
credit_lines_outstanding      int64
loan_amt_outstanding        float64
total_debt_outstanding      float64
income                      float64
years_employed                int64
fico_score                    int64
default                       int64
dtype: object

In [14]:
df.isnull().sum()

customer_id                 0
credit_lines_outstanding    0
loan_amt_outstanding        0
total_debt_outstanding      0
income                      0
years_employed              0
fico_score                  0
default                     0
dtype: int64

# 2. Training, Validation and Test set split

In [15]:
from sklearn.model_selection import train_test_split

#train + rest (validation+test)
df_train, df_rest = train_test_split(
    df, test_size=0.2, random_state=42)  

# validation + test from temp
df_val, df_test = train_test_split(
    df_rest, test_size=0.5, random_state=42)  


In [16]:
input_cols = ["credit_lines_outstanding", 
              "loan_amt_outstanding", 
              "total_debt_outstanding",
              "income",
              "years_employed",
              "fico_score"]

X_train = df_train[input_cols]
y_train = df_train["default"]

X_val = df_val[input_cols]
y_val = df_val["default"]

X_test = df_test[input_cols]
y_test = df_test["default"]

# 3. Models

### Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression

In [20]:
model = LogisticRegression(solver="liblinear", penalty="l2") #this is one of the solver available
model.fit(X_train, y_train)

train_preds = model.predict(X_train)
val_preds = model.predict(X_val)

In [21]:
train_preds = model.predict(X_train)
val_preds = model.predict(X_val)

In [22]:
from sklearn.metrics import accuracy_score

print("Train accuracy - Logistic Regression:", accuracy_score(y_train, train_preds))
print("Validation accuracy - Logistic Regression:", accuracy_score(y_val, val_preds))

Train accuracy - Logistic Regression: 0.982625
Validation accuracy - Logistic Regression: 0.978


## Decision Tree

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [24]:
model = DecisionTreeClassifier(random_state=42, max_depth=7, max_leaf_nodes=128)
model.fit(X_train, y_train)

In [25]:
train_preds = model.predict(X_train)
val_preds = model.predict(X_val)

In [26]:
print("Train accuracy - Decision Tree:", accuracy_score(y_train, train_preds))
print("Validation accuracy - Decision Tree:", accuracy_score(y_val, val_preds))

Train accuracy - Decision Tree: 0.998625
Validation accuracy - Decision Tree: 0.992


### Random Forest

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
model = RandomForestClassifier(n_jobs=-1, 
                               random_state=42, 
                               n_estimators=100,
                               max_features=10,
                               max_depth=15,
                               class_weight={0: 1, 1: 2}
                              )
model.fit(X_train, y_train)

In [29]:
train_preds = model.predict(X_train)
val_preds = model.predict(X_val)

In [30]:
print("Train accuracy - Random Forest:", accuracy_score(y_train, train_preds))
print("Validation accuracy - Random Forest:", accuracy_score(y_val, val_preds))

Train accuracy - Random Forest: 1.0
Validation accuracy - Random Forest: 0.993


### XGBoost

In [31]:
from xgboost import XGBClassifier

In [32]:
model = XGBClassifier(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="logloss",
    scale_pos_weight=2
)

model.fit(X_train, y_train)

In [33]:
train_preds = model.predict(X_train)
val_preds = model.predict(X_val)

In [34]:
print("Train accuracy - XGBoost:", accuracy_score(y_train, train_preds))
print("Validation accuracy - XGBoost:", accuracy_score(y_val, val_preds))

Train accuracy - XGBoost: 0.998125
Validation accuracy - XGBoost: 0.994


# 4. Function

In [35]:
#MODEL SELECTED
model = RandomForestClassifier(n_jobs=-1, 
                               random_state=42, 
                               n_estimators=100,
                               max_features=10,
                               max_depth=15,
                               class_weight={0: 1, 1: 2}
                              )
model.fit(X_train, y_train)

In [36]:
#Accuracy
test_preds = model.predict(X_val)
print("Validation accuracy:", accuracy_score(y_test, test_preds))

Validation accuracy: 0.715


In [37]:
def predict(new_input, loan_price):
    X_new_input = pd.DataFrame([new_input])
    prediction = model.predict(X_new_input)[0]
    prob = model.predict_proba(X_new_input)[0]
    prob_default = prob[1]
    expected_loss = loan_price * 0.9 * prob_default
    
    print(f"Prediction (0=No default, 1=Default): {prediction}")
    print(f"Probability of default: {prob_default * 100:.2f}%")
    print(f"Expected loss (assuming 90% loss given default): ${expected_loss:,.2f}")


In [39]:
new_input = {"credit_lines_outstanding": 0,
                "loan_amt_outstanding": 5221.545193,
                "total_debt_outstanding": 3915.471226,
                "income": 25000.38546,
                "years_employed" : 5,
                "fico_score": 450
}

In [40]:
predict(new_input, 100000)

Prediction (0=No default, 1=Default): 0
Probability of default: 1.00%
Expected loss (assuming 90% loss given default): $900.00
