# Credit Risk — PD Modeling (J.P. Morgan prototype)
- Notebook author: Jaime (Data Scientist)
- Dataset: Task 3 and 4_Loan_Data.csv
- Reference: CREDIT RISK ANALYSIS TASK.docx.

In [1]:
# ## 0. Setup and imports
import warnings
# NOTE(review): with no category/module filter this suppresses every warning
# raised for the rest of the session, not just noisy ones — consider narrowing
# it to the specific warnings that are known to be harmless.
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report, precision_recall_curve)
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# xgboost
from xgboost import XGBClassifier



## 1. Load data and quick inspection


In [3]:
# Relative path to the loan CSV; change it if the file lives elsewhere.
DATA_PATH = 'Task 3 and 4_Loan_Data.csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Report the frame's dimensions, then preview the first rows as the cell output.
print("Rows, columns:", df.shape)
df.head(n=5)

Rows, columns: (10000, 8)


Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [4]:
# Quick datatype/missingness summary.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
df.info()
display(df.describe().T)
display(df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               10000 non-null  int64  
 1   credit_lines_outstanding  10000 non-null  int64  
 2   loan_amt_outstanding      10000 non-null  float64
 3   total_debt_outstanding    10000 non-null  float64
 4   income                    10000 non-null  float64
 5   years_employed            10000 non-null  int64  
 6   fico_score                10000 non-null  int64  
 7   default                   10000 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 625.1 KB


None

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer_id,10000.0,4974577.0,2293890.0,1000324.0,2977661.0,4989502.0,6967210.0,8999789.0
credit_lines_outstanding,10000.0,1.4612,1.743846,0.0,0.0,1.0,2.0,5.0
loan_amt_outstanding,10000.0,4159.677,1421.399,46.78397,3154.235,4052.377,5052.898,10750.68
total_debt_outstanding,10000.0,8718.917,6627.165,31.65273,4199.836,6732.407,11272.26,43688.78
income,10000.0,70039.9,20072.21,1000.0,56539.87,70085.83,83429.17,148412.2
years_employed,10000.0,4.5528,1.566862,0.0,3.0,5.0,6.0,10.0
fico_score,10000.0,637.5577,60.65791,408.0,597.0,638.0,679.0,850.0
default,10000.0,0.1851,0.3883981,0.0,0.0,0.0,0.0,1.0


customer_id                 0
credit_lines_outstanding    0
loan_amt_outstanding        0
total_debt_outstanding      0
income                      0
years_employed              0
fico_score                  0
default                     0
dtype: int64

## 2. ETL / Data cleaning
- Drop `customer_id`.
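- Impute numeric missing values with the median, if any (the summary above shows none).
- Basic sanity checks: strictly positive income (rows with income <= 0 are dropped), and FICO within the expected 300-850 range (printed for inspection).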


In [5]:
# Remove the customer identifier; errors='ignore' makes this a no-op when the
# column is absent.
df = df.drop(columns=['customer_id'], errors='ignore')

# Sanity checks: show the observed FICO range and the smallest income.
print("FICO min/max:", df['fico_score'].min(), df['fico_score'].max())
print("Income min:", df['income'].min())

# Keep only rows whose income is strictly positive; all other rows are dropped
# (nothing is flagged or imputed here).
df = df.loc[df['income'].gt(0)].copy()


FICO min/max: 408 850
Income min: 1000.0


## 3. Feature engineering
Create:
- debt_to_income = total_debt_outstanding / income
- loan_to_income = loan_amt_outstanding / income
- fico_bucket (categorical: deep_subprime < 580, subprime 580-639, near_prime 640-699, prime >= 700)


In [None]:
# Leverage ratios: each outstanding balance expressed as a fraction of income.
df['debt_to_income'] = df['total_debt_outstanding'].div(df['income'])
df['loan_to_income'] = df['loan_amt_outstanding'].div(df['income'])

# FICO bucket
def fico_bucket(f):
    """Map a FICO score to a coarse credit-quality label.

    Buckets (upper bounds are exclusive):
        f < 580         -> 'deep_subprime'
        580 <= f < 640  -> 'subprime'
        640 <= f < 700  -> 'near_prime'
        f >= 700        -> 'prime'

    Args:
        f: A single FICO score (scalar), or a missing value (None / NaN).

    Returns:
        The bucket label as a string, or None when the score is missing.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing score would fall through to the most favourable bucket.
    if pd.isna(f):
        return None
    thresholds = ((580, 'deep_subprime'), (640, 'subprime'), (700, 'near_prime'))
    for upper_bound, label in thresholds:
        if f < upper_bound:
            return label
    return 'prime'
# Label every row with its FICO bucket (element-wise over the score column).
df['fico_bucket'] = df['fico_score'].map(fico_bucket)

# Quick check: preview the score next to the engineered columns.
df.loc[:, ['fico_score', 'fico_bucket', 'debt_to_income', 'loan_to_income']].head()


Unnamed: 0,fico_score,fico_bucket,debt_to_income,loan_to_income
0,605,subprime,0.050173,0.066909
1,572,deep_subprime,0.308789,0.07351
2,602,subprime,0.030787,0.051058
3,612,subprime,0.033645,0.064105
4,631,subprime,0.075435,0.057395


## 4. Prepare features and split data
- Use numeric features: credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding,
  income, years_employed, fico_score, debt_to_income, loan_to_income
- Encode fico_bucket if needed (one-hot) — I'll use numeric features only for tree models (they don't need scaling).
- Stratified train/test split on `default`.