# Feature Engineering for Credit Risk Modeling

## Objective
The goal of this notebook is to transform raw borrower information into meaningful behavioral features that reflect repayment capacity and credit risk.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Load the raw LendingClub export.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which is what triggers the mixed-type DtypeWarning.
# NOTE(review): presumably that is the warning left in this cell's saved output - confirm.
df = pd.read_csv("../data/raw/lendingclub.csv", low_memory=False)

  df = pd.read_csv("../data/raw/lendingclub.csv")


In [3]:
# Only these two loan statuses are kept; rows with any other status are dropped.
valid_status = ["Fully Paid", "Charged Off"]
has_valid_status = df["loan_status"].isin(valid_status)
df = df[has_valid_status].copy()

# Binary target: 1 when the loan was charged off, 0 when it was fully paid.
df["default"] = df["loan_status"].eq("Charged Off").astype(int)

In [4]:
# Columns carried forward into feature engineering; everything else is dropped.
model_columns = [
    # loan terms
    "loan_amnt", "term", "int_rate",
    # borrower income and debt load
    "annual_inc", "dti",
    # credit profile
    "fico_range_low", "fico_range_high", "revol_util",
    # employment history
    "emp_length",
    # target created from loan_status
    "default",
]
df = df[model_columns]


In [5]:
# Pull the leading number out of the term text and store it as an integer.
# expand=False makes str.extract return a Series for the single capture group;
# the default returns a one-column DataFrame, which only works here through
# implicit frame-to-column coercion on assignment.
df["term"] = df["term"].str.extract(r"(\d+)", expand=False).astype(int)

In [6]:
# Collapse the reported FICO band into its midpoint, then drop the two bounds.
fico_bounds = ["fico_range_low", "fico_range_high"]
df["fico_avg"] = df["fico_range_low"].add(df["fico_range_high"]).div(2)
df = df.drop(columns=fico_bounds)

In [7]:
# Mark a reported income of exactly 0 as missing, so the loan_amnt / annual_inc
# ratio computed from it is NaN rather than a division by zero.
zero_income = df["annual_inc"].eq(0)
df.loc[zero_income, "annual_inc"] = np.nan


In [8]:
# Loan size relative to annual income; NaN wherever annual_inc is missing.
df["loan_to_income"] = df["loan_amnt"].div(df["annual_inc"])


In [9]:
# Cap revolving utilisation at 100; values above the cap are set to the cap.
revol_util_cap = 100
df["revol_util"] = df["revol_util"].clip(upper=revol_util_cap)

In [10]:
# Fill missing revolving utilisation with the column median.
revol_util_median = df["revol_util"].median()
df["revol_util"] = df["revol_util"].fillna(revol_util_median)

In [11]:
# Employment-length labels mapped to whole years ("< 1 year" -> 0, "10+ years" -> 10).
emp_length_years = {
    "< 1 year": 0,
    "1 year": 1,
    **{f"{years} years": years for years in range(2, 10)},
    "10+ years": 10,
}

# Series.map with a dict builds a numeric column directly. The previous
# Series.replace call relied on pandas silently downcasting the object column
# after replacement, which is deprecated and emits a FutureWarning.
emp_length_raw = df["emp_length"]
emp_length_num = emp_length_raw.map(emp_length_years)

# map() turns unknown labels into NaN; fail loudly instead, so an unexpected
# label is not silently treated as missing and imputed later.
unmapped = emp_length_raw.notna() & emp_length_num.isna()
if unmapped.any():
    raise ValueError(
        f"Unexpected emp_length values: {sorted(emp_length_raw[unmapped].astype(str).unique())}"
    )

# Missing values stay NaN here.
df["emp_length"] = pd.to_numeric(emp_length_num)

  df["emp_length"] = df["emp_length"].replace({


In [12]:
# Fill the remaining gaps in each numeric feature with that feature's own median.
for column in ("annual_inc", "dti", "emp_length", "loan_to_income"):
    column_median = df[column].median()
    df[column] = df[column].fillna(column_median)

In [13]:
# Sanity check after imputation: column dtypes, remaining null counts, summary stats.
# A cell only renders its last expression, so the first two results were being
# computed and thrown away; display() shows them explicitly.
display(df.dtypes, df.isna().sum())
df.describe()

Unnamed: 0,loan_amnt,term,int_rate,annual_inc,dti,revol_util,emp_length,default,fico_avg,loan_to_income
count,1345310.0,1345310.0,1345310.0,1345310.0,1345310.0,1345310.0,1345310.0,1345310.0,1345310.0,1345310.0
mean,14419.97,41.7902,13.23962,76265.08,18.28248,51.79824,5.967865,0.1996261,698.1851,0.3973044
std,8717.051,10.26832,4.768716,69914.18,11.1589,24.47533,3.58185,0.3997195,31.85284,70.8192
min,500.0,36.0,5.31,1.0,-1.0,0.0,0.0,0.0,627.0,0.0001714286
25%,8000.0,36.0,9.75,45900.0,11.79,33.5,3.0,0.0,672.0,0.1246903
50%,12000.0,36.0,12.74,65000.0,17.61,52.2,6.0,0.0,692.0,0.2
75%,20000.0,36.0,15.99,90000.0,24.05,70.7,10.0,0.0,712.0,0.2909091
max,40000.0,60.0,30.99,10999200.0,999.0,100.0,10.0,1.0,847.5,40000.0


In [15]:
# Negative debt-to-income values are overwritten with the column median
# (the median is taken over the column as it stands, before the overwrite).
dti_median = df["dti"].median()
negative_dti = df["dti"] < 0
df.loc[negative_dti, "dti"] = dti_median

In [16]:
# Cap the loan-to-income ratio at 10 to rein in the extreme values seen in the summary.
loan_to_income_cap = 10
df["loan_to_income"] = df["loan_to_income"].clip(upper=loan_to_income_cap)

In [17]:
# Re-check dtypes, null counts and summary stats after the outlier fixes.
# A cell only renders its last expression, so the first two results were being
# computed and thrown away; display() shows them explicitly.
display(df.dtypes, df.isna().sum())
df.describe()

Unnamed: 0,loan_amnt,term,int_rate,annual_inc,dti,revol_util,emp_length,default,fico_avg,loan_to_income
count,1345310.0,1345310.0,1345310.0,1345310.0,1345310.0,1345310.0,1345310.0,1345310.0,1345310.0,1345310.0
mean,14419.97,41.7902,13.23962,76265.08,18.28251,51.79824,5.967865,0.1996261,698.1851,0.2162308
std,8717.051,10.26832,4.768716,69914.18,11.15888,24.47533,3.58185,0.3997195,31.85284,0.1542117
min,500.0,36.0,5.31,1.0,0.0,0.0,0.0,0.0,627.0,0.0001714286
25%,8000.0,36.0,9.75,45900.0,11.79,33.5,3.0,0.0,672.0,0.1246903
50%,12000.0,36.0,12.74,65000.0,17.61,52.2,6.0,0.0,692.0,0.2
75%,20000.0,36.0,15.99,90000.0,24.05,70.7,10.0,0.0,712.0,0.2909091
max,40000.0,60.0,30.99,10999200.0,999.0,100.0,10.0,1.0,847.5,10.0


In [18]:
# Persist the engineered feature table for downstream use.
output_path = Path("../data/processed/model_data.csv")
# to_csv does not create missing directories; make sure the target folder exists
# so the final cell does not fail on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

### Alignment with EDA

Feature selection and transformations were informed by EDA insights to ensure consistency between observed risk patterns and engineered behavioral features.