# Feature Engineering

This notebook prepares the Lending Club dataset for modeling by:
- Separating features and target
- Encoding categorical variables
- Splitting into train and test sets
- Scaling numerical features

## Imports & Load Data

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
df = pd.read_csv("../data/raw/loan_data.csv")
df.head()

Unnamed: 0,credit.policy,purpose,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,not.fully.paid
0,1,debt_consolidation,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,0
1,1,credit_card,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,0
2,1,debt_consolidation,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,0
3,1,debt_consolidation,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,0
4,1,credit_card,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,0


## Seperate Features and Target

In [4]:
X = df.drop("not.fully.paid", axis=1)
y = df["not.fully.paid"]

## Encoding Categorical Feature

In [5]:
X = pd.get_dummies(
    X,
    columns=["purpose"],
    drop_first=True
)


In [19]:
X.head(10)

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
0,1,0.1189,829.1,11.350407,19.48,737,5639.958333,28854,52.1,0,0,0,False,True,False,False,False,False
1,1,0.1071,228.22,11.082143,14.29,707,2760.0,33623,76.7,0,0,0,True,False,False,False,False,False
2,1,0.1357,366.86,10.373491,11.63,682,4710.0,3511,25.6,1,0,0,False,True,False,False,False,False
3,1,0.1008,162.34,11.350407,8.1,712,2699.958333,33667,73.2,1,0,0,False,True,False,False,False,False
4,1,0.1426,102.92,11.299732,14.97,667,4066.0,4740,39.5,0,1,0,True,False,False,False,False,False
5,1,0.0788,125.13,11.904968,16.98,727,6120.041667,50807,51.0,0,0,0,True,False,False,False,False,False
6,1,0.1496,194.02,10.714418,4.0,667,3180.041667,3839,76.8,0,0,1,False,True,False,False,False,False
7,1,0.1114,131.22,11.0021,11.08,722,5116.0,24220,68.6,0,0,0,False,False,False,False,False,False
8,1,0.1134,87.19,11.407565,17.25,682,3989.0,69909,51.1,1,0,0,False,False,False,True,False,False
9,1,0.1221,84.12,10.203592,10.0,707,2730.041667,5630,23.0,1,0,0,False,True,False,False,False,False


## Trainâ€“Test Split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [8]:
y_train.value_counts(normalize=True)
y_test.value_counts(normalize=True)


not.fully.paid
0    0.83977
1    0.16023
Name: proportion, dtype: float64

## Feature Scaling

In [9]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [10]:
X_train_scaled = pd.DataFrame(
    X_train_scaled,
    columns=X_train.columns,
    index=X_train.index
)

X_test_scaled = pd.DataFrame(
    X_test_scaled,
    columns=X_test.columns,
    index=X_test.index
)

In [13]:
X_train_scaled.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
6381,0.491542,0.744534,1.109194,0.696658,0.012171,-0.629324,0.275685,-0.007634,0.951523,-0.262631,1.591971,-0.238688,2.584058,-0.834196,-0.194033,-0.261668,-0.219765,-0.265913
7589,0.491542,-1.779946,-0.795226,0.174696,0.017975,0.689102,0.234302,-0.156335,-0.150797,-0.726077,-0.302667,-0.238688,2.584058,-0.834196,-0.194033,-0.261668,-0.219765,-0.265913
465,0.491542,-0.912273,-0.889878,-0.252212,-0.101001,0.161732,0.699294,0.07816,-0.339668,-0.726077,1.591971,-0.238688,-0.386988,1.198759,-0.194033,-0.261668,-0.219765,-0.265913
5343,0.491542,0.486476,0.096296,1.097154,0.50694,-0.629324,-0.493588,-0.456364,0.08615,-0.262631,-0.302667,-0.238688,-0.386988,-0.834196,-0.194033,-0.261668,-0.219765,-0.265913
8016,-2.034412,0.318178,0.576044,1.172312,-0.333151,-1.288536,-1.555593,-0.50308,-1.613689,1.127705,-0.302667,-0.238688,-0.386988,1.198759,-0.194033,-0.261668,-0.219765,-0.265913


In [14]:
X_test_scaled.head()

Unnamed: 0,credit.policy,int.rate,installment,log.annual.inc,dti,fico,days.with.cr.line,revol.bal,revol.util,inq.last.6mths,delinq.2yrs,pub.rec,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_major_purchase,purpose_small_business
8157,-2.034412,1.212031,0.48023,0.164232,0.399571,-1.684064,-1.078288,-0.17235,0.793558,-0.726077,-0.302667,-0.238688,-0.386988,1.198759,-0.194033,-0.261668,-0.219765,-0.265913
1928,0.491542,-1.596687,-1.263839,-0.185788,0.933514,0.557259,0.080371,0.317847,-0.040908,-0.726077,-0.302667,-0.238688,-0.386988,1.198759,-0.194033,-0.261668,-0.219765,-0.265913
2779,0.491542,0.568756,1.751231,1.477852,-0.7989,0.029889,0.210437,-0.278081,-0.744882,-0.726077,-0.302667,-0.238688,-0.386988,-0.834196,-0.194033,-0.261668,-0.219765,-0.265913
2520,0.491542,0.213459,-1.234983,-2.144838,-0.311387,-0.233796,-0.894542,-0.483821,-1.48663,-0.262631,-0.302667,-0.238688,-0.386988,-0.834196,-0.194033,-0.261668,-0.219765,-0.265913
2601,0.491542,1.27561,0.146309,0.027415,-0.726354,-0.761166,-0.684511,-0.449551,-0.439255,0.200814,-0.302667,-0.238688,-0.386988,-0.834196,-0.194033,3.821634,-0.219765,-0.265913


## Saving The Data

In [15]:
X_train_scaled.to_csv("../data/processed/X_train.csv", index=False)
X_test_scaled.to_csv("../data/processed/X_test.csv", index=False)
y_train.to_csv("../data/processed/y_train.csv", index=False)
y_test.to_csv("../data/processed/y_test.csv", index=False)

In [16]:
print("Train shape:", X_train_scaled.shape)
print("Test shape:", X_test_scaled.shape)

Train shape: (7662, 18)
Test shape: (1916, 18)


## Feature Engineering Summary

- Separated features and target
- One-hot encoded categorical variables
- Preserved class imbalance using stratified split
- Scaled numerical features using StandardScaler
- Saved processed datasets for modeling
