In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [2]:
DATA_PATH = "../data/raw/"
FILE_NAME = "default of credit card clients.xls"

df = pd.read_excel(DATA_PATH + FILE_NAME, header=1)

df.columns = (
    df.columns
    .str.strip()
    .str.upper()
    .str.replace(" ", "_")
)

df = df.rename(columns={"DEFAULT_PAYMENT_NEXT_MONTH": "DEFAULT"})
df.head()


Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,...,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,...,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,...,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [3]:
X = df.drop(columns=["DEFAULT"])
y = df["DEFAULT"]

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (30000, 24)
y shape: (30000,)


In [4]:
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train size:", X_train.shape)
print("Validation size:", X_val.shape)


Train size: (24000, 24)
Validation size: (6000, 24)


In [5]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

X_train_scaled = pd.DataFrame(
    X_train_scaled,
    columns=X_train.columns,
    index=X_train.index
)

X_val_scaled = pd.DataFrame(
    X_val_scaled,
    columns=X_val.columns,
    index=X_val.index
)


In [7]:
OUTPUT_PATH = "../data/processed/"

X_train_scaled.to_csv(OUTPUT_PATH + "X_train.csv", index=False)
X_val_scaled.to_csv(OUTPUT_PATH + "X_val.csv", index=False)
y_train.to_csv(OUTPUT_PATH + "y_train.csv", index=False)
y_val.to_csv(OUTPUT_PATH + "y_val.csv", index=False)

print("Processed datasets saved.")


Processed datasets saved.


## Feature Engineering Summary

- Separated features and target
- Performed stratified train/validation split
- Applied standard scaling to numerical features
- Saved processed datasets for downstream modeling
