In [1]:
# ============================================
# 02_preprocessing.ipynb — Data Preprocessing
# ============================================

import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Output directory for every artifact written by this notebook;
# exist_ok=True makes the call a no-op when the directory is already there.
os.makedirs(name="../data/processed", exist_ok=True)

# ============================================
# 1. Load Cleaned Data
# ============================================

print("ðŸ“Œ Loading cleaned dataset...")

# Read the cleaned CSV into the working frame used by all later cells.
df = pd.read_csv(filepath_or_buffer="../data/processed/cleaned_data.csv")

# Preview the first five rows as the cell's rich output.
df.head(n=5)

ðŸ“Œ Loading cleaned dataset...


Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [2]:
# ============================================
# 2. Basic Check
# ============================================

# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None" line).
df.info()

# Summary statistics of the numeric columns, shown as the cell's rich output.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44993 entries, 0 to 44992
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      44993 non-null  float64
 1   person_gender                   44993 non-null  object 
 2   person_education                44993 non-null  object 
 3   person_income                   44993 non-null  float64
 4   person_emp_exp                  44993 non-null  int64  
 5   person_home_ownership           44993 non-null  object 
 6   loan_amnt                       44993 non-null  float64
 7   loan_intent                     44993 non-null  object 
 8   loan_int_rate                   44993 non-null  float64
 9   loan_percent_income             44993 non-null  float64
 10  cb_person_cred_hist_length      44993 non-null  float64
 11  credit_score                    44993 non-null  int64  
 12  previous_loan_defaults_on_file  

Unnamed: 0,person_age,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,loan_status
count,44993.0,44993.0,44993.0,44993.0,44993.0,44993.0,44993.0,44993.0,44993.0
mean,27.748428,79908.45,5.394528,9583.176761,11.006448,0.139736,5.866557,632.585713,0.222257
std,5.909737,63322.13,5.927159,6314.802655,2.978985,0.087207,3.877167,50.402411,0.415767
min,20.0,8000.0,0.0,500.0,5.42,0.0,2.0,390.0,0.0
25%,24.0,47195.0,1.0,5000.0,8.59,0.07,3.0,601.0,0.0
50%,26.0,67046.0,4.0,8000.0,11.01,0.12,4.0,640.0,0.0
75%,30.0,95778.0,8.0,12237.0,12.99,0.19,8.0,670.0,0.0
max,94.0,2448661.0,76.0,35000.0,20.0,0.66,30.0,784.0,1.0


In [3]:
# ============================================
# 3. Encode Target Variable
# ============================================

# Accept both the textual ("Yes"/"No") and the already-numeric (1/0) labels.
# Including the identity entries keeps this cell safe to re-run.
target_mapping = {"Yes": 1, "No": 0, 1: 1, 0: 0}
encoded_target = df["loan_status"].map(target_mapping)

# Series.map() turns every value that is absent from the mapping into NaN.
# Fail loudly instead of silently carrying NaN labels into the train/test split.
unmapped_labels = df.loc[encoded_target.isna(), "loan_status"]
if not unmapped_labels.empty:
    raise ValueError(
        "Unexpected loan_status values (not in target mapping): "
        f"{unmapped_labels.unique().tolist()!r}"
    )

df["loan_status"] = encoded_target

# Class balance of the encoded target, shown as the cell's output.
df["loan_status"].value_counts()

loan_status
0    34993
1    10000
Name: count, dtype: int64

In [4]:
# ============================================
# 4. Split Features / Target
# ============================================

# Target vector first; every remaining column forms the feature matrix.
y = df["loan_status"]
X = df.drop(columns="loan_status")

# Partition the feature columns by dtype: object columns are treated as
# categorical, everything else as numeric. Column order is preserved.
categorical_cols = list(X.select_dtypes(include="object").columns)
numeric_cols = list(X.select_dtypes(exclude="object").columns)

print("Categorical:", categorical_cols)
print("Numeric:", numeric_cols)

Categorical: ['person_gender', 'person_education', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file']
Numeric: ['person_age', 'person_income', 'person_emp_exp', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score']


In [5]:
# ============================================
# 5. Preprocessing Pipeline
# ============================================

# Standardize the numeric features.
numeric_transformer = StandardScaler()
# One-hot encode the categorical features; handle_unknown="ignore" keeps
# transform() from raising on categories that were not present during fit().
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Route each column group to its transformer. Columns listed in neither
# group are dropped (ColumnTransformer's default remainder="drop").
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Display the (still unfitted) transformer as the cell's output.
preprocessor

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [6]:
# ============================================
# 6. Train/Test Split
# ============================================

print("ðŸ“Œ Splitting train/test dataset...")

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across kernel restarts.
# NOTE(review): the split is not stratified on y — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity check: (rows, columns) of the two feature partitions.
print(X_train.shape, X_test.shape)

ðŸ“Œ Splitting train/test dataset...
(35994, 13) (8999, 13)


In [7]:
# ============================================
# 7. Fit Preprocessor
# (Important for later model training)
# ============================================

print("ðŸ“Œ Fitting preprocessing pipeline...")

# Learn the scaling statistics and category vocabularies from the training
# split only, so nothing from the test split influences the transformation.
preprocessor.fit(X=X_train)

print("Done!")

ðŸ“Œ Fitting preprocessing pipeline...
Done!


In [8]:
# ============================================
# 8. Save Preprocessed Splits
# ============================================

print("ðŸ“Œ Transforming and saving processed features...")


def _as_dense(features):
    """Return ``features`` as a dense NumPy array.

    ColumnTransformer may return a scipy sparse matrix (OneHotEncoder emits
    sparse output by default and the result stays sparse when its density is
    below ``sparse_threshold``). ``np.save`` on a sparse matrix would write a
    pickled object array instead of a plain numeric array, so densify first.
    """
    if hasattr(features, "toarray"):
        return features.toarray()
    return np.asarray(features)


# Apply the transformer fitted on the training split to both partitions.
X_train_transformed = _as_dense(preprocessor.transform(X_train))
X_test_transformed = _as_dense(preprocessor.transform(X_test))

# Destination path -> array, in the order the files are written and reported.
processed_arrays = {
    "../data/processed/X_train.npy": X_train_transformed,
    "../data/processed/X_test.npy": X_test_transformed,
    "../data/processed/y_train.npy": y_train.values,
    "../data/processed/y_test.npy": y_test.values,
}

for output_path, array in processed_arrays.items():
    np.save(output_path, array)

print("Saved:")
for output_path in processed_arrays:
    print(output_path)

ðŸ“Œ Transforming and saving processed features...
Saved:
../data/processed/X_train.npy
../data/processed/X_test.npy
../data/processed/y_train.npy
../data/processed/y_test.npy


In [9]:
# ============================================
# 9. Save Preprocessor Object (for modeling)
# ============================================

import joblib

# Only ../data/processed is created earlier in this notebook; make sure the
# models directory exists too, otherwise joblib.dump raises FileNotFoundError
# on a fresh checkout.
os.makedirs("../models", exist_ok=True)

# Persist the fitted ColumnTransformer so the modeling stage can reuse the
# exact same scaling statistics and category vocabularies.
joblib.dump(preprocessor, "../models/preprocessor.pkl")

print("ðŸ“Œ Saved preprocessor to ../models/preprocessor.pkl")

ðŸ“Œ Saved preprocessor to ../models/preprocessor.pkl
