In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv("../data/accepted_2007_to_2018Q4.csv")

  df = pd.read_csv("../data/accepted_2007_to_2018Q4.csv")


In [3]:
target_mask = df["loan_status"].isin(["Fully Paid", "Charged Off", "Default"])
df_clean = df[target_mask].copy()

In [4]:
df_clean["target"] = df_clean["loan_status"].apply(
    lambda x: 0 if x == "Fully Paid" else 1
)

In [5]:
df_clean["risk_score_norm"] = (df_clean["fico_range_low"] - 300) / 550

In [6]:
df_clean["annual_inc"] = df_clean["annual_inc"].replace(0, 0.01)  # Avoid div by zero
df_clean["loan_to_income"] = df_clean["loan_amnt"] / df_clean["annual_inc"]

In [7]:
df_clean["earliest_cr_line"] = pd.to_datetime(
    df_clean["earliest_cr_line"], errors="coerce"
)
df_clean["issue_d"] = pd.to_datetime(df_clean["issue_d"], errors="coerce")
df_clean["relationship_depth_years"] = (
    df_clean["issue_d"] - df_clean["earliest_cr_line"]
).dt.days / 365
df_clean["relationship_depth_years"] = df_clean["relationship_depth_years"].fillna(0)

  df_clean["earliest_cr_line"] = pd.to_datetime(
  df_clean["issue_d"] = pd.to_datetime(df_clean["issue_d"], errors="coerce")


In [8]:
df_clean["term_years"] = df_clean["term"].str.extract("(\d+)").astype(float) / 12
df_clean["est_revenue"] = (
    df_clean["loan_amnt"] * (df_clean["int_rate"] / 100) * df_clean["term_years"]
)
df_clean["clv_segment"] = pd.qcut(
    df_clean["est_revenue"], q=3, labels=["Low", "Medium", "High"]
)

In [9]:
df_clean["price_sensitivity_proxy"] = np.where(
    df_clean["fico_range_low"] > 720, "High_Sensitivity", "Low_Sensitivity"
)

In [10]:
leakage_cols = [
    "hardship_flag",
    "pymnt_plan",
    "recoveries",
    "collection_recovery_fee",
    "total_pymnt",
    "total_rec_prncp",
    "last_pymnt_d",
    "last_credit_pull_d",
]
df_clean.drop(columns=leakage_cols, errors="ignore", inplace=True)

In [11]:
features = [
    # Core Financials
    "loan_amnt",
    "term_years",
    "int_rate",
    "installment",
    "annual_inc",
    "dti",
    "revol_util",
    "revol_bal",
    "total_acc",
    "home_ownership",
    # Engineered Features
    "risk_score_norm",
    "loan_to_income",
    "relationship_depth_years",
    "clv_segment",
    "price_sensitivity_proxy",
    # Categoricals
    "grade",
    "sub_grade",
    "verification_status",
    "purpose",
    "emp_length",
]

In [12]:
X = df_clean[features].copy()
y = df_clean["target"]

In [13]:
num_cols = X.select_dtypes(include=["float64", "int64"]).columns
X[num_cols] = X[num_cols].fillna(0)

In [14]:
grade_map = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
X["grade"] = X["grade"].map(grade_map)

In [15]:
categorical_cols = [
    "sub_grade",
    "home_ownership",
    "verification_status",
    "purpose",
    "emp_length",
    "clv_segment",
    "price_sensitivity_proxy",
]
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

In [None]:
print(f"Type before rename: {type(X)}")
X = X.rename(columns={"emp_length_< 1": "emp_length_less_than_1"})
print(f"Type after rename: {type(X)}")

Type before rename: <class 'pandas.core.frame.DataFrame'>
Type after rename: <class 'pandas.core.frame.DataFrame'>


In [17]:
print(f"Type of X: {type(X)}")
print(f"Type of y: {type(y)}")

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

Type of X: <class 'pandas.core.frame.DataFrame'>
Type of y: <class 'pandas.core.series.Series'>


In [18]:
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"Dataset Shapes:")
print(f"Train: {X_train.shape} (Used for Model Fitting)")
print(f"Val:   {X_val.shape}   (Used for Early Stopping / Tuning)")
print(f"Test:  {X_test.shape}  (Used for Final Evaluation)")
print(
    f"\nEngineered Features Included: {['risk_score_norm', 'loan_to_income', 'relationship_depth_years', 'clv_segment', 'price_sensitivity_proxy']}"
)

Dataset Shapes:
Train: (807210, 80) (Used for Model Fitting)
Val:   (269070, 80)   (Used for Early Stopping / Tuning)
Test:  (269070, 80)  (Used for Final Evaluation)

Engineered Features Included: ['risk_score_norm', 'loan_to_income', 'relationship_depth_years', 'clv_segment', 'price_sensitivity_proxy']


In [19]:
print("Saving processed datasets...")
X_train.to_csv("../data/processed/lending_club_X_train.csv", index=False)
y_train.to_csv("../data/processed/lending_club_y_train.csv", index=False)
X_val.to_csv("../data/processed/lending_club_X_val.csv", index=False)
y_val.to_csv("../data/processed/lending_club_y_val.csv", index=False)
X_test.to_csv("../data/processed/lending_club_X_test.csv", index=False)
y_test.to_csv("../data/processed/lending_club_y_test.csv", index=False)
print("Datasets saved.")

Saving processed datasets...
Datasets saved.
