In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the Prosper synthetic elasticity dataset; the path is relative to the
# notebook's working directory.
source_csv = "../data/Prosper_Synthetic_Elasticity.csv"
df = pd.read_csv(source_csv)

In [3]:
# Report (rows, columns) of the frame before any engineered columns are added.
initial_shape_msg = f"Initial Shape: {df.shape}"
print(initial_shape_msg)

Initial Shape: (1419750, 84)


In [4]:
# Linearly rescale the lower credit-score bound: 300 maps to 0.0 and
# 850 (= 300 + 550) maps to 1.0. Values outside that range are not clipped.
# NOTE(review): presumably 300-850 is the scoring scale in use — confirm.
df["risk_score_norm"] = df["CreditScoreRangeLower"].sub(300).div(550)

In [5]:
# Annualize the stated monthly income, swapping exact zeros for 0.01 so the
# ratio below never divides by zero.
# NOTE(review): a zero income therefore yields a very large loan_to_income
# (amount / 0.01), while a missing income stays NaN here — confirm that this
# asymmetry is intended.
annualized_income = df["StatedMonthlyIncome"].mul(12).replace(0, 0.01)
df["annual_inc"] = annualized_income
df["loan_to_income"] = df["LoanOriginalAmount"].div(annualized_income)

In [6]:
# Parse both date columns; values that cannot be parsed become NaT.
for date_col in ("FirstRecordedCreditLine", "ListingCreationDate"):
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

# Time elapsed between the first recorded credit line and the listing,
# expressed in 365-day years. Rows where either date is missing get 0.
credit_history_span = df["ListingCreationDate"] - df["FirstRecordedCreditLine"]
df["relationship_depth_years"] = (credit_history_span.dt.days / 365).fillna(0)

In [7]:
# Convert the loan term to years (presumably Term is in months — confirm).
df["term_years"] = df["Term"].div(12)

In [8]:
# Rough revenue estimate: principal x offered rate x term in years
# (a simple, non-amortizing interest approximation).
df["est_revenue"] = (
    df["LoanOriginalAmount"].mul(df["OfferedRate"]).mul(df["term_years"])
)

# Bucket the estimate into three equal-frequency bins (terciles).
clv_labels = ["Low", "Medium", "High"]
df["clv_segment"] = pd.qcut(df["est_revenue"], q=3, labels=clv_labels)

In [9]:
# Binary proxy derived purely from ProsperScore: scores of 9 or above are
# labelled High_Sensitivity, everything else Low_Sensitivity.
meets_score_cutoff = df["ProsperScore"] >= 9
df["price_sensitivity_proxy"] = np.where(
    meets_score_cutoff, "High_Sensitivity", "Low_Sensitivity"
)

In [10]:
# The "Levers": offer terms that appear directly in the dataset.
lever_features = ["OfferedRate", "LoanOriginalAmount", "Term"]

# Borrower attributes taken directly from the source columns.
borrower_features = [
    "ProsperScore",
    "CreditScoreRangeLower",
    "DebtToIncomeRatio",
    "StatedMonthlyIncome",
    "EmploymentStatus",
    "IsBorrowerHomeowner",
]

# Columns engineered earlier in this notebook.
engineered_features = [
    "risk_score_norm",
    "loan_to_income",
    "relationship_depth_years",
    "clv_segment",
    "price_sensitivity_proxy",
]

# Final model input columns, in the order levers -> borrower -> engineered.
features = [*lever_features, *borrower_features, *engineered_features]

In [11]:
# Target: the "Accepted" column of the source frame.
y = df["Accepted"]
# Inputs: an independent copy of the selected columns, so later edits to X
# do not write back into df.
X = df.loc[:, features].copy()

In [12]:
# Zero-fill missing values in the float64/int64 columns only; object and
# categorical columns are left as they are.
numeric_dtypes = ["float64", "int64"]
num_cols = X.select_dtypes(include=numeric_dtypes).columns
zero_filled_numeric = X[num_cols].fillna(0)
X[num_cols] = zero_filled_numeric

In [13]:
# Columns that are one-hot encoded before the train/val/test split.
categorical_cols = [
    "EmploymentStatus",
    "clv_segment",
    "price_sensitivity_proxy",
]

In [14]:
# Encode the feature matrix and carve it into train / validation / test sets.
#
# NOTE: this cell consumes the categorical columns of X, so re-running it
# without first rebuilding X raises a KeyError in get_dummies.

# Cast the homeowner flag to integers so it is numeric like the other inputs.
X["IsBorrowerHomeowner"] = X["IsBorrowerHomeowner"].astype(int)

# One-hot encode the categorical columns; drop_first removes the first level
# of each column to avoid perfectly collinear dummy columns.
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Split configuration, named once so both calls stay in sync.
SPLIT_SEED = 42
HOLDOUT_FRACTION = 0.4  # share of all rows held out of training
TEST_SHARE_OF_HOLDOUT = 0.5  # the holdout is halved into validation and test

# First split: 60% train / 40% holdout, stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=HOLDOUT_FRACTION, random_state=SPLIT_SEED, stratify=y
)

# Second split: halve the holdout into validation and test (20% / 20% overall),
# again stratified so each part keeps the target's class balance.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=TEST_SHARE_OF_HOLDOUT,
    random_state=SPLIT_SEED,
    stratify=y_temp,
)

# Sanity check: the three parts must account for every row exactly once.
assert len(X_train) + len(X_val) + len(X_test) == len(X), "splits must partition X"

print("Dataset Shapes:")
print(f"Train: {X_train.shape} (Fit the Logistic/XGBoost Model)")
print(f"Val:   {X_val.shape}   (Hyperparameter Tuning)")
print(f"Test:  {X_test.shape}  (Final Evaluation)")

Dataset Shapes:
Train: (851850, 21) (Fit the Logistic/XGBoost Model)
Val:   (283950, 21)   (Hyperparameter Tuning)
Test:  (283950, 21)  (Final Evaluation)


In [15]:
# Persist every split as a CSV under ../data/processed.
print("Saving processed datasets...")

processed_dir = Path("../data/processed")
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (a no-op when it is already there).
processed_dir.mkdir(parents=True, exist_ok=True)

# Name -> object mapping; files are written as prosper_<name>.csv in this order.
splits_to_save = {
    "X_train": X_train,
    "y_train": y_train,
    "X_val": X_val,
    "y_val": y_val,
    "X_test": X_test,
    "y_test": y_test,
}
for split_name, split_data in splits_to_save.items():
    # index=False: the original row index is not written to disk.
    split_data.to_csv(processed_dir / f"prosper_{split_name}.csv", index=False)

print("Datasets saved successfully.")

Saving processed datasets...
Datasets saved successfully.
