In [1]:
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Sanity check: fit a baseline logistic regression on the Gold-layer tables
# and report a hold-out AUC, to confirm features and labels join up and
# carry some signal.

# Load Gold features & labels (paths are relative to the working directory)
features = pd.read_parquet("datamart/gold/features.parquet")
labels = pd.read_parquet("datamart/gold/labels.parquet")

# Align features and labels by loan_id.
# validate="one_to_one" makes pandas raise MergeError if loan_id is duplicated
# on either side, so the 1-to-1 mapping is actually enforced; a plain inner
# merge would silently multiply rows for duplicated keys and distort the fit.
df = features.merge(labels, on="loan_id", how="inner", validate="one_to_one")

# Define X, y: drop the identifier columns and the target from the feature
# matrix. errors="ignore" tolerates identifier columns that are not present.
X = df.drop(columns=["loan_id", "customer_id", "label_default"], errors="ignore")
y = df["label_default"]

# Keep only numeric features (non-numeric columns are discarded here) and
# impute missing values with 0.
X = X.select_dtypes(include="number").fillna(0)

# Ordered split: with shuffle=False the first 70% of rows train the model and
# the last 30% are held out, preserving the row order of the merged frame.
# NOTE(review): this is only a time-based split if the rows are already in
# chronological order; no sort is performed in this cell, so confirm upstream.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False
)

# Train baseline logistic regression (max_iter raised to 1000 iterations)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predictions: column 1 of predict_proba, i.e. the probability assigned to
# the second entry of model.classes_.
preds = model.predict_proba(X_test)[:, 1]

# Evaluate ranking quality on the hold-out rows
auc = roc_auc_score(y_test, preds)
print("✅ Sanity check completed")
print(f"Features shape: {X.shape}, Labels: {y.shape}")
print(f"Sanity AUC: {auc:.3f}")

✅ Sanity check completed
Features shape: (218902, 34), Labels: (218902,)
Sanity AUC: 0.671
