In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
import statsmodels.api as sm

In [2]:
# Synthetic mortgage portfolio with plausible feature ranges.
# The seed is fixed so every run generates the same loans; the order of the
# random draws below therefore determines the exact values produced.
np.random.seed(42)
n_samples = 1000

df = pd.DataFrame({
    # Interest rate in percent
    'interest_rate': np.random.uniform(low=2.5, high=6.5, size=n_samples),
    # Credit score
    'credit_score': np.random.normal(loc=700, scale=50, size=n_samples),
    # Loan-to-value in percent
    'ltv': np.random.uniform(low=60, high=100, size=n_samples),
    # Term in years
    'loan_term': np.random.choice([15, 30], size=n_samples),
    # Borrower's income
    'income': np.random.normal(loc=75000, scale=15000, size=n_samples),
}).assign(
    # The normal draws can stray outside realistic limits, so bound them.
    credit_score=lambda frame: frame['credit_score'].clip(lower=300, upper=850),
    income=lambda frame: frame['income'].clip(lower=20000, upper=200000),
)

In [3]:
# Ground-truth coefficients of the data-generating process
# (signs and magnitudes chosen to be meaningful).
beta = {
    'intercept': -5,
    'interest_rate': 0.6,    # Higher rates -> more prepayment
    'credit_score': -0.005,  # Higher credit score -> less prepayment
    'ltv': 0.04,             # Higher LTV -> more prepayment
    'loan_term': -0.1,       # Longer term -> less prepayment
    'income': 0.00001        # Higher income -> slightly more prepayment
}

# Linear predictor (log-odds of prepayment): start from the intercept and add
# each coefficient-weighted feature column in the order listed in `beta`.
log_odds = sum(
    (weight * df[feature] for feature, weight in beta.items() if feature != 'intercept'),
    beta['intercept'],
)

# Convert to probabilities with logistic function
def logistic(x):
    """Return the logistic (sigmoid) transform ``1 / (1 + exp(-x))`` of ``x``.

    Parameters
    ----------
    x : scalar or array-like supporting NumPy ufuncs (e.g. ndarray, Series)
        Log-odds value(s); the transform is applied element-wise.

    Returns
    -------
    Probability value(s) in the closed interval [0, 1], one per element of ``x``.
    """
    # For very negative x, exp(-x) overflows to inf and the quotient evaluates
    # to exactly 0.0, which is the correct limit. Only the spurious overflow
    # warning is suppressed; the computed values are unchanged.
    with np.errstate(over="ignore"):
        return 1 / (1 + np.exp(-x))

# Prepayment probability for each row, obtained by passing the log-odds
# through the logistic function.
p = logistic(log_odds)

In [6]:
# Simulate the binary prepaid outcome: one Bernoulli draw per loan with
# success probability p. This uses NumPy's global random state, so re-running
# this cell on its own (without re-seeding) produces a different sample.
df['prepaid'] = np.random.binomial(1, p)

# Prepare features and target
X = df[['interest_rate', 'credit_score', 'ltv', 'loan_term', 'income']]
y = df['prepaid']

# Standardize the features so each fitted coefficient is expressed per one
# standard deviation of its feature.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit the logistic regression with statsmodels, which estimates the
# coefficients by Maximum Likelihood Estimation using its internal optimizer.
# An intercept column has to be added explicitly.
X_sm = sm.add_constant(X_scaled)
model = sm.Logit(y, X_sm)
result = model.fit()

# The scaled design matrix is a plain array, so statsmodels would label the
# predictors x1..x5. Pass the real feature names so the coefficient table
# can be read without counting columns.
coef_names = ['const', *X.columns]
print(result.summary(xname=coef_names))

# Odds ratios with confidence intervals
params = result.params
conf = result.conf_int()

# Put the point estimates next to the interval bounds. Exponentiating a
# coefficient gives its odds ratio, and (OR - 1) is the relative change in
# the odds of prepayment for a one-standard-deviation increase in the feature.
conf['OR'] = params

conf.columns = ['2.5%', '97.5%', 'OR']

# Label the printed table with feature names; `conf` and `params` themselves
# are left exactly as statsmodels returned them.
odds_ratios = np.exp(conf)
odds_ratios.index = coef_names
print("\nOdds Ratios and 95% CI:\n", odds_ratios)

Optimization terminated successfully.
         Current function value: 0.102734
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                prepaid   No. Observations:                 1000
Model:                          Logit   Df Residuals:                      994
Method:                           MLE   Df Model:                            5
Date:                Tue, 27 May 2025   Pseudo R-squ.:                  0.1478
Time:                        16:05:49   Log-Likelihood:                -102.73
converged:                       True   LL-Null:                       -120.55
Covariance Type:            nonrobust   LLR p-value:                 1.126e-06
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -4.3910      0.339    -12.970      0.000      -5.055      -3.727
x1             0.7593      0.