In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                             roc_curve, roc_auc_score, precision_score, recall_score,
                             f1_score, log_loss, brier_score_loss)
import warnings
# NOTE(review): this silences *every* warning category for the rest of the
# session, which would also hide any solver-convergence or deprecation
# warnings emitted by the model fits below -- confirm that is intended.
warnings.filterwarnings('ignore')

In [13]:
pip install xlrd

Collecting xlrd
  Downloading xlrd-2.0.2-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading xlrd-2.0.2-py2.py3-none-any.whl (96 kB)
   ---------------------------------------- 0.0/96.6 kB ? eta -:--:--
   ---------------------------------------- 0.0/96.6 kB ? eta -:--:--
   ---------------------------------------- 0.0/96.6 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/96.6 kB ? eta -:--:--
   ---- ----------------------------------- 10.2/96.6 kB ? eta -:--:--
   ------------ --------------------------- 30.7/96.6 kB 262.6 kB/s eta 0:00:01
   ------------ --------------------------- 30.7/96.6 kB 262.6 kB/s eta 0:00:01
   ------------------------- -------------- 61.4/96.6 kB 299.4 kB/s eta 0:00:01
   -------------------------------------- - 92.2/96.6 kB 403.5 kB/s eta 0:00:01
   ---------------------------------------- 96.6/96.6 kB 345.3 kB/s eta 0:00:00
Installing collected packages: xlrd
Successfully installed xlrd-2.0.2
Note: you may need to restart the kernel to us

In [29]:
import xlrd

In [3]:
# Fix NumPy's global random state so any NumPy-driven randomness is repeatable.
np.random.seed(seed=40)

In [5]:
# Notebook-wide plotting defaults: white-grid style sheet, 12x8 figures and
# a base font size of 12.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

In [31]:
import os

# Artefacts are written to an 'outputs' folder inside the current working
# directory; the folder is created on first use and reused afterwards.
_working_dir = os.getcwd()
OUTPUT_DIR = os.path.join(_working_dir, 'outputs')
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [39]:
# Source workbook, resolved relative to the notebook's working directory.
DATA_FILE = 'default.xls'

# header=1: the column labels are taken from the second spreadsheet row
# (0-based row 1) rather than from the first one.
df = pd.read_excel(io=DATA_FILE, header=1)

In [41]:
# Report the size of the raw table (rows = observations, columns = variables).
n_obs, n_vars = df.shape
print(f"Shape: {n_obs:,} observations, {n_vars} variables")

Shape: 30,000 observations, 25 variables


In [43]:
# Human-readable data dictionary for the columns of the dataset; printed in
# insertion order, one "name: meaning" line per entry.
print("\nVariable Descriptions:")
print("-" * 60)
descriptions = dict([
    ('ID', 'Customer ID'),
    ('LIMIT_BAL', 'Credit limit (NT dollars)'),
    ('SEX', 'Gender (1=male, 2=female)'),
    ('EDUCATION', 'Education (1=grad school, 2=university, 3=high school, 4=other)'),
    ('MARRIAGE', 'Marital status (1=married, 2=single, 3=other)'),
    ('AGE', 'Age in years'),
    ('PAY_0 to PAY_6', 'Repayment status (-1=paid duly, 1-9=months delayed)'),
    ('BILL_AMT1-6', 'Bill statement amount (NT dollars)'),
    ('PAY_AMT1-6', 'Previous payment amount (NT dollars)'),
    ('default payment next month', 'Default indicator (0=No, 1=Yes) - TARGET'),
])
for column_name, meaning in descriptions.items():
    print(f"  {column_name}: {meaning}")


Variable Descriptions:
------------------------------------------------------------
  ID: Customer ID
  LIMIT_BAL: Credit limit (NT dollars)
  SEX: Gender (1=male, 2=female)
  EDUCATION: Education (1=grad school, 2=university, 3=high school, 4=other)
  MARRIAGE: Marital status (1=married, 2=single, 3=other)
  AGE: Age in years
  PAY_0 to PAY_6: Repayment status (-1=paid duly, 1-9=months delayed)
  BILL_AMT1-6: Bill statement amount (NT dollars)
  PAY_AMT1-6: Previous payment amount (NT dollars)
  default payment next month: Default indicator (0=No, 1=Yes) - TARGET


In [45]:
# Class balance of the target: absolute count and percentage share per class
# (label 0 = no default, label 1 = default).
default_counts = df['default payment next month'].value_counts()
default_share = default_counts / len(df) * 100
print(f"  No Default (0): {default_counts.loc[0]:,} ({default_share.loc[0]:.1f}%)")
print(f"  Default (1):    {default_counts.loc[1]:,} ({default_share.loc[1]:.1f}%)")
print(f"\n  Note: Imbalanced dataset with {default_share.loc[1]:.1f}% default rate")

  No Default (0): 23,364 (77.9%)
  Default (1):    6,636 (22.1%)

  Note: Imbalanced dataset with 22.1% default rate


In [47]:
# Summary statistics (count, mean, std, quartiles, extremes) for a handful of
# continuous predictors, rounded to two decimals for readability.
key_vars = ['LIMIT_BAL', 'AGE', 'BILL_AMT1', 'PAY_AMT1']
print("\n\nDescriptive Statistics for Key Predictors:")
print("-" * 60)
print(df.loc[:, key_vars].describe().round(decimals=2))



Descriptive Statistics for Key Predictors:
------------------------------------------------------------
        LIMIT_BAL       AGE  BILL_AMT1   PAY_AMT1
count    30000.00  30000.00   30000.00   30000.00
mean    167484.32     35.49   51223.33    5663.58
std     129747.66      9.22   73635.86   16563.28
min      10000.00     21.00 -165580.00       0.00
25%      50000.00     28.00    3558.75    1000.00
50%     140000.00     34.00   22381.50    2100.00
75%     240000.00     41.00   67091.00    5006.00
max    1000000.00     79.00  964511.00  873552.00


In [49]:
# Total number of missing cells across the whole table (per-column counts
# summed into a single figure).
print(f"\nMissing Values: {df.isna().sum().sum()}")


Missing Values: 0


In [51]:
# Remove the customer identifier so it is not used as a predictor.
# errors='ignore' keeps this cell idempotent: re-running it after 'ID' has
# already been dropped is a no-op instead of raising a KeyError.
df = df.drop(columns='ID', errors='ignore')

In [53]:
# Separate the binary target from the predictor matrix; `df` itself is left
# unchanged.
y = df['default payment next month']
X = df.drop(columns='default payment next month')

In [55]:
# Name the response variable and count the predictor columns.
print("\nTarget Variable: default payment next month")
print(f"Number of Features: {len(X.columns)}")


Target Variable: default payment next month
Number of Features: 23


In [57]:
# Numbered (1-based) listing of every predictor column, in table order.
print("\nFeature Names:")
for position, feature in enumerate(X.columns, start=1):
    print(f"  {position:2d}. {feature}")


Feature Names:
   1. LIMIT_BAL
   2. SEX
   3. EDUCATION
   4. MARRIAGE
   5. AGE
   6. PAY_0
   7. PAY_2
   8. PAY_3
   9. PAY_4
  10. PAY_5
  11. PAY_6
  12. BILL_AMT1
  13. BILL_AMT2
  14. BILL_AMT3
  15. BILL_AMT4
  16. BILL_AMT5
  17. BILL_AMT6
  18. PAY_AMT1
  19. PAY_AMT2
  20. PAY_AMT3
  21. PAY_AMT4
  22. PAY_AMT5
  23. PAY_AMT6


In [59]:
# 80/20 hold-out split. Stratifying on `y` keeps the default rate the same in
# both partitions; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=40,
)

In [61]:
# Sanity-check the split: partition sizes, their share of the full data set,
# and the default rate inside each partition.
n_total = len(X)
n_train = X_train.shape[0]
n_test = X_test.shape[0]
print("\nData Split (Stratified by target):")
print(f"  Training set: {n_train:,} observations ({n_train/n_total*100:.1f}%)")
print(f"    - Default rate: {y_train.mean()*100:.1f}%")
print(f"  Test set: {n_test:,} observations ({n_test/n_total*100:.1f}%)")
print(f"    - Default rate: {y_test.mean()*100:.1f}%")


Data Split (Stratified by target):
  Training set: 24,000 observations (80.0%)
    - Default rate: 22.1%
  Test set: 6,000 observations (20.0%)
    - Default rate: 22.1%


In [63]:
# Standardise the predictors. The scaler's statistics are learned from the
# training partition only and then applied unchanged to the test partition,
# so no information from the hold-out data enters the preprocessing.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [65]:
# Re-wrap the scaled feature matrices as DataFrames, restoring both the column
# names and the original row labels. Carrying over the row index keeps the
# scaled features aligned with y_train / y_test for any later index-based
# pandas operation (boolean masks, joins, concat); without it the frames would
# get fresh 0..n-1 labels that no longer match the targets.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X.columns, index=X_test.index)

In [67]:
# Registry of fitted-model summaries, keyed by model name; filled in below.
results = dict()

In [69]:
# Baseline logistic regression. C is the inverse of the penalty strength, so
# C=1e10 makes the L2 penalty negligible and the fit behaves like an
# (effectively) unregularised model.
logistic_model = LogisticRegression(
    C=1e10,
    penalty='l2',
    solver='lbfgs',
    max_iter=1000,
    random_state=40,
)
logistic_model.fit(X_train_scaled, y_train)

In [71]:
# Hard class predictions for both partitions ...
y_pred_log_train, y_pred_log_test = (
    logistic_model.predict(features)
    for features in (X_train_scaled, X_test_scaled)
)
# ... and the predicted probability of the positive class (column 1).
y_prob_log_train, y_prob_log_test = (
    logistic_model.predict_proba(features)[:, 1]
    for features in (X_train_scaled, X_test_scaled)
)

In [73]:
# Summary record for the baseline model. Entries without a 'Train'/'Test'
# prefix (Brier Score, Precision, Recall, F1 Score) are computed on the test
# partition. Fitted parameters and test-set outputs are kept alongside the
# metrics for later comparison.
results['Logistic'] = dict([
    # Discrimination / accuracy on both partitions
    ('Train Accuracy', accuracy_score(y_train, y_pred_log_train)),
    ('Test Accuracy', accuracy_score(y_test, y_pred_log_test)),
    ('Train AUC', roc_auc_score(y_train, y_prob_log_train)),
    ('Test AUC', roc_auc_score(y_test, y_prob_log_test)),
    # Probability quality
    ('Train Log Loss', log_loss(y_train, y_prob_log_train)),
    ('Test Log Loss', log_loss(y_test, y_prob_log_test)),
    ('Brier Score', brier_score_loss(y_test, y_prob_log_test)),
    # Threshold-based test metrics
    ('Precision', precision_score(y_test, y_pred_log_test)),
    ('Recall', recall_score(y_test, y_pred_log_test)),
    ('F1 Score', f1_score(y_test, y_pred_log_test)),
    # Model settings, fitted parameters and raw test outputs
    ('C', 1e10),
    ('coefficients', logistic_model.coef_[0]),
    ('intercept', logistic_model.intercept_[0]),
    ('probabilities_test', y_prob_log_test),
    ('predictions_test', y_pred_log_test),
])

In [75]:
# Headline numbers for the baseline model, read back from the results registry.
logistic_report = results['Logistic']
print("\nStandard Logistic Regression Results:")
print(f"  Training Accuracy: {logistic_report['Train Accuracy']*100:.2f}%")
print(f"  Test Accuracy:     {logistic_report['Test Accuracy']*100:.2f}%")
print(f"  Training AUC:      {logistic_report['Train AUC']:.4f}")
print(f"  Test AUC:          {logistic_report['Test AUC']:.4f}")
print(f"  Test Log Loss:     {logistic_report['Test Log Loss']:.4f}")
print(f"  Test Brier Score:  {logistic_report['Brier Score']:.4f}")
# A coefficient counts as non-zero when its magnitude exceeds 1e-6.
print(f"\n  Number of non-zero coefficients: {np.count_nonzero(np.abs(logistic_model.coef_[0]) > 1e-6)}")


Standard Logistic Regression Results:
  Training Accuracy: 80.91%
  Test Accuracy:     81.53%
  Training AUC:      0.7219
  Test AUC:          0.7339
  Test Log Loss:     0.4593
  Test Brier Score:  0.1423

  Number of non-zero coefficients: 23


In [77]:
# Candidate inverse penalty strengths: 20 log-spaced values from 1e-4 to 1e4.
Cs = np.logspace(start=-4, stop=4, num=20)

# L2-penalised logistic regression whose C is chosen by 5-fold
# cross-validation, scored by ROC AUC.
ridge_cv = LogisticRegressionCV(
    Cs=Cs,
    cv=5,
    penalty='l2',
    solver='lbfgs',
    scoring='roc_auc',
    max_iter=1000,
    random_state=40,
)
ridge_cv.fit(X_train_scaled, y_train)

In [79]:
# Selected inverse penalty strength and the equivalent penalty weight 1/C.
best_C = ridge_cv.C_[0]
print(f"\nOptimal C via 5-fold CV: {best_C:.4f}")
print(f"Corresponding λ (1/C): {1/best_C:.4f}")


Optimal C via 5-fold CV: 29.7635
Corresponding λ (1/C): 0.0336


In [81]:
# Hard class predictions for both partitions ...
y_pred_ridge_train, y_pred_ridge_test = (
    ridge_cv.predict(features)
    for features in (X_train_scaled, X_test_scaled)
)
# ... and the predicted probability of the positive class (column 1).
y_prob_ridge_train, y_prob_ridge_test = (
    ridge_cv.predict_proba(features)[:, 1]
    for features in (X_train_scaled, X_test_scaled)
)

In [83]:
# Summary record for the cross-validated ridge model. Entries without a
# 'Train'/'Test' prefix (Brier Score, Precision, Recall, F1 Score) are
# computed on the test partition. The selected C, its reciprocal, the fitted
# parameters and the test-set outputs are stored alongside the metrics.
results['Ridge'] = dict([
    # Discrimination / accuracy on both partitions
    ('Train Accuracy', accuracy_score(y_train, y_pred_ridge_train)),
    ('Test Accuracy', accuracy_score(y_test, y_pred_ridge_test)),
    ('Train AUC', roc_auc_score(y_train, y_prob_ridge_train)),
    ('Test AUC', roc_auc_score(y_test, y_prob_ridge_test)),
    # Probability quality
    ('Train Log Loss', log_loss(y_train, y_prob_ridge_train)),
    ('Test Log Loss', log_loss(y_test, y_prob_ridge_test)),
    ('Brier Score', brier_score_loss(y_test, y_prob_ridge_test)),
    # Threshold-based test metrics
    ('Precision', precision_score(y_test, y_pred_ridge_test)),
    ('Recall', recall_score(y_test, y_pred_ridge_test)),
    ('F1 Score', f1_score(y_test, y_pred_ridge_test)),
    # Selected hyper-parameter, fitted parameters and raw test outputs
    ('C', ridge_cv.C_[0]),
    ('lambda', 1/ridge_cv.C_[0]),
    ('coefficients', ridge_cv.coef_[0]),
    ('intercept', ridge_cv.intercept_[0]),
    ('probabilities_test', y_prob_ridge_test),
    ('predictions_test', y_pred_ridge_test),
])

In [85]:
# Headline numbers for the ridge model, read back from the results registry.
ridge_report = results['Ridge']
print("\nRidge Logistic Regression Results:")
print(f"  Training Accuracy: {ridge_report['Train Accuracy']*100:.2f}%")
print(f"  Test Accuracy:     {ridge_report['Test Accuracy']*100:.2f}%")
print(f"  Training AUC:      {ridge_report['Train AUC']:.4f}")
print(f"  Test AUC:          {ridge_report['Test AUC']:.4f}")
print(f"  Test Log Loss:     {ridge_report['Test Log Loss']:.4f}")
print(f"  Test Brier Score:  {ridge_report['Brier Score']:.4f}")
# A coefficient counts as non-zero when its magnitude exceeds 1e-6.
print(f"\n  Number of non-zero coefficients: {np.count_nonzero(np.abs(ridge_cv.coef_[0]) > 1e-6)}")
print("  (Ridge shrinks but doesn't eliminate coefficients)")


Ridge Logistic Regression Results:
  Training Accuracy: 80.91%
  Test Accuracy:     81.52%
  Training AUC:      0.7219
  Test AUC:          0.7339
  Test Log Loss:     0.4594
  Test Brier Score:  0.1424

  Number of non-zero coefficients: 23
  (Ridge shrinks but doesn't eliminate coefficients)


In [87]:
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12