In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import roc_auc_score, mean_squared_error, confusion_matrix

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global random generator so NumPy-based randomness repeats across runs.
# (The estimators and the train/test split below also pass random_state=42 explicitly.)
np.random.seed(42)


# Flag read by the plotting cells: when True, figures are also written as PNG
# files under ../outputs/. Set True only when you want to save figures.
EXPORT_PLOTS = False  # Set True only when you want to save figures


In [None]:
# Read the loan dataset and key every row by its customer identifier
df = pd.read_csv('../data/loan_data.csv').set_index('customer_id')

# Preview the first rows
df.head()

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

# Last expression of the cell: summary statistics rendered as a rich table
df.describe()

In [None]:
# Target balance: relative class frequencies first, then a bar chart of raw counts
target_share = df['default'].value_counts(normalize=True)
print(target_share)

sns.countplot(data=df, x='default')
plt.show()

In [None]:
# Exploratory Data Analysis: pairwise relationships between columns,
# coloured by the target
pair_plot = sns.pairplot(data=df, hue='default')
plt.show()

# Optional export, controlled by the EXPORT_PLOTS flag
if EXPORT_PLOTS:
    pair_plot.savefig(
        "../outputs/credit_data_pairplot.png",
        dpi=300,
        bbox_inches='tight',
    )


In [None]:
# Correlation heatmap
# Create the figure explicitly so the requested size is actually applied
# (previously `figsize` was assigned but never passed to matplotlib).
figsize = (12, 8)
fig, ax = plt.subplots(figsize=figsize)
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm_r',
            vmin=-1, vmax=1, ax=ax)
plt.show()

# Optional export, controlled by the EXPORT_PLOTS flag
if EXPORT_PLOTS:
    fig.savefig("../outputs/credit_data_correl.png", dpi=300,
                bbox_inches='tight')


In [None]:
# Data Preprocessing: separate the target column from the feature columns
y = df['default']
X = df.drop(columns=['default'])

# Stratified hold-out split (20% test) with a fixed seed
X_train_df, X_test_df, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise features; the scaler learns its statistics from the training
# split only and is then applied to both splits
scaler = StandardScaler().fit(X_train_df)
X_train = scaler.transform(X_train_df)
X_test = scaler.transform(X_test_df)


In [None]:
# Logistic regression on degree-2 polynomial features.
# Note: the final estimator is a classifier (LogisticRegression), even though the
# models dictionary labels this pipeline "Polynomial Regression (deg=2)".
poly_log_reg = Pipeline([
    # Expand the inputs with squared and pairwise-interaction terms (no constant column)
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    # Re-standardise after the expansion so the new terms share a common scale
    ('scaler', StandardScaler()),
    ('log_reg', LogisticRegression(max_iter=500))
])

In [None]:
# Models dictionary: display name -> unfitted estimator.
# Insertion order is the order in which models are fitted and reported.
models = {
    # Baseline that always predicts the most frequent class
    "Dummy Classifier": DummyClassifier(strategy="most_frequent"),
    "Logistic Regression": LogisticRegression(max_iter=500),
    # Pipeline defined above: polynomial features -> scaler -> logistic regression
    "Polynomial Regression (deg=2)": poly_log_reg,
    "Decision Tree": DecisionTreeClassifier(class_weight="balanced", random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Random Forest": RandomForestClassifier(class_weight="balanced", random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

In [None]:
# Train every candidate on the scaled training split, then score it on the
# scaled test split
results = {}

for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Hard labels feed MSE and the confusion matrix; the probability of the
    # second class (column 1) feeds ROC AUC
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # NOTE(review): MSE is computed on predicted labels; if the labels are 0/1
    # this equals the misclassification rate - confirm that is the intent.
    results[model_name] = {
        "ROC AUC": roc_auc_score(y_test, y_proba),
        "MSE": mean_squared_error(y_test, y_pred),
        "Confusion Matrix": confusion_matrix(y_test, y_pred),
    }

# Text report: one block per model, closed by a dashed separator line
separator = "-" * 30
for model_name, metrics in results.items():
    print(
        f"Model: {model_name}",
        f"ROC AUC: {metrics['ROC AUC']:.4f}",
        f"MSE: {metrics['MSE']:.4f}",
        "Confusion Matrix:",
        metrics['Confusion Matrix'],
        separator,
        sep="\n",
    )

### Model Comparison and Interpretation

Below are some notes to help interpret the results you observed:

#### 1. Dummy Classifier
- Predicts only the majority class.
- Its constant prediction carries no ranking information, which explains the ROC AUC of 0.50.
- Cannot detect any pattern in the data.

#### 2. Logistic Regression
- Performs extremely well here (AUC = 1.00).
- Works best when the true decision boundary is close to linear.
- Can be influenced by outliers because coefficients shift to fit extreme points.

#### 3. Polynomial Regression (degree=2)
- This is logistic regression fitted on degree-2 polynomial features: it adds interaction and squared terms, which increases flexibility.
- Can fit noise and outliers too strongly due to squared terms.
- Here it performs almost perfectly, meaning the dataset is simple or clean.

#### 4. Decision Tree
- Very flexible and learns sharp rules.
- Highly sensitive to outliers: a single extreme point can create an unnecessary split.
- Slightly higher MSE and a few more misclassifications are consistent with mild overfitting (comparing train and test scores would confirm it).

#### 5. KNN
- Distance-based model: outliers distort neighborhood structure.
- Works well on clean, dense data, but high dimensions or unscaled features hurt performance.
- The confusion matrix shows a few more false negatives, likely from test points whose nearest neighbors belong to the other class.

#### 6. Random Forest
- More stable than a single tree due to averaging.
- Still somewhat sensitive if outliers repeatedly affect splits across trees.
- Strong performance suggests features are clean and informative.

#### 7. Gradient Boosting
- Sequentially focuses on correcting previous errors.
- Can overfit outliers if not tuned, because it keeps pushing on difficult points.
- Very strong AUC indicates the dataset is easy to separate.

### Why outliers matter in these models

- **Linear models:** boundary shifts to accommodate extreme points.
- **Polynomial:** extremes are amplified through squared terms.
- **Trees:** outliers trigger deep splits, creating overfitting.
- **KNN:** distances become misleading.
- **Ensembles:** more robust, but still influenced if the same outliers appear often.

Overall, the dataset seems extremely separable, which explains why most models reach very high ROC AUC values.

In [None]:
# Convert results dict → DataFrame: one row per model, with the confusion
# matrix flattened into four count columns
html_rows = []
for model_name, metrics in results.items():
    cm = metrics["Confusion Matrix"]
    # Flattened matrix is unpacked as TN, FP, FN, TP (fails loudly if it is not 2x2)
    tn, fp, fn, tp = cm.ravel()

    row = {
        "Model": model_name,
        "ROC AUC": round(metrics["ROC AUC"], 4),
        "MSE": round(metrics["MSE"], 4),
    }
    row.update(TN=tn, FP=fp, FN=fn, TP=tp)
    html_rows.append(row)

df_results = pd.DataFrame(html_rows)

# Save as HTML file (GitHub-friendly)
html_path = "../outputs/model_results.html"
df_results.to_html(html_path, index=False)
print(f"Saved HTML summary to {html_path}")

# Last expression of the cell: show the summary table
df_results

In [None]:
def calculate_expected_loss(model, X_input, loan_amount, recovery_rate=0.1):
    """Return the expected loss for the first row of ``X_input``.

    Expected loss is computed as ``PD * LGD * EAD`` where

    * PD  (probability of default) is column 1 of ``model.predict_proba``,
    * LGD (loss given default) is ``1 - recovery_rate``,
    * EAD (exposure at default) is ``loan_amount``.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``
    X_input : 2-D array-like of features, in the same representation the
        model was trained on
    loan_amount : outstanding amount exposed to default
    recovery_rate : scalar fraction of the exposure recovered after a default;
        must lie in [0, 1] (default 0.1)

    Returns
    -------
    The expected loss of the first row only; any further rows in ``X_input``
    are scored by the model but not returned.

    Raises
    ------
    ValueError
        If ``recovery_rate`` is outside [0, 1].
    """
    # A rate outside [0, 1] would silently yield a negative or inflated loss
    if not 0 <= recovery_rate <= 1:
        raise ValueError(
            f"recovery_rate must be between 0 and 1, got {recovery_rate!r}"
        )

    # Named `prob_default` rather than `pd` so the pandas alias is not shadowed
    prob_default = model.predict_proba(X_input)[:, 1]
    loss_given_default = 1 - recovery_rate
    expected_loss = prob_default * loss_given_default * loan_amount
    return expected_loss[0]

In [None]:
# Example: expected loss for the first loan of the test split.
# The model is fed the scaled feature row (X_test), while the exposure is read
# from the unscaled frame (X_test_df) so it stays in original units.
first_customer = X_test_df.index[0]
loan_amount = X_test_df.loc[first_customer, 'loan_amt_outstanding']
example_loan = X_test[:1]  # slice keeps the 2-D (1, n_features) shape

el = calculate_expected_loss(
    models['Logistic Regression'],
    example_loan,
    loan_amount,
)
print(f"Expected Loss for the example loan: {el:.2f}")
