# IRB PD Model – Cleaned Notebook

In [None]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


## Data Loading and Feature Engineering

In [None]:

# Load the raw dataset and derive the engineered columns used further down.
# NOTE(review): the path is absolute and machine-specific — consider making it
# configurable before sharing the notebook.
df = pd.read_csv(r"C:\Users\mel\1. IRK\hypo_credit_risk_dataset1.csv")

# log1p compresses the income scale while keeping zero income finite.
df['log_annual_income'] = np.log1p(df['annual_income'])

# A zero income would turn the ratio into +/-inf (or NaN for 0/0). Mask zero
# denominators so the ratio is a plain NaN that can be imputed or filtered,
# instead of an infinity that breaks later numeric steps.
df['loan_to_income_ratio'] = (
    df['exposure_at_default'] / df['annual_income'].replace(0, np.nan)
)

# Quintile index (0-4) of the internal credit score.
df['credit_score_bin'] = pd.qcut(df['credit_score_internal'], q=5, labels=False)

# pd.cut builds right-closed intervals by default, i.e. (18, 30], which would
# leave age == 18 without a bucket despite the "18-30" label.
# include_lowest=True closes the first interval on the left as well.
df['age_bucket'] = pd.cut(
    df['age_years'],
    bins=[18, 30, 45, 60, 10000],
    labels=["18-30", "31-45", "46-60", "61+"],
    include_lowest=True,
)


## Visual Check: Default Rate by Rating Grade

In [None]:

# Mean of default_flag within each rating grade, drawn as one bar per grade.
rating_default_rate = df['default_flag'].groupby(df['rating_grade']).mean()

# barplot returns the Axes it drew on, so title and axis labels are set
# directly on that Axes in a single call.
sns.barplot(
    x=rating_default_rate.index,
    y=rating_default_rate.values,
).set(
    title="Default Rate by Rating Grade",
    xlabel="Rating Grade",
    ylabel="Default Rate",
)
plt.show()


## Train/Test Split Based on Snapshot Year

In [None]:

# Out-of-time split: snapshots strictly before val_year form the training set,
# snapshots from val_year itself form the test set. Rows with a later
# snapshot_year end up in neither frame.
val_year = 2024
train_df = df.loc[df['snapshot_year'].lt(val_year)].copy()
test_df = df.loc[df['snapshot_year'].eq(val_year)].copy()


## One-Hot Encoding + Target

In [None]:

# Build the numeric design matrices (X_*_encoded) and targets (y_*_encoded)
# from the train/test frames.
categorical_vars = ['customer_region', 'industry_sector', 'housing_status', 'marital_status']
target_col = 'default_flag'

y_train_encoded = train_df[target_col]
y_test_encoded = test_df[target_col]

# The target must be removed BEFORE encoding: it is numeric, so it would
# otherwise survive the numeric-only filter below and leak into the features.
# dtype=int keeps the dummy columns numeric; newer pandas versions emit bool
# dummies by default, which select_dtypes(include=[np.number]) would drop.
X_train_encoded = pd.get_dummies(
    train_df.drop(columns=[target_col]),
    columns=categorical_vars,
    prefix=categorical_vars,
    dtype=int,
)
X_test_encoded = pd.get_dummies(
    test_df.drop(columns=[target_col]),
    columns=categorical_vars,
    prefix=categorical_vars,
    dtype=int,
)

# Binary indicator: 1 for retail customers, 0 for every other customer type.
X_train_encoded['customer_type_num'] = (train_df['customer_type'] == 'retail').astype(int)
X_test_encoded['customer_type_num'] = (test_df['customer_type'] == 'retail').astype(int)

# Drop the raw categorical columns (those consumed by get_dummies are already
# gone, hence errors='ignore') and keep numeric features only.
# NOTE(review): snapshot_year (the split key) is still a numeric feature here
# and is constant within the test set — confirm whether it should be excluded.
# NOTE(review): non-numeric columns such as age_bucket are dropped by the
# numeric filter — confirm that is intended.
drop_cols = ['customer_type', 'customer_region', 'industry_sector', 'housing_status', 'marital_status']
X_train_encoded = X_train_encoded.drop(columns=drop_cols, errors='ignore')
X_train_encoded = X_train_encoded.select_dtypes(include=[np.number])

# Align the test matrix to the final training columns (same set, same order).
# Dummy levels that never occur in the test data are filled with 0; levels
# seen only in the test data are discarded.
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)


## Feature Scaling

In [None]:

# Standardise the features: statistics are fitted on the training matrix only
# and then applied unchanged to the test matrix.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

# Sanity check on the scaled training data: every column should have
# mean ≈ 0 and std ≈ 1.
means = X_train_scaled.mean(axis=0)
stds = X_train_scaled.std(axis=0)

scaling_check = pd.DataFrame({'mean': means, 'std': stds}, index=X_train_encoded.columns)

# A NaN mean/std (e.g. caused by missing values in a column) compares False
# against any threshold, so it has to be flagged explicitly; otherwise the
# check would report success for a column whose statistics are undefined.
non_finite = ~(np.isfinite(means) & np.isfinite(stds))
off_target = (np.abs(means) > 0.05) | (np.abs(stds - 1) > 0.05)
deviations = scaling_check[non_finite | off_target]

if deviations.empty:
    print("Scaling approved. All means ≈ 0 and stds ≈ 1.")
else:
    print("Some features deviate from expected scaling:")
    display(deviations.round(3))
