# Credit Risk Modelling
**Author:** Mohsin Iqbal

Predict default risk from tabular credit data.

> Replace `DATA_PATH` with a real dataset path when ready. This notebook will run out of the box on the included synthetic sample.

In [None]:
import os

# Input dataset; points at the bundled sample until a real dataset path is supplied.
DATA_PATH = 'data/german_credit_sample.csv'
# Folder that receives the saved plots and the importance table.
SAVE_DIR = 'assets'
# File the final metrics dictionary is written to.
RESULTS_JSON = 'results.json'

# Create the output folder up front; exist_ok makes re-running this cell harmless.
os.makedirs(name=SAVE_DIR, exist_ok=True)

In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score, precision_recall_curve, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Load the dataset and take a first look at it: the leading rows, then the
# share of each value in the 'default' column (normalize=True yields
# proportions rather than raw counts).
df = pd.read_csv(DATA_PATH)
print(df.head())
print(df['default'].value_counts(normalize=True))

## Preprocessing
- Separate features/target
- Identify numeric/categorical columns
- Build ColumnTransformer
- Train/test split

In [None]:
# Separate the label column from the feature matrix.
target = 'default'
X = df.drop(columns=[target])
y = df[target]

# Route every feature to exactly one branch of the ColumnTransformer.
# ColumnTransformer drops any column that is not listed (remainder='drop' is
# its default), so selecting only int64/float64/object would silently discard
# features stored as e.g. int32, bool or category. Partitioning on
# "numeric vs. everything else" guarantees no feature is lost.
num_cols = X.select_dtypes(include='number').columns.tolist()
cat_cols = X.select_dtypes(exclude='number').columns.tolist()

# Numeric features are standardised; all remaining features are one-hot
# encoded. handle_unknown='ignore' keeps transform() from failing on
# categories that were not present when the encoder was fitted.
numeric_tf = Pipeline(steps=[('scaler', StandardScaler())])
categorical_tf = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_tf, num_cols),
        ('cat', categorical_tf, cat_cols)
    ]
)

# Hold out 25% of the rows; stratify=y keeps the class proportions the same in
# both parts and random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
len(X_train), len(X_test)

## Baseline: Logistic Regression

In [None]:
# Baseline model: the preprocessing step followed by a class-weighted
# logistic regression, fitted on the training split.
logit = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', LogisticRegression(class_weight='balanced', max_iter=1000, n_jobs=None)),
])

# fit() returns the pipeline itself, so scoring can be chained onto it.
# Column 1 of predict_proba is kept as the score for each test row.
proba_lr = logit.fit(X_train, y_train).predict_proba(X_test)[:, 1]

# Hard labels: 1 where the score reaches 0.5, otherwise 0.
preds_lr = np.asarray(proba_lr >= 0.5).astype(int)

# Threshold-free metrics are computed from the scores, the report from the labels.
auc_lr = roc_auc_score(y_test, proba_lr)
pr_auc_lr = average_precision_score(y_test, proba_lr)
print('Logistic Regression AUC:', round(auc_lr, 3), 'PR-AUC:', round(pr_auc_lr, 3))
print(classification_report(y_test, preds_lr))

## Random Forest (with class weight)

In [None]:
# Second model: the same preprocessing step feeding a 400-tree random forest
# with per-bootstrap class weighting, using all available cores.
rf = Pipeline(steps=[
    ('prep', preprocess),
    (
        'clf',
        RandomForestClassifier(
            n_estimators=400,
            class_weight='balanced_subsample',
            random_state=42,
            n_jobs=-1,
        ),
    ),
])

# fit() returns the pipeline itself, so scoring can be chained onto it.
# Column 1 of predict_proba is kept as the score for each test row.
proba_rf = rf.fit(X_train, y_train).predict_proba(X_test)[:, 1]

# Hard labels: 1 where the score reaches 0.5, otherwise 0.
preds_rf = np.asarray(proba_rf >= 0.5).astype(int)

# Threshold-free metrics are computed from the scores, the report from the labels.
auc_rf = roc_auc_score(y_test, proba_rf)
pr_auc_rf = average_precision_score(y_test, proba_rf)
print('RandomForest AUC:', round(auc_rf, 3), 'PR-AUC:', round(pr_auc_rf, 3))
print(classification_report(y_test, preds_rf))

## Curves & Feature Importance

In [None]:
def plot_roc_pr(y_true, proba, label_prefix):
    """Draw, save and display the ROC and precision-recall curves for one model.

    Args:
        y_true: True labels, passed straight to the scikit-learn metric functions.
        proba: Predicted scores for the same rows as ``y_true``.
        label_prefix: Text placed at the start of both plot titles and of both
            output file names (``<label_prefix>_roc.png`` and
            ``<label_prefix>_pr.png``, written inside ``SAVE_DIR``).
    """
    false_pos, true_pos, _ = roc_curve(y_true, proba)
    avg_precision = average_precision_score(y_true, proba)
    precision, recall, _ = precision_recall_curve(y_true, proba)

    # ROC curve, with the dashed diagonal as a visual reference line.
    roc_fig, roc_ax = plt.subplots()
    roc_ax.plot(false_pos, true_pos)
    roc_ax.plot([0, 1], [0, 1], '--')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title(f'{label_prefix} ROC Curve (AUC={roc_auc_score(y_true, proba):.3f})')
    roc_path = os.path.join(SAVE_DIR, f'{label_prefix}_roc.png')
    roc_fig.savefig(roc_path, bbox_inches='tight')
    plt.show()

    # Precision-recall curve, titled with the average precision.
    pr_fig, pr_ax = plt.subplots()
    pr_ax.plot(recall, precision)
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title(f'{label_prefix} Precision-Recall (AP={avg_precision:.3f})')
    pr_path = os.path.join(SAVE_DIR, f'{label_prefix}_pr.png')
    pr_fig.savefig(pr_path, bbox_inches='tight')
    plt.show()


# One pair of plots per fitted model.
plot_roc_pr(y_test, proba_lr, 'logit')
plot_roc_pr(y_test, proba_rf, 'rf')

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted random-forest pipeline on the test split.
# The whole pipeline is passed as the estimator, so the columns being shuffled
# are the raw input columns of X_test, before scaling and one-hot encoding.
# Each column is shuffled n_repeats times and the mean score drop is kept.
result = permutation_importance(rf, X_test, y_test, n_repeats=5, random_state=42, n_jobs=-1)
importances = result.importances_mean

# importances has one entry per column of X_test, in X_test's column order, so
# the labels must come from X_test itself. Using the one-hot-expanded names of
# the preprocessing step would give a different length and order.
feature_names = X_test.columns.to_numpy()

# Keep the 20 most important features, persist them, and preview the top rows.
imp_df = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values('importance', ascending=False)
    .head(20)
)
imp_df.to_csv(os.path.join(SAVE_DIR, 'top20_importances.csv'), index=False)
imp_df.head()

## Save Results

In [None]:
# json is not imported anywhere else in the notebook, so bring it in here;
# without it json.dump below raises NameError.
import json

# Collect the test-split metrics of both models. float() converts the metric
# values to plain Python floats so they serialise cleanly to JSON.
results = {
    'auc': {'logit': float(auc_lr), 'rf': float(auc_rf)},
    'pr_auc': {'logit': float(pr_auc_lr), 'rf': float(pr_auc_rf)}
}

# Write the summary to disk (explicit encoding keeps the output platform
# independent) and echo it in the cell output.
with open(RESULTS_JSON, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)
print(results)