# Credit Scoring – German Credit Data
Production-ready notebook with WoE, Logistic Regression, Threshold Optimization, and Evaluation

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import scorecardpy as sc

# ------------------------------
# 1️⃣ Load Data
# ------------------------------
# german.data is read without a header row (header=None), so the 21
# column labels are supplied explicitly, in file order.
col_names = [
    "account_status",
    "duration",
    "credit_history",
    "purpose",
    "credit_amount",
    "savings",
    "employment",
    "installment_rate",
    "personal_status",
    "guarantors",
    "residence",
    "property",
    "age",
    "other_installments",
    "housing",
    "credit_cards",
    "job",
    "dependents",
    "phone",
    "foreign_worker",
    "class",
]

# Fields in the file are separated by a single space.
df = pd.read_csv(
    "german.data",
    sep=" ",
    header=None,
    names=col_names,
)
df.head()

In [ ]:
# ------------------------------
# 2️⃣ Data Sanity Checks
# ------------------------------
# Raise explicitly instead of using `assert`: assert statements are removed
# when Python runs with -O, which would silently disable these checks.
positivity_checks = {
    'duration': 'Duration must be positive',
    'credit_amount': 'Credit amount must be positive',
    'age': 'Age must be positive',
}
for column, message in positivity_checks.items():
    # A NaN compares as not > 0, so missing values also fail the check.
    if not (df[column] > 0).all():
        raise ValueError(message)

# Per-column count of missing values.
print(df.isnull().sum())

In [ ]:
# ------------------------------
# 3️⃣ Data Preparation - WoE
# ------------------------------
# The raw target is coded 1/2 and the modelling step treats class == 2 as
# "bad". scorecardpy's default `positive="bad|1"` would instead take the
# value 1 as the event, inverting the WoE signs and bad-rates reported in
# `bins`. Bin against an explicit 0/1 bad flag so the polarity matches.
df_binning = df.copy()
df_binning['class'] = (df_binning['class'] == 2).astype(int)  # 1=Bad, 0=Good
bins = sc.woebin(df_binning, y='class')

# Apply the bins to the original frame so that 'class' keeps its 1/2 coding
# for the downstream split.
df_woe = sc.woebin_ply(df, bins)

# NOTE(review): the bins are fitted on the full data set, i.e. before the
# train/test split, so test rows influence the WoE encoding. Consider
# fitting `bins` on the training rows only.

# Keep WoE columns + target
woe_cols = [col for col in df_woe.columns if '_woe' in col]
data_model = df_woe[woe_cols + ['class']]

# Feature importance (IV)
# NOTE(review): scorecardpy does not appear to expose a `woebin_iv` helper;
# the per-variable IV is taken from the `total_iv` column that woebin
# stores in each variable's binning table — confirm against the installed
# scorecardpy version.
iv = (
    pd.concat(bins.values(), ignore_index=True)[['variable', 'total_iv']]
    .drop_duplicates('variable')
    .rename(columns={'total_iv': 'info_value'})
    .sort_values('info_value', ascending=False)
)
iv[['variable','info_value']]

In [ ]:
# ------------------------------
# 4️⃣ Train/Test Split
# ------------------------------
# Binary target: 1 where the raw class is 2 (bad), 0 otherwise (good).
y = data_model['class'].eq(2).astype(int)
# Everything except the target column is a model feature.
X = data_model.drop(columns='class')

# 70/30 hold-out, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [ ]:
# ------------------------------
# 5️⃣ Logistic Regression Model
# ------------------------------
# Fit on the training features (max_iter=1000); fit() returns the estimator.
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Keep column index 1 of predict_proba as the score for the test rows.
test_probabilities = lr.predict_proba(X_test)
y_pred_proba = test_probabilities[:, 1]

In [ ]:
# ------------------------------
# 6️⃣ Evaluation BEFORE Threshold Optimization
# ------------------------------
# Hard labels from the default 0.5 probability cut-off.
y_pred_default = (y_pred_proba >= 0.5).astype(int)
cm_default = confusion_matrix(y_test, y_pred_default)

# Annotated heatmap of the confusion matrix (rows = actual, cols = predicted).
fig_default, ax_default = plt.subplots(figsize=(5,4))
sns.heatmap(
    cm_default,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Pred Good','Pred Bad'],
    yticklabels=['Actual Good','Actual Bad'],
    ax=ax_default,
)
ax_default.set_title('Confusion Matrix (Threshold=0.5)')
plt.show()

# Per-class precision / recall / F1 summary.
report_default = classification_report(
    y_test, y_pred_default, target_names=['Good','Bad']
)
print(report_default)

# Ranking metrics computed from the probabilities; Gini = 2 * AUC - 1.
roc_auc_default = roc_auc_score(y_test, y_pred_proba)
gini_default = 2 * roc_auc_default - 1
print(f'ROC-AUC: {roc_auc_default:.3f}')
print(f'Gini Coefficient: {gini_default:.3f}')

In [ ]:
# ------------------------------
# 7️⃣ Threshold Optimization (Cost-Sensitive)
# ------------------------------
# Asymmetric misclassification costs used to score each candidate cut-off.
COST_GOOD_AS_BAD = 1  # actual good predicted bad (false positive)
COST_BAD_AS_GOOD = 5  # actual bad predicted good (false negative)

# 101 evenly spaced cut-offs 0.00, 0.01, ..., 1.00. linspace fixes the number
# of points; arange with a float step can gain or lose the end point through
# floating-point rounding.
thresholds = np.linspace(0, 1, 101)

# NOTE(review): the cut-off is tuned on the same test set that is used for
# the final evaluation, so the reported post-optimization metrics are
# optimistic. Consider tuning on a separate validation split.
costs = []
for t in thresholds:
    y_pred_thresh = (y_pred_proba >= t).astype(int)
    # labels=[0, 1] pins the 2x2 layout so cm[0, 1] / cm[1, 0] are always
    # valid, even when a class is absent from both inputs.
    cm = confusion_matrix(y_test, y_pred_thresh, labels=[0, 1])
    total_cost = cm[0, 1] * COST_GOOD_AS_BAD + cm[1, 0] * COST_BAD_AS_GOOD
    costs.append(total_cost)

# First threshold that attains the lowest total cost.
optimal_idx = np.argmin(costs)
optimal_threshold = thresholds[optimal_idx]
min_cost = costs[optimal_idx]

print(f'Optimal Threshold: {optimal_threshold:.2f}')
print(f'Minimum Total Misclassification Cost: {min_cost}')

In [ ]:
# ------------------------------
# 8️⃣ Evaluation AFTER Threshold Optimization
# ------------------------------
# Hard labels from the cost-optimal probability cut-off.
y_pred_opt = (y_pred_proba >= optimal_threshold).astype(int)
cm_opt = confusion_matrix(y_test, y_pred_opt)

plt.figure(figsize=(5,4))
sns.heatmap(cm_opt, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Pred Good','Pred Bad'],
            yticklabels=['Actual Good','Actual Bad'])
plt.title(f'Confusion Matrix (Threshold={optimal_threshold:.2f})')
plt.show()

print(classification_report(y_test, y_pred_opt, target_names=['Good','Bad']))

# ROC-AUC and Gini are computed from the probabilities, not from the
# thresholded labels, so they are unaffected by the choice of cut-off.
roc_auc_opt = roc_auc_score(y_test, y_pred_proba)
gini_opt = 2*roc_auc_opt-1
print(f'ROC-AUC: {roc_auc_opt:.3f}')
print(f'Gini Coefficient: {gini_opt:.3f}')

# KS Statistic
df_ks = pd.DataFrame({'y_true': y_test, 'y_prob': y_pred_proba})
# Sort from highest to lowest score and renumber the rows 0..n-1 so the
# index is the rank position (the original row labels would otherwise be
# used as the x-axis when plotting).
df_ks = df_ks.sort_values('y_prob', ascending=False).reset_index(drop=True)
df_ks['cum_good'] = (df_ks['y_true']==0).cumsum() / (df_ks['y_true']==0).sum()
df_ks['cum_bad']  = (df_ks['y_true']==1).cumsum() / (df_ks['y_true']==1).sum()
# KS = largest gap between the two cumulative distributions.
ks_stat = (df_ks['cum_bad'] - df_ks['cum_good']).abs().max()
print(f'KS Statistic: {ks_stat:.3f}')

# Rows are sorted by descending score, so the first `n_flagged` positions are
# exactly the observations with y_prob >= optimal_threshold.
n_flagged = int((df_ks['y_prob'] >= optimal_threshold).sum())

# KS Curve
plt.figure(figsize=(6,5))
plt.plot(df_ks.index, df_ks['cum_good'], label='Cumulative Good')
plt.plot(df_ks.index, df_ks['cum_bad'], label='Cumulative Bad')
# Mark the cut-off as a rank position on the x-axis; the threshold is a
# probability, not a fraction of the sorted observations.
plt.axvline(x=n_flagged,
            color='red', linestyle='--', label=f'Optimal Threshold={optimal_threshold:.2f}')
plt.title(f'KS Curve (KS={ks_stat:.3f})')
plt.xlabel('Sorted observations by predicted probability')
plt.ylabel('Cumulative distribution')
plt.legend()
plt.show()