# Credit Risk Model Validation

This notebook demonstrates the comprehensive validation process for credit risk models following regulatory guidelines and industry best practices. We'll cover:

1. Loading a previously developed model
2. Performance testing and benchmarking
3. Stability assessment
4. Sensitivity analysis
5. Out-of-time validation
6. Validation report generation

This validation framework is designed to align with regulatory requirements for model risk management in banking.

## Setup

In [None]:
# Import necessary libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
import pickle
from datetime import datetime

# Add the parent directory to path to import local modules
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Import local modules
from src.data_processing.generate_synthetic_data import generate_credit_data
from src.data_processing.preprocess import preprocess_data, create_feature_pipeline
from src.model_development.models import CreditRiskModel
from src.model_validation.validator import ModelValidator, validate_model

# Set plotting style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Load configuration.
# The encoding is pinned to UTF-8 so parsing does not depend on the
# platform's default locale encoding (YAML streams are normally UTF-8).
with open('../config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

## 1. Load Model and Data

First, we'll load the model we developed in the previous notebook, along with our datasets.

In [None]:
# Try to load a previously saved model, or train a new one if not available
model_type = 'gradient_boosting'  # Change this to match your best model from notebook 1
model_path = f'../models/credit_risk_{model_type}.pkl'

try:
    # NOTE(security): the exceptions handled below imply the artifact is a
    # pickle; unpickling executes arbitrary code, so only load model files
    # that come from a trusted location.
    credit_model = CreditRiskModel.load_model(model_path, model_type)
    print(f"Loaded model from {model_path}")
except (FileNotFoundError, pickle.UnpicklingError, EOFError):
    # EOFError is what pickle raises for an empty or truncated file (e.g. an
    # interrupted save); treat it like a missing model and retrain instead of
    # aborting the notebook.
    print(f"Couldn't load model from {model_path}. Training a new model...")
    # Generate data and train a model
    data = generate_credit_data(n_samples=10000, random_seed=42)
    target_variable = config['data']['target_variable']

    # Positional 70/30 split (no shuffling): the first 70% of rows train the
    # model, the remainder is held out.
    train_ratio = 0.7
    train_size = int(len(data) * train_ratio)
    train_data = data.iloc[:train_size]
    test_data = data.iloc[train_size:]

    # Preprocess data
    # NOTE(review): `pipeline` is created here but is not passed to the
    # training call below, so nothing in this cell visibly fits it before it
    # is used on the test split -- confirm how preprocess_data handles this.
    pipeline = create_feature_pipeline(config, target_col=target_variable)
    X_train, y_train = preprocess_data(train_data, config, target_col=target_variable, is_training=True)
    X_test, y_test = preprocess_data(test_data, config, target_col=target_variable, is_training=False, preprocessing_pipeline=pipeline)

    # Train model
    from src.model_development.models import train_model
    model = train_model(X_train, y_train, model_type=model_type)
    credit_model = CreditRiskModel(model_type, model=model)

    # Save the model so later runs take the fast path above
    os.makedirs('../models', exist_ok=True)
    credit_model.save_model(model_path)
    print(f"Model saved to {model_path}")

In [None]:
# Load the three dataset splits, regenerating them when any file is missing
try:
    # All three CSVs must be present; a single missing file triggers the
    # regeneration branch, which reassigns every split.
    train_data, test_data, val_data = [
        pd.read_csv(csv_path)
        for csv_path in (
            '../data/credit_data_train.csv',
            '../data/credit_data_test.csv',
            '../data/credit_data_validation.csv',
        )
    ]
    print("Loaded existing datasets")
except FileNotFoundError:
    # Fall back to freshly generated synthetic data written to ../data
    print("Generating new synthetic datasets")
    from src.data_processing.generate_synthetic_data import split_and_save_data
    data = generate_credit_data(n_samples=10000, random_seed=42)
    train_data, test_data, val_data = split_and_save_data(data, output_dir='../data')

# Report how many rows each split holds
print(f"Training data: {len(train_data)} samples")
print(f"Test data: {len(test_data)} samples")
print(f"Validation data: {len(val_data)} samples")

In [None]:
# Turn the raw splits into model-ready feature matrices and target vectors
target_variable = config['data']['target_variable']
pipeline = create_feature_pipeline(config, target_col=target_variable)

# The training split is processed in training mode, without the pipeline.
# NOTE(review): nothing in this cell visibly fits `pipeline` before it is
# applied to the other splits -- confirm how preprocess_data handles this.
X_train, y_train = preprocess_data(train_data, config, target_col=target_variable, is_training=True)

# Test and validation splits share the same non-training settings
holdout_settings = dict(
    target_col=target_variable,
    is_training=False,
    preprocessing_pipeline=pipeline,
)
X_test, y_test = preprocess_data(test_data, config, **holdout_settings)
X_val, y_val = preprocess_data(val_data, config, **holdout_settings)

## 2. Create Model Validator

Now we'll create a validator object to perform comprehensive validation of our model.

In [None]:
# Initialize model validator.
# The validator receives the `model` attribute of the CreditRiskModel wrapper
# (not the wrapper itself) together with the loaded configuration. The
# validation cells that follow all call methods on this single object.
validator = ModelValidator(credit_model.model, config)

## 3. Performance Testing

With the validator in place, let's start by evaluating the model's performance on the training and test datasets.

In [None]:
# Score the model on the training and test splits
performance_results = validator.performance_testing(X_train, y_train, X_test, y_test)

# Print one block of metrics per dataset, separated by a blank line
print("Performance Testing Results:\n")
for split_name in performance_results:
    print(f"{split_name.upper()} SET METRICS:")
    for metric_name, score in performance_results[split_name].items():
        print(f"  {metric_name}: {score:.4f}")
    print()

In [None]:
# Compute discrimination metrics on the test split
discrimination_results = validator.assess_discrimination(X_test, y_test)

# ROC curve with the chance diagonal as reference
roc_fpr = discrimination_results['fpr']
roc_tpr = discrimination_results['tpr']
roc_label = f"ROC Curve (AUC = {discrimination_results['auc']:.4f})"

plt.figure(figsize=(10, 8))
plt.plot(roc_fpr, roc_tpr, label=roc_label)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

# Summary statistics reported alongside the curve
ks_value = discrimination_results['ks_statistic']
gini_value = discrimination_results['gini']
print(f"KS Statistic: {ks_value:.4f}")
print(f"Gini Coefficient: {gini_value:.4f}")

In [None]:
# Compute calibration results on the test split
calibration_results = validator.assess_calibration(X_test, y_test)

# Predicted vs. observed probabilities, with the identity line as the
# perfect-calibration reference
predicted_probs = calibration_results['mean_predicted_probs']
observed_probs = calibration_results['observed_probs']
curve_label = f"Calibration Curve (Brier Score = {calibration_results['brier_score']:.4f})"

plt.figure(figsize=(10, 8))
plt.plot(predicted_probs, observed_probs, 'o-', label=curve_label)
plt.plot([0, 1], [0, 1], 'k--', label='Perfect Calibration')
plt.xlabel('Mean Predicted Probability')
plt.ylabel('Observed Probability')
plt.title('Calibration Plot')
plt.legend()
plt.grid(True)
plt.show()

# Report the calibration error returned by the validator
calibration_error = calibration_results['calibration_error']
print(f"Expected Calibration Error: {calibration_error:.4f}")

## 4. Benchmark Comparison

Now let's compare our model against simpler benchmark models to ensure it adds sufficient value.

In [None]:
# Run benchmark comparison on the test split
benchmark_results = validator.benchmark_comparison(X_test, y_test)

# Tabulate the 'comparison' entry of the result. The plotting cell that
# follows selects the 'model', 'auc', 'accuracy' and 'f1_score' columns from
# this frame, so the entry is expected to provide them.
benchmark_df = pd.DataFrame(benchmark_results['comparison'])
benchmark_df  # bare expression: rendered as the cell's output

In [None]:
# One bar chart per metric, side by side, comparing all benchmarked models
metrics_to_plot = ['auc', 'accuracy', 'f1_score']
benchmark_data = benchmark_df[['model'] + metrics_to_plot]

fig, axes = plt.subplots(1, len(metrics_to_plot), figsize=(15, 5))

# Pair each subplot with the metric it shows
for ax, metric in zip(axes, metrics_to_plot):
    sns.barplot(x='model', y=metric, data=benchmark_data, ax=ax)
    ax.set_title(f'Comparison of {metric.upper()}')
    ax.set_xlabel('')
    # Slant the model names so longer labels do not overlap
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

plt.tight_layout()
plt.show()

## 5. Stability Testing

Next, we'll assess the stability of the model's performance across different datasets.

In [None]:
# Compare the training and test splits for stability
stability_results = validator.stability_testing(X_train, y_train, X_test, y_test)

# Feature-level PSI as a table, largest PSI first
print("Population Stability Index (PSI) for each feature:")
psi_pairs = stability_results['feature_psi'].items()
psi_df = pd.DataFrame(psi_pairs, columns=['Feature', 'PSI']).sort_values('PSI', ascending=False)
psi_df

In [None]:
# Bar chart of the 15 features with the highest PSI
top_psi = psi_df.head(15)

plt.figure(figsize=(12, 8))
bars = plt.bar(top_psi['Feature'], top_psi['PSI'])

# Traffic-light colouring: below 0.1 green, below 0.2 orange, otherwise red
for bar, psi_value in zip(bars, top_psi['PSI']):
    bar.set_color(
        'green' if psi_value < 0.1
        else 'orange' if psi_value < 0.2
        else 'red'
    )

# Reference lines at the two thresholds used for the colouring
plt.axhline(y=0.1, color='green', linestyle='--', label='Low Shift (PSI=0.1)')
plt.axhline(y=0.2, color='red', linestyle='--', label='High Shift (PSI=0.2)')

plt.title('Population Stability Index (PSI) by Feature')
plt.xlabel('Feature')
plt.ylabel('PSI Value')
plt.xticks(rotation=90)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Print every entry of the performance-stability section, one per line
print("Performance Stability Metrics:")
performance_stability = stability_results['performance_stability']
for stability_metric in performance_stability:
    print(f"{stability_metric}: {performance_stability[stability_metric]:.4f}")

## 6. Sensitivity Analysis

Now we'll analyze how sensitive the model is to changes in key input variables.

In [None]:
# Run sensitivity analysis on the test split
sensitivity_results = validator.sensitivity_analysis(X_test, y_test)

# Tabulate the 'feature_sensitivity' entry. Later cells read the 'feature'
# and 'sensitivity_score' columns from this frame.
sensitivity_df = pd.DataFrame(sensitivity_results['feature_sensitivity'])
# Show the ten highest-scoring features. The sorted result is only displayed;
# sensitivity_df itself keeps its original row order.
sensitivity_df.sort_values('sensitivity_score', ascending=False).head(10)

In [None]:
# Horizontal bar chart of the ten features with the highest sensitivity score
ranked_sensitivity = sensitivity_df.sort_values('sensitivity_score', ascending=False)
top_sensitivity = ranked_sensitivity.head(10)

plt.figure(figsize=(12, 8))
bars = plt.barh(top_sensitivity['feature'], top_sensitivity['sensitivity_score'])

# Traffic-light colouring: below 0.1 green, below 0.3 orange, otherwise red
for bar, score in zip(bars, top_sensitivity['sensitivity_score']):
    bar.set_color(
        'green' if score < 0.1
        else 'orange' if score < 0.3
        else 'red'
    )

plt.title('Feature Sensitivity Analysis')
plt.xlabel('Sensitivity Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

In [None]:
# Detailed plot for most sensitive feature.
# idxmax() returns an index *label*, so the row must be looked up with .loc;
# positional .iloc only matches while the frame has a default RangeIndex and
# would silently pick the wrong row (or fail) after any re-indexing.
top_row_label = sensitivity_df['sensitivity_score'].idxmax()
most_sensitive_feature = sensitivity_df.loc[top_row_label, 'feature']
feature_detail = sensitivity_results['detailed_sensitivity'][most_sensitive_feature]

# Average predicted probability at each perturbation factor for that feature
plt.figure(figsize=(10, 6))
plt.plot(feature_detail['perturbation_factors'], feature_detail['predictions'], marker='o')
plt.title(f'Sensitivity Detail for {most_sensitive_feature}')
plt.xlabel('Perturbation Factor')
plt.ylabel('Average Predicted Probability')
plt.grid(True)
plt.show()

## 7. Validate Out-of-Time Performance

To simulate out-of-time validation, we'll create a dataset with time-based drift and evaluate performance.

In [None]:
# Generate synthetic data with drift to simulate future data
def generate_future_data(n_samples=2000, base_data=None, drift_factor=1.2,
                         random_seed=100, target_col=None):
    """Generate a drifted dataset that stands in for future (out-of-time) data.

    Args:
        n_samples: Number of rows to return.
        base_data: DataFrame to resample and drift. When None, a fresh
            synthetic dataset is generated instead and ``drift_factor`` is
            not applied.
        drift_factor: Multiplier applied to ``income``; ``debt_to_income``
            is multiplied by ``drift_factor * 0.9``. Columns that are absent
            are skipped.
        random_seed: Seed for the resampling and the noise, so repeated runs
            give the same data. Pass None for a non-reproducible draw when
            ``base_data`` is given; without ``base_data`` the value is
            forwarded unchanged to ``generate_credit_data``.
        target_col: Column excluded from the random noise. Defaults to
            ``config['data']['target_variable']``.

    Returns:
        A new DataFrame with ``n_samples`` rows; ``base_data`` is not modified.
    """
    if base_data is None:
        return generate_credit_data(n_samples=n_samples, random_seed=random_seed)

    if target_col is None:
        target_col = config['data']['target_variable']

    # Bootstrap-resample the base population; sample() returns a new frame,
    # so the caller's data is left untouched.
    future_data = base_data.sample(
        n_samples, replace=True, random_state=random_seed
    ).reset_index(drop=True)

    # Systematic drift on specific features
    if 'income' in future_data.columns:
        future_data['income'] = future_data['income'] * drift_factor

    if 'debt_to_income' in future_data.columns:
        future_data['debt_to_income'] = future_data['debt_to_income'] * drift_factor * 0.9

    # Multiplicative noise (mean 1, std 0.1) on every numeric feature except
    # the target. 'number' covers all numeric dtypes, not just the 64-bit
    # ones; integer columns become float as a result.
    numeric_features = [
        col for col in future_data.select_dtypes(include='number').columns
        if col != target_col
    ]
    rng = np.random.default_rng(random_seed)
    for col in numeric_features:
        noise = rng.normal(1, 0.1, size=len(future_data))
        future_data[col] = future_data[col] * noise

    return future_data

# Generate future data.
# The base population is rebuilt from the three splits, which are always in
# scope at this point. The raw `data` frame only exists when an earlier cell
# had to regenerate the model or the datasets, so referencing it here raised
# a NameError whenever both were loaded from disk.
base_population = pd.concat([train_data, test_data, val_data], ignore_index=True)
future_data = generate_future_data(n_samples=2000, base_data=base_population, drift_factor=1.2)

# Preprocess future data with the same non-training settings used for the
# test and validation splits
X_future, y_future = preprocess_data(
    future_data,
    config,
    target_col=target_variable,
    is_training=False,
    preprocessing_pipeline=pipeline
)

In [None]:
# Score the model on the simulated future data
future_metrics = credit_model.evaluate(X_future, y_future)
print("Future Data Performance:")
for metric_name, future_score in future_metrics.items():
    print(f"{metric_name}: {future_score:.4f}")

# Score the test split the same way and report the per-metric difference
test_metrics = credit_model.evaluate(X_test, y_test)
print("\nPerformance Delta (Future - Test):")
for metric_name, future_score in future_metrics.items():
    print(f"{metric_name}: {future_score - test_metrics[metric_name]:.4f}")

In [None]:
# PSI between the model's score distributions on test and future data
from src.model_validation.validator import calculate_psi

# NOTE(review): pd.Series() below needs one-dimensional input -- confirm that
# CreditRiskModel.predict_proba returns a single score per row.
test_probs = credit_model.predict_proba(X_test)
future_probs = credit_model.predict_proba(X_future)

test_scores = pd.Series(test_probs)
future_scores = pd.Series(future_probs)
score_psi = calculate_psi(test_scores, future_scores)
print(f"PSI for score distributions: {score_psi:.4f}")

# Map the PSI onto the thresholds used in this notebook (0.1 and 0.2)
interpretation = (
    "No significant shift in score distribution" if score_psi < 0.1
    else "Moderate shift in score distribution" if score_psi < 0.2
    else "Significant shift in score distribution - model may need to be retrained"
)

print(f"Interpretation: {interpretation}")

In [None]:
# Overlay the test and future score histograms on one figure
plt.figure(figsize=(12, 6))

# Both histograms share binning and transparency so they can be compared
shared_hist_style = dict(bins=20, alpha=0.5)
plt.hist(test_probs, label='Test Data Scores', **shared_hist_style)
plt.hist(future_probs, label='Future Data Scores', **shared_hist_style)

plt.title(f'Score Distribution Comparison (PSI = {score_psi:.4f})')
plt.xlabel('Predicted Probability')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## 8. Generate Validation Report

Finally, let's generate a comprehensive validation report.

In [None]:
# Run the full validation over the train, test and validation splits.
# NOTE(review): `validation_results` is not used afterwards; the report and
# plots below come from the `validator` object created earlier -- confirm
# that this is the intended source.
validation_results = validate_model(
    credit_model.model,
    X_train,
    y_train,
    X_test,
    y_test,
    X_val=X_val,
    y_val=y_val,
    config=config,
)

# Make sure the output directory exists
validation_dir = '../reports/validation'
os.makedirs(validation_dir, exist_ok=True)

# Write the report to a file stamped with today's date (YYYYMMDD)
report_filename = f'validation_report_{datetime.now().strftime("%Y%m%d")}.md'
report_path = os.path.join(validation_dir, report_filename)
validator.generate_report(report_path)
print(f"Validation report generated at: {report_path}")

# Save the accompanying figures to the same directory
validator.plot_validation_results(validation_dir)
print(f"Validation visualizations saved to: {validation_dir}")

## 9. Summary

In this notebook, we've conducted a comprehensive validation of our credit risk model following regulatory guidelines:

1. We assessed the model's performance using appropriate discrimination and calibration metrics
2. We compared the model against simpler benchmarks to ensure it adds value
3. We evaluated the model's stability across different datasets
4. We performed sensitivity analysis to understand how the model responds to changes in input variables
5. We simulated out-of-time validation to assess the model's robustness to data drift
6. We generated a comprehensive validation report

This validation framework helps ensure that our model is accurate, stable, and robust, meeting the requirements for model risk management in a banking environment.