# Credit Risk Model Development

This notebook demonstrates the end-to-end process of developing a credit risk model using the RiskModel framework. We'll cover:

1. Generating synthetic credit data
2. Preprocessing and feature engineering
3. Model training and hyperparameter tuning
4. Model evaluation
5. Saving the trained model

This workflow follows industry best practices for developing credit risk models in a banking environment.

## Setup

In [None]:
# Import necessary libraries
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import yaml

# Make the notebook's parent directory importable so the local `src`
# package can be resolved by the imports that follow.
module_path = os.path.abspath('..')
if not (module_path in sys.path):
    sys.path.append(module_path)

# Import local modules
from src.data_processing.generate_synthetic_data import generate_credit_data, split_and_save_data
from src.data_processing.preprocess import preprocess_data, create_feature_pipeline
from src.model_development.models import train_model, compare_models, CreditRiskModel

# Plot defaults for the figures below: seaborn 'whitegrid' style and a
# 12x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Load configuration
# The file is decoded explicitly as UTF-8 so the settings are read the same
# way regardless of the platform's default locale encoding.
with open('../config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

## 1. Data Generation

For this demonstration, we'll generate synthetic credit data that mimics real-world loan applications and default patterns. In a real-world scenario, you would use historical customer data from your bank's systems.

In [None]:
# Request 10,000 synthetic samples with a fixed seed for reproducibility
print("Generating synthetic credit data...")
data = generate_credit_data(
    n_samples=10_000,
    random_seed=42,
)

# Preview the first rows
data.head()

In [None]:
# First look at the dataset: size, class balance, column types, missingness
print("Data shape:", data.shape)

# Share of each value of the target column
print("\nDefault rate:")
class_balance = data['default_flag'].value_counts(normalize=True)
print(class_balance)

# Column dtypes
print("\nData types:")
column_types = data.dtypes
print(column_types)

# Number of missing entries per column
print("\nMissing values:")
missing_per_column = data.isna().sum()
print(missing_per_column)

In [None]:
# Basic EDA - Numerical features distributions
# Numeric predictors only: the target is dropped from the list because it is
# used as the hue of every histogram.
numerical_cols = [
    col
    for col in data.select_dtypes(include=['float64', 'int64']).columns
    if col != 'default_flag'
]

# A 3x3 grid shows at most nine features; any further columns are not plotted.
fig, axes = plt.subplots(3, 3, figsize=(18, 12))
axes = axes.flatten()
plotted_cols = numerical_cols[:len(axes)]

# Pair each panel with its column instead of indexing by position.
for ax, col in zip(axes, plotted_cols):
    sns.histplot(data=data, x=col, hue='default_flag', kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')

# Hide leftover panels so a dataset with fewer than nine numeric features
# does not render empty frames.
for ax in axes[len(plotted_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Basic EDA - Categorical features
categorical_cols = data.select_dtypes(include=['object', 'category']).columns

if len(categorical_cols) == 0:
    # plt.subplots() rejects a grid with zero rows, so skip plotting entirely.
    print("No categorical features to plot.")
else:
    # squeeze=False always returns a 2-D array of axes, so the single-column
    # case needs no special handling.
    fig, axes = plt.subplots(
        len(categorical_cols),
        1,
        figsize=(12, 4 * len(categorical_cols)),
        squeeze=False,
    )

    for ax, col in zip(axes.ravel(), categorical_cols):
        # Row-normalised cross-tabulation: for each category, the share of
        # records falling under each default_flag value.
        ct = pd.crosstab(data[col], data['default_flag'], normalize='index')
        ct.plot(kind='bar', stacked=True, ax=ax)
        ax.set_title(f'Default Rate by {col}')
        ax.set_ylabel('Proportion')
        ax.set_xlabel(col)

    plt.tight_layout()
    plt.show()

In [None]:
# Pairwise correlations between the float64/int64 columns
corr_matrix = data.select_dtypes(include=['float64', 'int64']).corr()

# Boolean mask covering the diagonal and upper triangle, so each pair of
# columns is drawn only once in the heatmap.
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool))

plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    mask=upper_triangle,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
)
plt.title('Correlation Matrix of Numeric Features')
plt.show()

## 2. Data Preprocessing

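Now we'll split the data and preprocess it using our framework's standardized techniques. A single preprocessing pipeline is created and then applied to the training, test, and validation splits, with `is_training=True` only for the training split: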

In [None]:
# Name of the target column, taken from the project configuration
target_variable = config['data']['target_variable']

# Partition the dataset into train / test / validation frames; the output
# directory, test size and validation size are passed to the helper.
train_data, test_data, val_data = split_and_save_data(
    data,
    output_dir='../data',
    test_size=0.2,
    validation_size=0.1,
)

# Report how many rows ended up in each split
print(f"Training data: {len(train_data)} samples")
print(f"Test data: {len(test_data)} samples")
print(f"Validation data: {len(val_data)} samples")

In [None]:
# Build the preprocessing pipeline once; the same object is handed to every
# preprocess_data call below.
preprocessing_pipeline = create_feature_pipeline(config, target_col=target_variable)

# Keyword arguments common to all three splits
_prep_kwargs = {
    'target_col': target_variable,
    'preprocessing_pipeline': preprocessing_pipeline,
}

# Training split: the only call made with is_training=True
X_train, y_train = preprocess_data(train_data, config, is_training=True, **_prep_kwargs)

# Test split
X_test, y_test = preprocess_data(test_data, config, is_training=False, **_prep_kwargs)

# Validation split
X_val, y_val = preprocess_data(val_data, config, is_training=False, **_prep_kwargs)

# Preview the preprocessed training features
X_train.head()

## 3. Model Development

Now we'll train and compare different model types to find the most suitable one for our credit risk assessment task.

In [None]:
# Candidate model families to compare
model_types = [
    'logistic_regression',
    'random_forest',
    'gradient_boosting',
]

# Run the framework's comparison on the training and test splits
model_results = compare_models(
    X_train,
    y_train,
    X_test,
    y_test,
    model_types=model_types,
)

# Show the comparison table
model_results

In [None]:
# Pick the model type with the highest value in the 'roc_auc' column.
# idxmax() returns an index *label*, so the row is looked up with .loc;
# positional .iloc would only agree when the index is the default 0..n-1 range.
best_row_label = model_results['roc_auc'].idxmax()
best_model_type = model_results.loc[best_row_label, 'model_type']
print(f"Best model type: {best_model_type}")

# Train the selected model type again, this time with tune=True
tuned_model = train_model(X_train, y_train, model_type=best_model_type, tune=True)

# Wrap the trained model in the framework's CreditRiskModel class
credit_model = CreditRiskModel(best_model_type, model=tuned_model)

## 4. Model Evaluation

Let's evaluate our model's performance using various metrics and visualizations:

In [None]:
# Score the model on the test split and list each metric to four decimals
test_metrics = credit_model.evaluate(X_test, y_test)

print("Test set metrics:")
for metric_name, metric_value in test_metrics.items():
    print(f"{metric_name}: {metric_value:.4f}")

In [None]:
# ROC curve for the test split, drawn on a fresh 10x6 figure
ax = plt.figure(figsize=(10, 6)).gca()
credit_model.plot_roc_curve(X_test, y_test, ax=ax)
plt.title('ROC Curve on Test Data')
plt.show()

In [None]:
# Precision-recall curve for the test split, drawn on a fresh 10x6 figure
ax = plt.figure(figsize=(10, 6)).gca()
credit_model.plot_precision_recall_curve(X_test, y_test, ax=ax)
plt.title('Precision-Recall Curve on Test Data')
plt.show()

In [None]:
# Draw the importance chart only when the wrapped estimator exposes
# feature_importances_ or the selected type is logistic regression.
has_importances = hasattr(credit_model.model, 'feature_importances_')
is_logistic = best_model_type == 'logistic_regression'

if has_importances or is_logistic:
    ax = plt.figure(figsize=(12, 8)).gca()
    credit_model.plot_feature_importance(top_n=20, ax=ax)
    plt.title('Top 20 Feature Importances')
    plt.show()

## 5. Save the Model

Finally, we'll save our trained model for later use in validation and monitoring.

In [None]:
# Ensure the output directory is present (no error if it already exists)
os.makedirs('../models', exist_ok=True)

# The file name embeds the selected model type
model_path = f'../models/credit_risk_{best_model_type}.pkl'

# Persist the model through the framework's save method and report the path
credit_model.save_model(model_path)
print(f"Model saved to {model_path}")

## 6. Summary

In this notebook, we've demonstrated the process of developing a credit risk model following industry best practices:

1. We generated and explored synthetic credit data
2. We prepared the data using standardized preprocessing techniques
3. We trained multiple model types and selected the best performer
4. We evaluated the model using appropriate metrics for credit risk assessment
5. We saved the model for future use

In the next notebook, we'll cover the model validation process, including performance testing, stability assessment, and sensitivity analysis.