# Credit Risk Scorecard Development and Validation

This notebook demonstrates the end-to-end process of developing and validating a credit risk scorecard model. The process includes:

1. Data Loading and Inspection
2. Data Cleaning and Transformation
3. Exploratory Data Analysis
4. Feature Engineering and Selection
5. Model Development
6. Model Validation and Assessment

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.feature_selection import RFE
import warnings
# NOTE: this silences every warning for the rest of the session, including
# library deprecation and convergence warnings raised by later cells.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(0)

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old 'seaborn' name, where style.use() raises
# OSError for it. Try the current name first and fall back for older installs.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')

## 1. Data Loading and Inspection

In [None]:
# Load the data
# The workbook path is relative, so it is resolved against the current
# working directory of the kernel.
df = pd.read_excel('Home_Credit_Risk_Business_Analyst_Test.xlsx')

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nDataset Info:")
# info() prints its report itself, so it is not wrapped in print().
df.info()

# Display summary statistics
print("\nSummary Statistics:")
# include='all' covers non-numeric columns too; .T puts one column per row.
# This bare expression must stay last in the cell to be rendered as output.
df.describe(include='all').T

## 2. Data Cleaning and Transformation

In [None]:
def clean_and_transform_data(df):
    """Impute missing values, add derived features and encode text columns.

    The input frame is left untouched; all work happens on a copy. A
    per-column count of missing values is printed before imputation.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw loan data. Must contain the columns ``loan_amnt``,
        ``person_income`` and ``person_age``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with int64/float64 columns median-filled, object
        columns mode-filled, and these columns added: ``debt_to_income``,
        ``age_group``, ``income_group`` and one ``<col>_encoded`` integer
        column per object-dtype column.
    """
    # Create a copy of the dataframe
    df_clean = df.copy()

    # Check for missing values
    print("Missing Values:")
    print(df_clean.isnull().sum())

    # For numeric columns, fill with median.
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: the chained in-place form is deprecated and has no
    # effect on the frame when pandas Copy-on-Write is active.
    numeric_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns
    for col in numeric_cols:
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())

    # For categorical columns, fill with mode.
    categorical_cols = df_clean.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        modes = df_clean[col].mode()
        # An all-missing column has no mode; leave it as-is rather than crash.
        if not modes.empty:
            df_clean[col] = df_clean[col].fillna(modes.iloc[0])

    # Create new features
    # Debt to Income Ratio
    # NOTE: a zero income produces inf here; nothing below repairs it.
    df_clean['debt_to_income'] = df_clean['loan_amnt'] / df_clean['person_income']

    # Age groups. The top bin is open-ended so that ages above 100 land in
    # '55+' instead of silently becoming NaN.
    df_clean['age_group'] = pd.cut(df_clean['person_age'],
                                   bins=[0, 25, 35, 45, 55, np.inf],
                                   labels=['<25', '25-35', '35-45', '45-55', '55+'])

    # Income groups (quartiles)
    df_clean['income_group'] = pd.qcut(df_clean['person_income'], q=4,
                                       labels=['Low', 'Medium-Low', 'Medium-High', 'High'])

    # Encode categorical variables as integer codes in sorted-label order.
    # Only the columns that were object dtype on input are encoded; the
    # age_group / income_group columns created above are not.
    for col in categorical_cols:
        codes, _ = pd.factorize(df_clean[col], sort=True)
        df_clean[f'{col}_encoded'] = codes

    return df_clean

# Clean and transform the data
# The result is bound to the notebook-level name `df_clean`, which the later
# cells (EDA, feature selection, model development) read.
df_clean = clean_and_transform_data(df)
print("\nTransformed Dataset Shape:", df_clean.shape)

## 3. Exploratory Data Analysis

In [None]:
def perform_eda(df):
    """Draw the exploratory charts for the loan data, one figure at a time.

    Figures are shown in this order, each followed by ``plt.show()``:
    share of each ``loan_status`` value, mean ``loan_status`` per
    ``loan_grade``, box plot of ``person_income`` by ``loan_status``,
    correlation heat map of the int64/float64 columns, and mean
    ``loan_status`` per ``person_home_ownership``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named above.

    Returns
    -------
    None
    """
    bar_figsize = (10, 6)

    # --- Share of each loan_status value --------------------------------
    plt.figure(figsize=bar_figsize)
    status_share = df['loan_status'].value_counts(normalize=True)
    axes = status_share.plot(kind='bar')
    axes.set_title('Default Rate Distribution')
    axes.set_xlabel('Loan Status (1 = Default)')
    axes.set_ylabel('Proportion')
    plt.show()

    # --- Mean loan_status per loan grade --------------------------------
    plt.figure(figsize=bar_figsize)
    axes = df.groupby('loan_grade')['loan_status'].mean().plot(kind='bar')
    axes.set_title('Default Rate by Loan Grade')
    axes.set_xlabel('Loan Grade')
    axes.set_ylabel('Default Rate')
    plt.show()

    # --- Income spread split by loan_status -----------------------------
    plt.figure(figsize=(12, 6))
    axes = sns.boxplot(x='loan_status', y='person_income', data=df)
    axes.set_title('Income Distribution by Default Status')
    plt.show()

    # --- Pairwise correlation of the int64/float64 columns --------------
    pairwise_corr = df.select_dtypes(include=['int64', 'float64']).corr()

    plt.figure(figsize=(12, 8))
    axes = sns.heatmap(pairwise_corr, annot=True, cmap='coolwarm', center=0)
    axes.set_title('Correlation Matrix')
    plt.show()

    # --- Mean loan_status per home-ownership category -------------------
    plt.figure(figsize=bar_figsize)
    ownership_rate = df.groupby('person_home_ownership')['loan_status'].mean()
    axes = ownership_rate.plot(kind='bar')
    axes.set_title('Default Rate by Home Ownership')
    axes.set_xlabel('Home Ownership')
    axes.set_ylabel('Default Rate')
    plt.show()

# Perform EDA
# Runs on the cleaned frame, so the correlation heat map also picks up the
# numeric columns added during cleaning.
perform_eda(df_clean)

## 4. Feature Engineering and Selection

In [None]:
def select_features(df, n_features_to_select=10):
    """Pick model inputs with recursive feature elimination (RFE).

    Candidate features are every column whose name ends in ``_encoded`` plus
    every int64/float64 column other than ``person_id`` and ``loan_status``.
    Candidates are standardised, then RFE with a logistic-regression
    estimator keeps ``n_features_to_select`` of them. The chosen names are
    printed as a numbered list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the candidate columns and the ``loan_status`` target.
    n_features_to_select : int, optional
        Number of features RFE should keep. Defaults to 10, the value that
        was previously hard-coded.

    Returns
    -------
    pandas.Index
        Names of the selected feature columns.

    Notes
    -----
    Selection is fitted on every row of ``df``. Pass only training rows if a
    held-out set must stay unseen during feature selection.
    """
    excluded = ('person_id', 'loan_status')

    # Select numerical and encoded categorical columns
    feature_cols = [col for col in df.columns if col.endswith('_encoded') or
                    (df[col].dtype in ['int64', 'float64'] and
                     col not in excluded)]

    X = df[feature_cols]
    y = df['loan_status']

    # Scale the features so no column dominates the fit purely through its
    # units.
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

    # Use Recursive Feature Elimination with Logistic Regression
    model = LogisticRegression(random_state=0)
    rfe = RFE(estimator=model, n_features_to_select=n_features_to_select)
    rfe = rfe.fit(X_scaled, y)

    # Get selected features
    selected_features = X.columns[rfe.support_]

    print("Selected Features:")
    for i, feature in enumerate(selected_features, 1):
        print(f"{i}. {feature}")

    return selected_features

# Select features
# The returned column names are reused by the model-development cell.
selected_features = select_features(df_clean)

## 5. Model Development

In [None]:
def develop_scorecard(df, selected_features):
    """Fit the logistic-regression scorecard and score train/validation/test.

    The data is split 70/20/10, features are standardised with statistics
    from the training split only, and the predicted probability of
    ``loan_status == 1`` is converted to an integer score with
    ``score = 600 - 50 * ln(p / (1 - p))``, so a higher score means a lower
    predicted probability.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``selected_features`` and the ``loan_status`` target.
    selected_features : sequence of str
        Column names to use as model inputs.

    Returns
    -------
    tuple
        ``(model, scaler, (train_scores, val_scores, test_scores),
        (y_train, y_val, y_test))``. ``model`` and ``scaler`` are fitted; the
        scores are integer NumPy arrays aligned with the matching targets.
    """
    # Prepare data
    X = df[selected_features]
    y = df['loan_status']

    # Split data into train, validation, and test sets (70/20/10).
    # 0.22222 of the remaining 90% is the 20% validation share.
    X_train_temp, X_test, y_train_temp, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X_train_temp, y_train_temp, test_size=0.22222, random_state=0)

    # Scale features (fit on train only, then reuse for the other splits)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Train logistic regression model
    model = LogisticRegression(random_state=0)
    model.fit(X_train_scaled, y_train)

    def calculate_scores(probabilities):
        """Convert an array of probabilities to truncated integer scores."""
        # A probability of exactly 0.0 or 1.0 gives infinite log-odds, and
        # converting that to an integer fails. Clip to the nearest values
        # with finite log-odds; every other probability passes unchanged.
        lowest = np.finfo(float).tiny
        highest = np.nextafter(1.0, 0.0)
        clipped = np.clip(probabilities, lowest, highest)
        log_odds = np.log(clipped / (1 - clipped))
        # astype(int) truncates toward zero, matching int() on a float.
        return (600 - log_odds * 50).astype(int)

    # Generate scores for all sets (column 1 is the probability of class 1)
    train_scores = calculate_scores(model.predict_proba(X_train_scaled)[:, 1])
    val_scores = calculate_scores(model.predict_proba(X_val_scaled)[:, 1])
    test_scores = calculate_scores(model.predict_proba(X_test_scaled)[:, 1])

    return model, scaler, (train_scores, val_scores, test_scores), (y_train, y_val, y_test)

# Develop scorecard
# Unpacks the fitted model and scaler plus the per-split scores and targets
# into notebook-level names used by the validation cell.
model, scaler, (train_scores, val_scores, test_scores), (y_train, y_val, y_test) = develop_scorecard(df_clean, selected_features)

## 6. Model Validation and Assessment

In [None]:
def validate_model(train_scores, val_scores, test_scores, y_train, y_val, y_test):
    """Report discrimination and stability of the scorecard on all splits.

    Prints AUC-ROC and the two-sample KS statistic for each split, plots the
    test-set score distribution per class, and prints the Population
    Stability Index (PSI) of validation and test scores against train.

    Parameters
    ----------
    train_scores, val_scores, test_scores : array-like of numbers
        Scores built as ``600 - 50 * ln(p / (1 - p))``, where ``p`` is the
        predicted probability of class 1.
    y_train, y_val, y_test : array-like
        Binary targets (1 = default) aligned by position with the scores.

    Returns
    -------
    None
    """
    def calculate_metrics(scores, y_true):
        # Positional arrays, so a pandas index on y_true cannot interfere
        # with the boolean masks below.
        scores = np.asarray(scores)
        y_true = np.asarray(y_true)

        # Convert scores to probabilities (inverse of score calculation).
        # score = 600 - 50*ln(p/(1-p))  =>  p = 1 / (1 + exp((score-600)/50)).
        # The exponent's sign matters: with (600 - score) the result is
        # 1 - p, which flips the AUC to 1 - AUC.
        probs = 1 / (1 + np.exp((scores - 600) / 50))

        # Calculate AUC-ROC
        auc = roc_auc_score(y_true, probs)

        # Calculate KS statistic between the two classes' score samples
        ks_stat, _ = stats.ks_2samp(scores[y_true == 0], scores[y_true == 1])

        return auc, ks_stat

    # Calculate metrics for all sets
    train_auc, train_ks = calculate_metrics(train_scores, y_train)
    val_auc, val_ks = calculate_metrics(val_scores, y_val)
    test_auc, test_ks = calculate_metrics(test_scores, y_test)

    print("Model Performance Metrics:")
    print(f"{'Dataset':<10} {'AUC-ROC':>10} {'KS-Stat':>10}")
    print("-" * 32)
    print(f"{'Train':<10} {train_auc:>10.3f} {train_ks:>10.3f}")
    print(f"{'Validation':<10} {val_auc:>10.3f} {val_ks:>10.3f}")
    print(f"{'Test':<10} {test_auc:>10.3f} {test_ks:>10.3f}")

    # Plot score distributions
    test_scores_arr = np.asarray(test_scores)
    y_test_arr = np.asarray(y_test)
    plt.figure(figsize=(12, 6))
    plt.hist(test_scores_arr[y_test_arr == 0], bins=50, alpha=0.5, label='Non-Default', density=True)
    plt.hist(test_scores_arr[y_test_arr == 1], bins=50, alpha=0.5, label='Default', density=True)
    plt.title('Score Distribution by Default Status (Test Set)')
    plt.xlabel('Score')
    plt.ylabel('Density')
    plt.legend()
    plt.show()

    # Population Stability Index (PSI)
    def calculate_psi(expected, actual, bins=10):
        # Bin edges are percentiles of the pooled scores, so both samples
        # fall entirely inside the outermost edges.
        breaks = np.percentile(np.concatenate([expected, actual]),
                               np.linspace(0, 100, bins + 1))
        expected_dist = np.histogram(expected, bins=breaks)[0] / len(expected)
        actual_dist = np.histogram(actual, bins=breaks)[0] / len(actual)

        # Avoid division by zero / log(0) for empty bins
        expected_dist = np.where(expected_dist == 0, 0.0001, expected_dist)
        actual_dist = np.where(actual_dist == 0, 0.0001, actual_dist)

        psi = np.sum((actual_dist - expected_dist) * np.log(actual_dist / expected_dist))
        return psi

    # Calculate PSI
    train_val_psi = calculate_psi(train_scores, val_scores)
    train_test_psi = calculate_psi(train_scores, test_scores)

    print("\nPopulation Stability Index (PSI):")
    print(f"Train vs Validation: {train_val_psi:.3f}")
    print(f"Train vs Test: {train_test_psi:.3f}")

# Validate model
# Uses the scores and targets unpacked in the model-development cell; the
# function prints and plots its results and returns nothing.
validate_model(train_scores, val_scores, test_scores, y_train, y_val, y_test)