# Phase 10: Testing & Validation

## AI-Powered Mortgage Underwriting Assistant

**Objective**: Comprehensive validation of the final model against success criteria, system integration testing, and error analysis.

### Tasks:
- **Task 10.1**: Model Validation - Final test set evaluation against success criteria
- **Task 10.2**: System Testing - End-to-end API/Dashboard integration and performance testing
- **Task 10.3**: Error Analysis - False positive/negative analysis and model limitations

### Success Criteria:
- AUC-ROC ≥ 0.75
- Precision ≥ 0.80
- Recall ≥ 0.70
- API Response Time < 500ms
- Fairness: Demographic Parity Ratio ≥ 0.80

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import json
import time
import requests
from pathlib import Path
from datetime import datetime

# Scikit-learn metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve,
    confusion_matrix, classification_report,
    average_precision_score, brier_score_loss
)

# Fairness
from fairlearn.metrics import (
    demographic_parity_ratio,
    equalized_odds_ratio,
    MetricFrame
)

# Set paths
# When the notebook runs from inside notebooks/, the project root is one level
# up; otherwise the current working directory is taken to be the root.
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
DATA_DIR = PROJECT_ROOT / 'data' / 'processed'
MODELS_DIR = PROJECT_ROOT / 'models'
RESULTS_DIR = PROJECT_ROOT / 'results'

# Later cells save figures and the JSON summary into RESULTS_DIR; create it
# here so those writes do not fail when the folder does not exist yet.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Plotting style: matplotlib theme first, then the seaborn palette on top of it
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

# Echo the resolved locations and the run time into the notebook log
for label, location in (("Project Root", PROJECT_ROOT), ("Data Directory", DATA_DIR)):
    print(f"{label}: {location}")
started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"Timestamp: {started_at}")

Project Root: /Users/josiahgordor/Desktop/DSPortfolio/Projects/loan_approval
Data Directory: /Users/josiahgordor/Desktop/DSPortfolio/Projects/loan_approval/data/processed
Timestamp: 2026-02-17 13:38:58


---
## Task 10.1: Model Validation

### 10.1.1 Load Held-Out Test Set and Fair Models

In [None]:
# Load test data
test_df = pd.read_csv(DATA_DIR / 'test.csv')
print(f"Test set size: {len(test_df):,} samples")

# Identify target and features
# Try the known target names in priority order before falling back to a
# keyword search over the column names.
target_col = next(
    (name for name in ('loan_approved', 'action_taken_binary') if name in test_df.columns),
    None,
)
if target_col is None:
    # Find target column
    name_hints = ('target', 'approved', 'action')
    potential_targets = [c for c in test_df.columns if any(hint in c.lower() for hint in name_hints)]
    print(f"Potential target columns: {potential_targets}")
    # Last resort: treat the final column as the label
    target_col = potential_targets[0] if potential_targets else test_df.columns[-1]

print(f"Target column: {target_col}")
print(f"Target distribution:\n{test_df[target_col].value_counts(normalize=True)}")

In [None]:
# Load fair representation components
# Artifacts are read from two sub-folders of MODELS_DIR: the preprocessing
# pieces (scaler, encoder, metadata) and the classifiers loaded in a later cell.
fair_rep_dir = MODELS_DIR / 'fair_representation'
fair_models_dir = MODELS_DIR / 'fair_models'

# Load scaler
scaler = joblib.load(fair_rep_dir / 'fair_scaler.pkl')
print("‚úÖ Scaler loaded")

# Load encoder
# Imported in this cell rather than with the other imports at the top.
from tensorflow import keras
try:
    encoder = keras.models.load_model(fair_rep_dir / 'fair_encoder.keras')
    print("‚úÖ Encoder loaded")
except Exception as e:
    # A failed load is reported and the notebook continues with encoder = None;
    # the transform cell then feeds the scaled features to the models directly.
    # NOTE(review): presumably the classifiers were fitted on encoder output, so
    # metrics obtained without the encoder may not be meaningful — confirm.
    print(f"‚ö†Ô∏è Encoder load error: {e}")
    encoder = None

# Load metadata
with open(fair_rep_dir / 'fair_representation_metadata.json', 'r') as f:
    metadata = json.load(f)

# Keep at most input_dim selected features; both keys fall back to a default
# (empty list / 32) when missing from the metadata file.
selected_features = metadata.get('selected_features', [])[:metadata.get('input_dim', 32)]
input_dim = metadata.get('input_dim', 32)
print(f"Input dimension: {input_dim}")
print(f"Selected features: {len(selected_features)}")

In [None]:
# Load all fair models for comparison
fair_models = {}

# Display name -> pickle file expected inside fair_models_dir
model_files = {
    'XGB_Fair': 'xgb_fair.pkl',
    'RF_Fair': 'rf_fair.pkl',
    'LR_Fair': 'lr_fair.pkl',
    'GLM_Fair': 'glm_fair.pkl'
}

for name, filename in model_files.items():
    filepath = fair_models_dir / filename
    if not filepath.exists():
        # Missing artifacts are reported and skipped, not treated as fatal
        print(f"‚ö†Ô∏è {name} not found")
        continue
    fair_models[name] = joblib.load(filepath)
    print(f"‚úÖ Loaded {name}")

print(f"\nTotal models loaded: {len(fair_models)}")

In [None]:
# Prepare test features
# Get feature columns (present in test_df and in selected_features), keeping
# the order recorded in the metadata.
available_features = [f for f in selected_features if f in test_df.columns][:input_dim]

# Report selected features the test set lacks instead of hiding the gap: any
# substitution below changes what the scaler and models actually receive.
missing_features = [f for f in selected_features if f not in test_df.columns]
if missing_features:
    print(f"WARNING: {len(missing_features)} selected feature(s) missing from test set: {missing_features}")

# If not enough features, add more from test_df
if len(available_features) < input_dim:
    n_selected = len(available_features)
    numeric_cols = test_df.select_dtypes(include=[np.number]).columns.tolist()
    for col in numeric_cols:
        if col not in available_features and col != target_col:
            available_features.append(col)
        if len(available_features) >= input_dim:
            break

    padded_features = available_features[n_selected:]
    if padded_features:
        print(f"WARNING: padded with {len(padded_features)} substitute numeric column(s): {padded_features}")
        print("         Metrics computed on substituted features may not be meaningful.")
    if len(available_features) < input_dim:
        print(f"WARNING: only {len(available_features)} of {input_dim} expected features are available")

available_features = available_features[:input_dim]
print(f"Using {len(available_features)} features for testing")

# Prepare X and y
# Missing values are replaced with 0 before scaling.
X_test = test_df[available_features].fillna(0).values
y_test = test_df[target_col].values

print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

In [None]:
# Transform through fair representation pipeline
X_test_scaled = scaler.transform(X_test)

# Without an encoder the scaled features are used as the model input as-is
if encoder is None:
    X_test_latent = X_test_scaled
    print("Using scaled features (no encoder)")
else:
    X_test_latent = encoder.predict(X_test_scaled, verbose=0)
    print(f"X_test_latent shape: {X_test_latent.shape}")

### 10.1.2 Final Evaluation Against Success Criteria

In [None]:
# Define success criteria
# Minimum acceptable value per metric; each is checked with ">=" further down.
SUCCESS_CRITERIA = {
    'AUC-ROC': 0.75,
    'Precision': 0.80,
    'Recall': 0.70,
    'F1-Score': 0.70  # Derived from precision/recall requirements
}

print("Success Criteria:")
print("\n".join(f"  {name}: ‚â• {minimum}" for name, minimum in SUCCESS_CRITERIA.items()))

In [None]:
# Evaluate all models against success criteria
# Builds one row per loaded model: metric values plus pass/fail markers.
results = []

for model_name, model in fair_models.items():
    # Get predictions
    # Hard labels feed precision/recall/F1/accuracy; column 1 of predict_proba
    # is the score used for AUC.
    # NOTE(review): assumes every loaded model exposes predict_proba — confirm
    # for GLM_Fair.
    y_pred = model.predict(X_test_latent)
    y_prob = model.predict_proba(X_test_latent)[:, 1]
    
    # Calculate metrics
    # zero_division=0 makes an undefined precision/recall/F1 count as 0.
    auc = roc_auc_score(y_test, y_prob)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    accuracy = accuracy_score(y_test, y_pred)
    
    # Check against criteria
    # Only AUC, precision and recall gate the verdict; the F1-Score entry of
    # SUCCESS_CRITERIA is not checked here.
    auc_pass = auc >= SUCCESS_CRITERIA['AUC-ROC']
    precision_pass = precision >= SUCCESS_CRITERIA['Precision']
    recall_pass = recall >= SUCCESS_CRITERIA['Recall']
    all_pass = auc_pass and precision_pass and recall_pass
    
    results.append({
        'Model': model_name,
        'AUC-ROC': auc,
        'AUC_Pass': '‚úÖ' if auc_pass else '‚ùå',
        'Precision': precision,
        'Prec_Pass': '‚úÖ' if precision_pass else '‚ùå',
        'Recall': recall,
        'Rec_Pass': '‚úÖ' if recall_pass else '‚ùå',
        'F1-Score': f1,
        'Accuracy': accuracy,
        'All_Criteria_Met': '‚úÖ PASS' if all_pass else '‚ùå FAIL'
    })

results_df = pd.DataFrame(results)
print("\n" + "="*80)
print("FINAL MODEL EVALUATION AGAINST SUCCESS CRITERIA")
print("="*80)
# Numeric metric columns are shown with four decimals; marker columns as-is.
display(results_df.style.format({
    'AUC-ROC': '{:.4f}',
    'Precision': '{:.4f}',
    'Recall': '{:.4f}',
    'F1-Score': '{:.4f}',
    'Accuracy': '{:.4f}'
}))

In [None]:
# Visualize model comparison
# One horizontal-bar panel per gated metric, with the criterion drawn as a
# dashed vertical line.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

metrics_to_plot = ['AUC-ROC', 'Precision', 'Recall']
# Same order as metrics_to_plot so zip() pairs each metric with its criterion.
# NOTE: the name `thresholds` is reassigned in the threshold-analysis cell.
thresholds = [SUCCESS_CRITERIA['AUC-ROC'], SUCCESS_CRITERIA['Precision'], SUCCESS_CRITERIA['Recall']]

for ax, metric, threshold in zip(axes, metrics_to_plot, thresholds):
    values = results_df[metric].values
    # Green bars meet the criterion, red bars fall short
    colors = ['green' if v >= threshold else 'red' for v in values]
    
    bars = ax.barh(results_df['Model'], values, color=colors, alpha=0.7)
    ax.axvline(x=threshold, color='black', linestyle='--', linewidth=2, label=f'Threshold ({threshold})')
    ax.set_xlabel(metric)
    ax.set_title(f'{metric} vs Success Criteria')
    ax.legend()
    ax.set_xlim(0, 1)
    
    # Add value labels
    # Placed just right of each bar end, vertically centred on the bar
    for bar, val in zip(bars, values):
        ax.text(val + 0.02, bar.get_y() + bar.get_height()/2, f'{val:.3f}', va='center')

plt.tight_layout()
# Save before show() so the written file contains the rendered figure
plt.savefig(RESULTS_DIR / 'success_criteria_evaluation.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Select best model (recommended: XGB_Fair)
# Fail early with a clear message when nothing was loaded.
if not fair_models:
    raise RuntimeError("No fair models were loaded; cannot select a best model")

best_model_name = 'XGB_Fair'
if best_model_name not in fair_models:
    # Fall back to the first loaded model and keep the reported name in sync
    # with the model that is actually evaluated.
    fallback_name = next(iter(fair_models))
    print(f"{best_model_name} not available; falling back to {fallback_name}")
    best_model_name = fallback_name
best_model = fair_models[best_model_name]

print(f"\n{'='*60}")
print(f"BEST MODEL: {best_model_name}")
print(f"{'='*60}")

# Detailed evaluation
# Hard labels and positive-class probabilities are reused by the plots and the
# error-analysis cells further down.
y_pred_best = best_model.predict(X_test_latent)
y_prob_best = best_model.predict_proba(X_test_latent)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred_best, target_names=['Denied', 'Approved']))

In [None]:
# ROC Curve for best model
# Left panel: ROC curve; right panel: precision-recall curve. Both use the
# positive-class probabilities of the selected model.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_prob_best)
auc_score = roc_auc_score(y_test, y_prob_best)

axes[0].plot(fpr, tpr, 'b-', linewidth=2, label=f'{best_model_name} (AUC = {auc_score:.4f})')
# Diagonal reference line labelled as the random classifier
axes[0].plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random Classifier')
# The y-axis of this panel is the true positive rate, so the recall criterion
# is drawn as a horizontal line (value hard-coded to 0.70).
axes[0].axhline(y=0.70, color='green', linestyle=':', label='Recall Threshold (0.70)')
axes[0].fill_between(fpr, tpr, alpha=0.2)
axes[0].set_xlabel('False Positive Rate')
axes[0].set_ylabel('True Positive Rate')
axes[0].set_title('ROC Curve - Final Model')
axes[0].legend(loc='lower right')
axes[0].grid(True, alpha=0.3)

# Precision-Recall Curve
precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_prob_best)
ap = average_precision_score(y_test, y_prob_best)

axes[1].plot(recall_vals, precision_vals, 'g-', linewidth=2, label=f'{best_model_name} (AP = {ap:.4f})')
# Criteria lines (hard-coded 0.80 / 0.70): precision horizontal, recall vertical
axes[1].axhline(y=0.80, color='red', linestyle=':', label='Precision Threshold (0.80)')
axes[1].axvline(x=0.70, color='blue', linestyle=':', label='Recall Threshold (0.70)')
axes[1].fill_between(recall_vals, precision_vals, alpha=0.2, color='green')
axes[1].set_xlabel('Recall')
axes[1].set_ylabel('Precision')
axes[1].set_title('Precision-Recall Curve - Final Model')
axes[1].legend(loc='lower left')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(RESULTS_DIR / 'final_model_curves.png', dpi=150, bbox_inches='tight')
plt.show()

### 10.1.3 Business Validation - Edge Cases

In [None]:
# Define business edge cases
# Each case is a dict with the application fields sent to the API
# (loan_amount, property_value, income, interest_rate and, optionally,
# is_fha_loan / is_va_loan, which default to False when the payload is built),
# plus 'expected' and 'reason', which are only used for reporting.
# LTV = loan_amount / property_value; LTI = loan_amount / income.
edge_cases = [
    {
        'name': 'High LTV, Low Income',
        'loan_amount': 380000,
        'property_value': 400000,  # LTV = 95%
        'income': 50000,  # LTI = 7.6x
        'interest_rate': 7.5,
        'expected': 'Denied',
        'reason': 'High risk - LTV > 90% and LTI > 6x'
    },
    {
        'name': 'Low LTV, High Income',
        'loan_amount': 200000,
        'property_value': 500000,  # LTV = 40%
        'income': 150000,  # LTI = 1.3x
        'interest_rate': 6.0,
        'expected': 'Approved',
        'reason': 'Low risk - strong equity and income'
    },
    {
        'name': 'FHA First-Time Buyer',
        'loan_amount': 250000,
        'property_value': 270000,  # LTV = 92.6%
        'income': 65000,  # LTI = 3.8x
        'interest_rate': 6.5,
        'is_fha_loan': True,
        'expected': 'Approved',
        'reason': 'FHA allows higher LTV for qualified buyers'
    },
    {
        'name': 'VA Loan - Veteran',
        'loan_amount': 400000,
        'property_value': 400000,  # LTV = 100%
        'income': 100000,  # LTI = 4x
        'interest_rate': 5.5,
        'is_va_loan': True,
        'expected': 'Approved',
        'reason': 'VA loans allow 100% LTV'
    },
    {
        'name': 'Jumbo Loan',
        'loan_amount': 1500000,
        'property_value': 2000000,  # LTV = 75%
        'income': 400000,  # LTI = 3.75x
        'interest_rate': 7.0,
        'expected': 'Approved',
        'reason': 'Strong metrics despite jumbo size'
    },
    {
        # 'expected' here is a risk level rather than an Approved/Denied label;
        # the run cell accepts a match against either the prediction or the
        # risk level returned by the API.
        'name': 'Borderline Case',
        'loan_amount': 300000,
        'property_value': 375000,  # LTV = 80%
        'income': 75000,  # LTI = 4x
        'interest_rate': 6.75,
        'expected': 'Moderate Risk',
        'reason': 'On the edge - needs manual review'
    }
]

print(f"Testing {len(edge_cases)} business edge cases...")

In [None]:
# Test edge cases against API (if available)
API_URL = 'http://localhost:8000'

def test_edge_case_api(case):
    """Send one edge case to the API's /predict endpoint.

    Args:
        case: Edge-case dict holding 'loan_amount', 'property_value',
            'income' and 'interest_rate'; 'is_fha_loan' and 'is_va_loan'
            are optional and default to False.

    Returns:
        The decoded JSON body on HTTP 200, otherwise a dict with a single
        'error' key describing the status code or the request failure.
    """
    # Required fields are copied as-is; the loan term is fixed at 360 and the
    # programme flags default to False when the case does not set them.
    payload = {field: case[field] for field in ('loan_amount', 'property_value', 'income', 'interest_rate')}
    payload['loan_term'] = 360
    payload['is_fha_loan'] = case.get('is_fha_loan', False)
    payload['is_va_loan'] = case.get('is_va_loan', False)

    try:
        response = requests.post(f"{API_URL}/predict", json=payload, timeout=5)
        if response.status_code != 200:
            return {'error': f'Status {response.status_code}'}
        return response.json()
    except requests.exceptions.RequestException as e:
        return {'error': str(e)}

# Check if API is available
# The API counts as available only when /health answers with HTTP 200.
api_available = False
try:
    health = requests.get(f"{API_URL}/health", timeout=2)
    api_available = health.status_code == 200
except requests.exceptions.RequestException as e:
    # Only network/HTTP failures mean "API down"; other errors (and
    # KeyboardInterrupt) are allowed to surface instead of being swallowed.
    print(f"API health check failed: {e}")

print(f"API Available: {'‚úÖ Yes' if api_available else '‚ùå No'}")

In [None]:
# Run edge case tests
# Produces one report row per edge case; API-derived columns are filled only
# when the API is reachable and the call succeeds.
edge_case_results = []

for case in edge_cases:
    # LTV is expressed as a percentage, LTI as a plain multiple
    result = {
        'Case': case['name'],
        'LTV': case['loan_amount'] / case['property_value'] * 100,
        'LTI': case['loan_amount'] / case['income'],
        'Expected': case['expected'],
        'Reason': case['reason']
    }
    
    if api_available:
        api_result = test_edge_case_api(case)
        if 'error' not in api_result:
            result['Prediction'] = api_result.get('prediction', 'Unknown')
            result['Probability'] = api_result.get('probability', 0)
            result['Risk_Level'] = api_result.get('risk_level', 'Unknown')
            # A case matches when the expectation equals either the predicted
            # label or the reported risk level
            result['Match'] = '‚úÖ' if case['expected'] in [result['Prediction'], result['Risk_Level']] else '‚ö†Ô∏è'
        else:
            result['Prediction'] = 'API Error'
            result['Match'] = '‚ùì'
    else:
        result['Prediction'] = 'API Unavailable'
        result['Match'] = '‚ùì'
    
    edge_case_results.append(result)

# Rows without API data leave 'Probability' / 'Risk_Level' unset, so those
# cells are empty (NaN) in the resulting frame.
edge_df = pd.DataFrame(edge_case_results)
print("\n" + "="*80)
print("EDGE CASE VALIDATION RESULTS")
print("="*80)
display(edge_df)

---
## Task 10.2: System Testing

### 10.2.1 End-to-End Integration Testing

In [None]:
# Integration test suite
def run_integration_tests():
    """Exercise each API endpoint once and tabulate the outcomes.

    Five checks run in order: health, predict with valid input, predict with
    invalid input (expects HTTP 422), explain, and batch predict. Any
    exception raised by a check is recorded as a failure, with the first 50
    characters of its message as the details.

    Returns:
        pandas.DataFrame with columns 'Test', 'Status' and 'Details', one row
        per check.
    """
    outcomes = []

    def _application(**overrides):
        # Fresh copy of the standard sample application for every request
        fields = {'loan_amount': 250000, 'property_value': 300000, 'income': 80000, 'interest_rate': 6.5, 'loan_term': 360}
        fields.update(overrides)
        return fields

    def _record(test_name, check):
        # check() returns (passed, details); an exception counts as a failure
        try:
            passed, details = check()
            outcomes.append({'Test': test_name, 'Status': '‚úÖ PASS' if passed else '‚ùå FAIL', 'Details': details})
        except Exception as e:
            outcomes.append({'Test': test_name, 'Status': '‚ùå FAIL', 'Details': str(e)[:50]})

    # Test 1: Health endpoint
    def _health():
        resp = requests.get(f"{API_URL}/health", timeout=5)
        ok = resp.status_code == 200 and 'status' in resp.json()
        return ok, resp.json().get('status', 'N/A')

    # Test 2: Predict endpoint - valid input
    def _predict_valid():
        resp = requests.post(f"{API_URL}/predict", json=_application(), timeout=10)
        ok = resp.status_code == 200 and 'prediction' in resp.json()
        return ok, f"Returned: {resp.json().get('prediction', 'N/A')}"

    # Test 3: Predict endpoint - invalid input
    def _predict_invalid():
        resp = requests.post(f"{API_URL}/predict", json=_application(loan_amount=-1000), timeout=5)
        ok = resp.status_code == 422  # Validation error expected
        return ok, f"Status: {resp.status_code}"

    # Test 4: Explain endpoint
    def _explain():
        resp = requests.post(f"{API_URL}/explain", json=_application(), timeout=30)
        ok = resp.status_code == 200 and 'explanation_text' in resp.json()
        return ok, 'Explanation returned' if ok else f"Status: {resp.status_code}"

    # Test 5: Batch predict endpoint
    def _batch():
        batch = {'applications': [
            _application(),
            _application(loan_amount=400000, property_value=500000, income=120000, interest_rate=7.0),
        ]}
        resp = requests.post(f"{API_URL}/batch/predict", json=batch, timeout=20)
        ok = resp.status_code == 200 and resp.json().get('total_processed') == 2
        return ok, f"Processed: {resp.json().get('total_processed', 0)}"

    for test_name, check in (
        ("Health Endpoint", _health),
        ("Predict - Valid Input", _predict_valid),
        ("Predict - Invalid Input (Validation)", _predict_invalid),
        ("Explain Endpoint", _explain),
        ("Batch Predict Endpoint", _batch),
    ):
        _record(test_name, check)

    return pd.DataFrame(outcomes)

# Run the suite only when the health check succeeded; otherwise show how to
# start the API.
if not api_available:
    print("‚ö†Ô∏è API not available. Start API with: uvicorn src.api.main:app --port 8000")
else:
    integration_results = run_integration_tests()
    rule = "=" * 80
    print("\n" + rule)
    print("INTEGRATION TEST RESULTS")
    print(rule)
    display(integration_results)

    # A row passes when its status text contains 'PASS'
    total_count = len(integration_results)
    passed_count = integration_results['Status'].str.contains('PASS').sum()
    print(f"\nPassed: {passed_count}/{total_count}")

### 10.2.2 Performance Testing (Latency)

In [None]:
# Performance testing
def run_performance_tests(n_requests=50):
    """Measure /predict round-trip latency over sequential requests.

    Args:
        n_requests: Number of requests to send, one after another.

    Returns:
        List of latencies in milliseconds, one entry per request answered
        with HTTP 200. Requests that fail or return another status are left
        out of the list and reported in a single summary line.
    """
    latencies = []
    failures = 0
    payload = {'loan_amount': 250000, 'property_value': 300000, 'income': 80000, 'interest_rate': 6.5, 'loan_term': 360}

    print(f"Running {n_requests} requests...")

    for i in range(n_requests):
        # perf_counter is monotonic and high-resolution, so a wall-clock
        # adjustment during the run cannot distort a measurement.
        start = time.perf_counter()
        try:
            resp = requests.post(f"{API_URL}/predict", json=payload, timeout=10)
        except requests.exceptions.RequestException:
            failures += 1
        else:
            elapsed_ms = (time.perf_counter() - start) * 1000
            if resp.status_code == 200:
                latencies.append(elapsed_ms)
            else:
                failures += 1

        if (i + 1) % 10 == 0:
            print(f"  Completed {i + 1}/{n_requests}")

    # Make dropped samples visible: statistics over a partial sample can
    # understate latency when the slow requests are the ones that failed.
    if failures:
        print(f"  {failures}/{n_requests} requests failed and were excluded from latency statistics")

    return latencies

LATENCY_THRESHOLD_MS = 500  # Success criteria

if not api_available:
    print("‚ö†Ô∏è API not available for performance testing")
else:
    latencies = run_performance_tests(n_requests=50)

    # Nothing is reported when no request succeeded
    if latencies:
        banner = "=" * 60
        print("\n" + banner)
        print("PERFORMANCE TEST RESULTS")
        print(banner)

        mean_latency = np.mean(latencies)
        p50, p95, p99 = np.percentile(latencies, [50, 95, 99])
        max_latency = np.max(latencies)

        # The pass/fail verdict is based on the 95th percentile
        p95_ok = p95 < LATENCY_THRESHOLD_MS

        print("\nLatency Statistics (ms):")
        print(f"  Mean:    {mean_latency:.2f} ms {'‚úÖ' if mean_latency < LATENCY_THRESHOLD_MS else '‚ùå'}")
        print(f"  P50:     {p50:.2f} ms")
        print(f"  P95:     {p95:.2f} ms {'‚úÖ' if p95_ok else '‚ùå'}")
        print(f"  P99:     {p99:.2f} ms")
        print(f"  Max:     {max_latency:.2f} ms")
        print(f"\nThreshold: < {LATENCY_THRESHOLD_MS} ms")
        print(f"Result: {'‚úÖ PASS' if p95_ok else '‚ùå FAIL'}")

In [None]:
# Visualize latency distribution
# `latencies` only exists when the API was available; the `and` short-circuits
# before touching it otherwise.
if api_available and latencies:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    # Histogram
    # Threshold and sample mean are overlaid as vertical lines
    axes[0].hist(latencies, bins=20, color='steelblue', edgecolor='white', alpha=0.7)
    axes[0].axvline(x=LATENCY_THRESHOLD_MS, color='red', linestyle='--', linewidth=2, label=f'Threshold ({LATENCY_THRESHOLD_MS}ms)')
    axes[0].axvline(x=np.mean(latencies), color='green', linestyle='-', linewidth=2, label=f'Mean ({np.mean(latencies):.1f}ms)')
    axes[0].set_xlabel('Latency (ms)')
    axes[0].set_ylabel('Frequency')
    axes[0].set_title('API Response Latency Distribution')
    axes[0].legend()
    
    # Box plot
    axes[1].boxplot(latencies, vert=True)
    axes[1].axhline(y=LATENCY_THRESHOLD_MS, color='red', linestyle='--', linewidth=2, label=f'Threshold ({LATENCY_THRESHOLD_MS}ms)')
    axes[1].set_ylabel('Latency (ms)')
    axes[1].set_title('Latency Box Plot')
    axes[1].legend()
    
    plt.tight_layout()
    plt.savefig(RESULTS_DIR / 'performance_test_results.png', dpi=150, bbox_inches='tight')
    plt.show()

---
## Task 10.3: Error Analysis

### 10.3.1 Confusion Matrix Deep Dive

In [None]:
# Confusion matrix analysis
# Pin the label order so the matrix is always 2x2 (rows = actual 0/1, columns =
# predicted 0/1). Without it, a class absent from both y_test and the
# predictions shrinks the matrix and the four-way unpack below fails.
cm = confusion_matrix(y_test, y_pred_best, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

print("Confusion Matrix Breakdown:")
print(f"  True Negatives (TN):  {tn:,} - Correctly denied bad applications")
print(f"  False Positives (FP): {fp:,} - Bad loans approved (Type I Error) ‚ö†Ô∏è")
print(f"  False Negatives (FN): {fn:,} - Good borrowers denied (Type II Error) ‚ö†Ô∏è")
print(f"  True Positives (TP):  {tp:,} - Correctly approved good applications")

# Error rates
# FPR = FP / actual negatives, FNR = FN / actual positives; an empty
# denominator yields 0 instead of a division error.
fpr_rate = fp / (fp + tn) if (fp + tn) > 0 else 0
fnr_rate = fn / (fn + tp) if (fn + tp) > 0 else 0

print(f"\nError Rates:")
print(f"  False Positive Rate: {fpr_rate:.2%}")
print(f"  False Negative Rate: {fnr_rate:.2%}")

In [None]:
# Visualize confusion matrix
# Heatmap of the counts computed in the previous cell; rows are actual
# classes, columns are predicted classes.
fig, ax = plt.subplots(figsize=(8, 6))

# fmt='d' prints the cell annotations as integers
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Denied', 'Approved'],
            yticklabels=['Denied', 'Approved'],
            ax=ax, cbar=True)

ax.set_xlabel('Predicted', fontsize=12)
ax.set_ylabel('Actual', fontsize=12)
ax.set_title(f'Confusion Matrix - {best_model_name}', fontsize=14)

plt.tight_layout()
plt.savefig(RESULTS_DIR / 'confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

### 10.3.2 False Positive Analysis (Bad Loans Approved)

In [None]:
# Identify false positives
# Actual class 0 predicted as class 1
fp_mask = (y_test == 0) & (y_pred_best == 1)
fp_indices = np.where(fp_mask)[0]

# Features profiled in the error analysis. Defined before the branch below
# because the false-negative cell reuses this list and would otherwise raise a
# NameError whenever there are no false positives.
numeric_features = ['loan_amount', 'property_value', 'income', 'interest_rate',
                    'loan_to_income_ratio', 'loan_to_value_ratio']

print(f"False Positives (Bad loans approved): {len(fp_indices):,}")
print(f"Percentage of test set: {len(fp_indices) / len(y_test) * 100:.2f}%")

# Analyze characteristics of FPs
if len(fp_indices) > 0:
    # Positional indexing: fp_indices are row positions in y_test / test_df
    fp_df = test_df.iloc[fp_indices].copy()
    fp_df['probability'] = y_prob_best[fp_indices]

    print(f"\nFalse Positive Characteristics:")

    # Check for common features
    # Compare the mean among false positives with the mean over the whole test
    # set; features absent from the data are skipped.
    for feat in numeric_features:
        if feat in fp_df.columns:
            fp_mean = fp_df[feat].mean()
            overall_mean = test_df[feat].mean()
            diff_pct = (fp_mean - overall_mean) / overall_mean * 100 if overall_mean != 0 else 0
            print(f"  {feat}: FP Mean = {fp_mean:.2f}, Overall Mean = {overall_mean:.2f} ({diff_pct:+.1f}%)")

In [None]:
# Visualize FP probability distribution
# Left: predicted probabilities of the false positives alone.
# Right: the same distribution overlaid on that of the true positives.
if len(fp_indices) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    # Probability distribution of FPs
    axes[0].hist(y_prob_best[fp_indices], bins=20, color='red', alpha=0.7, edgecolor='white', label='False Positives')
    # Reference line drawn at 0.5, labelled as the decision threshold
    axes[0].axvline(x=0.5, color='black', linestyle='--', linewidth=2, label='Decision Threshold')
    axes[0].set_xlabel('Predicted Probability')
    axes[0].set_ylabel('Count')
    axes[0].set_title('Probability Distribution of False Positives')
    axes[0].legend()
    
    # Compare TP vs FP probabilities
    # Actual class 1 predicted as class 1
    tp_mask = (y_test == 1) & (y_pred_best == 1)
    axes[1].hist(y_prob_best[tp_mask], bins=20, color='green', alpha=0.5, label='True Positives', edgecolor='white')
    axes[1].hist(y_prob_best[fp_mask], bins=20, color='red', alpha=0.5, label='False Positives', edgecolor='white')
    axes[1].set_xlabel('Predicted Probability')
    axes[1].set_ylabel('Count')
    axes[1].set_title('TP vs FP Probability Comparison')
    axes[1].legend()
    
    plt.tight_layout()
    plt.savefig(RESULTS_DIR / 'false_positive_analysis.png', dpi=150, bbox_inches='tight')
    plt.show()

### 10.3.3 False Negative Analysis (Good Borrowers Denied)

In [None]:
# Identify false negatives
# Actual class 1 predicted as class 0
fn_mask = (y_test == 1) & (y_pred_best == 0)
fn_indices = np.where(fn_mask)[0]

# Features profiled in the error analysis. Defined in this cell as well so it
# does not depend on a variable that the false-positive cell only creates when
# it finds at least one false positive.
numeric_features = ['loan_amount', 'property_value', 'income', 'interest_rate',
                    'loan_to_income_ratio', 'loan_to_value_ratio']

print(f"False Negatives (Good borrowers denied): {len(fn_indices):,}")
print(f"Percentage of test set: {len(fn_indices) / len(y_test) * 100:.2f}%")

# Analyze characteristics of FNs
if len(fn_indices) > 0:
    # Positional indexing: fn_indices are row positions in y_test / test_df
    fn_df = test_df.iloc[fn_indices].copy()
    fn_df['probability'] = y_prob_best[fn_indices]

    print(f"\nFalse Negative Characteristics:")

    # Compare the mean among false negatives with the mean over the whole test
    # set; features absent from the data are skipped.
    for feat in numeric_features:
        if feat in fn_df.columns:
            fn_mean = fn_df[feat].mean()
            overall_mean = test_df[feat].mean()
            diff_pct = (fn_mean - overall_mean) / overall_mean * 100 if overall_mean != 0 else 0
            print(f"  {feat}: FN Mean = {fn_mean:.2f}, Overall Mean = {overall_mean:.2f} ({diff_pct:+.1f}%)")

In [None]:
# Visualize FN probability distribution
# Left: predicted probabilities of the false negatives alone.
# Right: the same distribution overlaid on that of the true negatives.
if len(fn_indices) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    # Probability distribution of FNs
    axes[0].hist(y_prob_best[fn_indices], bins=20, color='orange', alpha=0.7, edgecolor='white', label='False Negatives')
    # Reference line drawn at 0.5, labelled as the decision threshold
    axes[0].axvline(x=0.5, color='black', linestyle='--', linewidth=2, label='Decision Threshold')
    axes[0].set_xlabel('Predicted Probability')
    axes[0].set_ylabel('Count')
    axes[0].set_title('Probability Distribution of False Negatives')
    axes[0].legend()
    
    # Compare TN vs FN probabilities
    # Actual class 0 predicted as class 0
    tn_mask = (y_test == 0) & (y_pred_best == 0)
    axes[1].hist(y_prob_best[tn_mask], bins=20, color='blue', alpha=0.5, label='True Negatives', edgecolor='white')
    axes[1].hist(y_prob_best[fn_mask], bins=20, color='orange', alpha=0.5, label='False Negatives', edgecolor='white')
    axes[1].set_xlabel('Predicted Probability')
    axes[1].set_ylabel('Count')
    axes[1].set_title('TN vs FN Probability Comparison')
    axes[1].legend()
    
    plt.tight_layout()
    plt.savefig(RESULTS_DIR / 'false_negative_analysis.png', dpi=150, bbox_inches='tight')
    plt.show()

### 10.3.4 Threshold Analysis

In [None]:
# Analyze different decision thresholds
# linspace yields exactly ten evenly spaced cut-offs from 0.30 to 0.75; with a
# float step, arange can gain or lose the end point through rounding.
thresholds = np.linspace(0.30, 0.75, 10)

threshold_results = []
for thresh in thresholds:
    # Re-derive hard labels from the probabilities at this cut-off
    y_pred_thresh = (y_prob_best >= thresh).astype(int)

    prec = precision_score(y_test, y_pred_thresh, zero_division=0)
    rec = recall_score(y_test, y_pred_thresh, zero_division=0)
    f1 = f1_score(y_test, y_pred_thresh, zero_division=0)

    # Pinning the label order keeps the matrix 2x2 in every case, so the
    # counts are always real values rather than a zero-filled placeholder.
    _, fp_t, fn_t, _ = confusion_matrix(y_test, y_pred_thresh, labels=[0, 1]).ravel()

    threshold_results.append({
        'Threshold': thresh,
        'Precision': prec,
        'Recall': rec,
        'F1': f1,
        'FP': fp_t,
        'FN': fn_t,
        # Same precision/recall criteria as the model evaluation cell
        'Meets_Criteria': prec >= SUCCESS_CRITERIA['Precision'] and rec >= SUCCESS_CRITERIA['Recall']
    })

threshold_df = pd.DataFrame(threshold_results)
print("Threshold Analysis:")
display(threshold_df.style.format({'Threshold': '{:.2f}', 'Precision': '{:.4f}', 'Recall': '{:.4f}', 'F1': '{:.4f}'}))

In [None]:
# Visualize threshold trade-offs
# Left: precision/recall/F1 against the decision threshold.
# Right: false-positive and false-negative counts against the same threshold.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Precision, Recall, F1 vs Threshold
axes[0].plot(threshold_df['Threshold'], threshold_df['Precision'], 'b-', marker='o', label='Precision')
axes[0].plot(threshold_df['Threshold'], threshold_df['Recall'], 'g-', marker='s', label='Recall')
axes[0].plot(threshold_df['Threshold'], threshold_df['F1'], 'r-', marker='^', label='F1')
# Target lines (hard-coded 0.80 / 0.70), coloured like the curve they apply to
axes[0].axhline(y=0.80, color='blue', linestyle=':', alpha=0.5, label='Precision Target (0.80)')
axes[0].axhline(y=0.70, color='green', linestyle=':', alpha=0.5, label='Recall Target (0.70)')
axes[0].set_xlabel('Decision Threshold')
axes[0].set_ylabel('Score')
axes[0].set_title('Metrics vs Decision Threshold')
axes[0].legend(loc='best')
axes[0].grid(True, alpha=0.3)

# FP and FN vs Threshold
axes[1].plot(threshold_df['Threshold'], threshold_df['FP'], 'r-', marker='o', label='False Positives')
axes[1].plot(threshold_df['Threshold'], threshold_df['FN'], 'orange', marker='s', label='False Negatives')
axes[1].set_xlabel('Decision Threshold')
axes[1].set_ylabel('Count')
axes[1].set_title('Error Count vs Decision Threshold')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(RESULTS_DIR / 'threshold_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

### 10.3.5 Model Limitations Documentation

In [None]:
# Document model limitations
# Category -> list of statements; the two error rates are filled in from the
# confusion-matrix cell.
limitations = {
    "Data Limitations": [
        "Training data limited to NJ, NY, PA, CT states (may not generalize to other regions)",
        "2024 HMDA data only - market conditions change over time",
        "Missing DTI ratio data required imputation/estimation",
        "Owner-occupied purchase loans only (not refinance or investment)"
    ],
    "Model Limitations": [
        f"False Positive Rate: {fpr_rate:.2%} - Some bad loans still approved",
        f"False Negative Rate: {fnr_rate:.2%} - Some qualified borrowers denied",
        "Fair representation encoding may lose some predictive signal",
        "Model trained on historical approvals (may perpetuate past biases)"
    ],
    "Operational Limitations": [
        "Requires all input features - missing data reduces accuracy",
        "API response includes SHAP explanations (adds latency for /explain)",
        "Model should be retrained periodically as market conditions change",
        "Not a replacement for human underwriter review on edge cases"
    ],
    "Fairness Considerations": [
        "Fairness metrics monitored for race, ethnicity, sex",
        "Some disparity may remain even with fair representation",
        "Intersectional fairness not fully evaluated",
        "Should be used as decision support, not sole decision maker"
    ]
}

heading_rule = "=" * 80
print(heading_rule)
print("MODEL LIMITATIONS DOCUMENTATION")
print(heading_rule)

# One heading per category followed by its bullet lines
for category, items in limitations.items():
    print(f"\n{category}:")
    print("\n".join(f"  ‚Ä¢ {item}" for item in items))

---
## Summary: Testing & Validation Results

In [None]:
# Generate final summary
# A single timestamp is used for both the JSON payload and the file name.
run_time = datetime.now()

# Score the selected model once and reuse the values everywhere below.
auc_achieved = float(roc_auc_score(y_test, y_prob_best))
precision_achieved = float(precision_score(y_test, y_pred_best))
recall_achieved = float(recall_score(y_test, y_pred_best))

# `latencies` only exists when the API was reachable; the `and` short-circuits
# before touching it otherwise.
latency_measured = bool(api_available and latencies)
p95_latency = float(np.percentile(latencies, 95)) if latency_measured else None

summary = {
    'timestamp': run_time.isoformat(),
    'model': best_model_name,
    'test_set_size': len(y_test),
    'success_criteria': {
        'AUC_threshold': 0.75,
        'AUC_achieved': auc_achieved,
        'AUC_pass': bool(auc_achieved >= 0.75),
        'Precision_threshold': 0.80,
        'Precision_achieved': precision_achieved,
        'Precision_pass': bool(precision_achieved >= 0.80),
        'Recall_threshold': 0.70,
        'Recall_achieved': recall_achieved,
        'Recall_pass': bool(recall_achieved >= 0.70)
    },
    'error_analysis': {
        'false_positives': int(fp),
        'false_negatives': int(fn),
        'fp_rate': float(fpr_rate),
        'fn_rate': float(fnr_rate)
    },
    'performance': {
        # None means latency was not measured (API unavailable or no
        # successful request)
        'mean_latency_ms': float(np.mean(latencies)) if latency_measured else None,
        'p95_latency_ms': p95_latency,
        'latency_pass': bool(p95_latency < 500) if latency_measured else None
    }
}

# Save summary
# Ensure the output folder exists so the write cannot fail on a fresh checkout.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
summary_path = RESULTS_DIR / f"validation_summary_{run_time.strftime('%Y%m%d_%H%M%S')}.json"
with open(summary_path, 'w') as f:
    json.dump(summary, f, indent=2)

print("="*80)
print("PHASE 10: TESTING & VALIDATION SUMMARY")
print("="*80)

# The overall verdict covers the three model criteria plus the latency
# criterion whenever latency was actually measured; an unmeasured latency
# (None) neither passes nor fails the run.
# NOTE(review): the fairness criterion (Demographic Parity Ratio) is not
# computed in the visible cells, so it is not part of this verdict.
criteria_flags = [
    summary['success_criteria']['AUC_pass'],
    summary['success_criteria']['Precision_pass'],
    summary['success_criteria']['Recall_pass']
]
if summary['performance']['latency_pass'] is not None:
    criteria_flags.append(summary['performance']['latency_pass'])
all_criteria_pass = all(criteria_flags)

print(f"\n‚úÖ Model: {best_model_name}")
print(f"\nüìä Success Criteria:")
print(f"   AUC-ROC:   {summary['success_criteria']['AUC_achieved']:.4f} (‚â•0.75) {'‚úÖ' if summary['success_criteria']['AUC_pass'] else '‚ùå'}")
print(f"   Precision: {summary['success_criteria']['Precision_achieved']:.4f} (‚â•0.80) {'‚úÖ' if summary['success_criteria']['Precision_pass'] else '‚ùå'}")
print(f"   Recall:    {summary['success_criteria']['Recall_achieved']:.4f} (‚â•0.70) {'‚úÖ' if summary['success_criteria']['Recall_pass'] else '‚ùå'}")

print(f"\n‚ö†Ô∏è Error Analysis:")
print(f"   False Positives: {fp:,} ({fpr_rate:.2%})")
print(f"   False Negatives: {fn:,} ({fnr_rate:.2%})")

if latency_measured:
    print(f"\n‚è±Ô∏è Performance:")
    print(f"   Mean Latency: {summary['performance']['mean_latency_ms']:.1f}ms")
    print(f"   P95 Latency:  {summary['performance']['p95_latency_ms']:.1f}ms (<500ms) {'‚úÖ' if summary['performance']['latency_pass'] else '‚ùå'}")

print(f"\n{'='*80}")
print(f"OVERALL RESULT: {'‚úÖ ALL CRITERIA MET' if all_criteria_pass else '‚ùå SOME CRITERIA NOT MET'}")
print(f"{'='*80}")
print(f"\nSummary saved to: {summary_path}")