# XAI Project - Neural Network Interpretation for Banking Regulatory Compliance

## Project Overview
This project implements Explainable AI (XAI) techniques to improve transparency and comprehensibility of neural network models for regulatory compliance in banking. We analyze a pre-trained classification model using various XAI techniques including Grad-CAM, LIME, SHAP, Integrated Gradients, and Occlusion Maps.

## Phase 1: Pre-trained Classification Model Setup

In [None]:
# Import required libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torchvision import models
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [None]:
# Load MNIST dataset with transforms for DenseNet (requires 3-channel input)
transform = transforms.Compose([
    transforms.Resize(224),  # DenseNet expects 224x224 input
    transforms.Grayscale(num_output_channels=3),  # Convert to 3-channel for DenseNet
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # ImageNet normalization
])

train_dataset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")

In [None]:
# Load pre-trained DenseNet model from torchvision
model = models.densenet121(pretrained=True)

# Modify classifier for MNIST (10 classes)
num_features = model.classifier.in_features
model.classifier = nn.Linear(num_features, 10)

print("Using pre-trained DenseNet-121 model")
print(f"Modified classifier for {10} classes")
print(model.classifier)

In [None]:
# Fine-tune the DenseNet model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Freeze feature extraction layers, only train classifier
for param in model.features.parameters():
    param.requires_grad = False

optimizer = torch.optim.Adam(model.classifier.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

def train_model(model, train_loader, optimizer, criterion, epochs=2):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            
            if batch_idx % 100 == 0:
                print(f'Epoch {epoch+1}/{epochs}, Batch {batch_idx}, Loss: {loss.item():.6f}')
        
        print(f'Epoch {epoch+1} completed, Average Loss: {running_loss/len(train_loader):.6f}')

train_model(model, train_loader, optimizer, criterion)

In [None]:
# Evaluate model accuracy
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
            total += target.size(0)
    
    accuracy = 100. * correct / total
    print(f'Test Accuracy: {accuracy:.2f}% ({correct}/{total})')
    return accuracy

accuracy = evaluate_model(model, test_loader)

## Phase 2: Saliency Map Generation Using XAI Techniques

### Installing Captum for XAI implementations

In [None]:
# Install captum with specific numpy version to avoid conflicts
# Option 1: Install without dependencies and add required ones manually
!pip install captum --no-deps
!pip install typing_extensions

# Option 2: Alternative - Install specific compatible versions
# !pip install "captum>=0.6.0" "numpy>=1.26.0,<2.0.0" --force-reinstall

# Option 3: If you prefer to use the current environment's numpy
# Just import captum - it should work with numpy 2.x in most cases
# !pip install captum --no-deps

In [None]:
# Import XAI libraries
from captum.attr import (
    IntegratedGradients,
    Saliency,
    GradientShap,
    Occlusion,
    LayerGradCam
)
from captum.attr import visualization as viz
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Get sample images for analysis
def get_sample_images(test_loader, num_samples=10):
    model.eval()
    samples = []
    
    for data, target in test_loader:
        data, target = data.to(device), target.to(device)
        output = model(data)
        pred = output.argmax(dim=1)
        
        for i in range(len(data)):
            if len(samples) >= num_samples:
                return samples
            
            samples.append({
                'image': data[i],
                'true_label': target[i].item(),
                'predicted_label': pred[i].item(),
                'confidence': torch.softmax(output[i], dim=0).max().item(),
                'correct': target[i].item() == pred[i].item()
            })
    
    return samples

sample_images = get_sample_images(test_loader, 10)
print(f"Collected {len(sample_images)} sample images")

# Separate correct and incorrect predictions
correct_samples = [s for s in sample_images if s['correct']]
incorrect_samples = [s for s in sample_images if not s['correct']]

print(f"Correct predictions: {len(correct_samples)}")
print(f"Incorrect predictions: {len(incorrect_samples)}")

### 1. Integrated Gradients

In [None]:
def generate_integrated_gradients(model, image, target_class):
    integrated_gradients = IntegratedGradients(model)
    attributions = integrated_gradients.attribute(
        image.unsqueeze(0), 
        target=target_class,
        n_steps=50
    )
    return attributions.squeeze()

# Generate Integrated Gradients for samples
ig_results = []
for sample in sample_images[:5]:  # First 5 samples
    ig_attr = generate_integrated_gradients(
        model, 
        sample['image'], 
        sample['predicted_label']
    )
    ig_results.append({
        'sample': sample,
        'attribution': ig_attr
    })

print("Integrated Gradients computed for 5 samples")

### 2. Saliency Maps

In [None]:
def generate_saliency_map(model, image, target_class):
    saliency = Saliency(model)
    attributions = saliency.attribute(
        image.unsqueeze(0),
        target=target_class
    )
    return attributions.squeeze()

# Generate Saliency Maps for samples
saliency_results = []
for sample in sample_images[:5]:  # First 5 samples
    saliency_attr = generate_saliency_map(
        model, 
        sample['image'], 
        sample['predicted_label']
    )
    saliency_results.append({
        'sample': sample,
        'attribution': saliency_attr
    })

print("Saliency Maps computed for 5 samples")

### 3. GradientSHAP

In [None]:
def generate_gradient_shap(model, image, target_class, baseline_dist):
    gradient_shap = GradientShap(model)
    attributions = gradient_shap.attribute(
        image.unsqueeze(0),
        baselines=baseline_dist,
        target=target_class,
        n_samples=50
    )
    return attributions.squeeze()

# Create baseline distribution for GradientSHAP (3-channel for DenseNet)
baseline_dist = torch.randn(10, 3, 224, 224).to(device) * 0.1

# Generate GradientSHAP for samples
shap_results = []
for sample in sample_images[:5]:  # First 5 samples
    shap_attr = generate_gradient_shap(
        model, 
        sample['image'], 
        sample['predicted_label'],
        baseline_dist
    )
    shap_results.append({
        'sample': sample,
        'attribution': shap_attr
    })

print("GradientSHAP computed for 5 samples")

### 4. Occlusion Maps

In [None]:
def generate_occlusion_map(model, image, target_class):
    occlusion = Occlusion(model)
    attributions = occlusion.attribute(
        image.unsqueeze(0),
        target=target_class,
        sliding_window_shapes=(3, 8, 8),  # Larger window for 224x224 input
        strides=(3, 4, 4)
    )
    return attributions.squeeze()

# Generate Occlusion Maps for samples
occlusion_results = []
for sample in sample_images[:5]:  # First 5 samples
    occlusion_attr = generate_occlusion_map(
        model, 
        sample['image'], 
        sample['predicted_label']
    )
    occlusion_results.append({
        'sample': sample,
        'attribution': occlusion_attr
    })

print("Occlusion Maps computed for 5 samples")

### 5. Grad-CAM Implementation

In [None]:
def generate_grad_cam(model, image, target_class):
    # Use a deeper layer in DenseNet for Grad-CAM
    layer_gradcam = LayerGradCam(model, model.features.denseblock4)
    attributions = layer_gradcam.attribute(
        image.unsqueeze(0),
        target=target_class
    )
    return attributions.squeeze()

# Generate Grad-CAM for samples
gradcam_results = []
for sample in sample_images[:5]:  # First 5 samples
    gradcam_attr = generate_grad_cam(
        model, 
        sample['image'], 
        sample['predicted_label']
    )
    gradcam_results.append({
        'sample': sample,
        'attribution': gradcam_attr
    })

print("Grad-CAM computed for 5 samples")

### Visualization of All XAI Techniques

In [None]:
def visualize_xai_comparison(sample_idx=0):
    """Visualize all XAI techniques for a single sample"""
    sample = sample_images[sample_idx]
    
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    
    # Original image (convert back to grayscale for display)
    original_img = sample['image'].cpu().detach().numpy()
    # Average across channels and denormalize
    original_display = np.mean(original_img, axis=0)
    original_display = (original_display * np.array([0.229, 0.224, 0.225]).mean() + np.array([0.485, 0.456, 0.406]).mean())
    axes[0, 0].imshow(original_display, cmap='gray')
    axes[0, 0].set_title(f'Original\nTrue: {sample["true_label"]}, Pred: {sample["predicted_label"]}\nCorrect: {sample["correct"]}')
    axes[0, 0].axis('off')
    
    # Integrated Gradients
    ig_attr = ig_results[sample_idx]['attribution'].cpu().detach().numpy()
    ig_display = np.mean(np.abs(ig_attr), axis=0)
    axes[0, 1].imshow(ig_display, cmap='hot')
    axes[0, 1].set_title('Integrated Gradients')
    axes[0, 1].axis('off')
    
    # Saliency
    saliency_attr = saliency_results[sample_idx]['attribution'].cpu().detach().numpy()
    saliency_display = np.mean(np.abs(saliency_attr), axis=0)
    axes[0, 2].imshow(saliency_display, cmap='hot')
    axes[0, 2].set_title('Saliency Map')
    axes[0, 2].axis('off')
    
    # GradientSHAP
    shap_attr = shap_results[sample_idx]['attribution'].cpu().detach().numpy()
    shap_display = np.mean(np.abs(shap_attr), axis=0)
    axes[1, 0].imshow(shap_display, cmap='hot')
    axes[1, 0].set_title('GradientSHAP')
    axes[1, 0].axis('off')
    
    # Occlusion
    occlusion_attr = occlusion_results[sample_idx]['attribution'].cpu().detach().numpy()
    occlusion_display = np.mean(np.abs(occlusion_attr), axis=0)
    axes[1, 1].imshow(occlusion_display, cmap='hot')
    axes[1, 1].set_title('Occlusion Map')
    axes[1, 1].axis('off')
    
    # Grad-CAM
    gradcam_attr = gradcam_results[sample_idx]['attribution'].cpu().detach().numpy()
    # Upsample to match input size
    from scipy.ndimage import zoom
    gradcam_upsampled = zoom(gradcam_attr, (224/gradcam_attr.shape[0], 224/gradcam_attr.shape[1]), order=1)
    axes[1, 2].imshow(gradcam_upsampled, cmap='hot')
    axes[1, 2].set_title('Grad-CAM')
    axes[1, 2].axis('off')
    
    plt.tight_layout()
    plt.show()

# Install scipy for zoom function
!pip install scipy

# Visualize first 3 samples
for i in range(3):
    print(f"\n--- Sample {i+1} ---")
    visualize_xai_comparison(i)

## Phase 3: Analysis and Report

### Dataset Description

In [None]:
print("=== DATASET DESCRIPTION ===")
print("Dataset: MNIST Handwritten Digits")
print("Origin: Modified National Institute of Standards and Technology database")
print(f"Training samples: {len(train_dataset):,}")
print(f"Test samples: {len(test_dataset):,}")
print("Classes: 10 (digits 0-9)")
print("Image processing: Resized to 224x224, converted to 3-channel for DenseNet")
print("Normalization: ImageNet normalization (mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])")
print("Model: Pre-trained DenseNet-121 with modified classifier")
print(f"Model accuracy: {accuracy:.2f}%")

### Saliency Map Analysis: Correct vs Incorrect Predictions

In [None]:
def analyze_attribution_patterns():
    """Analyze patterns in saliency maps for correct vs incorrect predictions"""
    
    print("=== SALIENCY MAP ANALYSIS ===")
    
    # Analyze attribution concentration
    def calculate_concentration(attribution):
        """Calculate how concentrated the attribution is"""
        flat_attr = attribution.flatten()
        # Gini coefficient as measure of concentration
        sorted_attr = np.sort(np.abs(flat_attr))
        n = len(sorted_attr)
        cumsum = np.cumsum(sorted_attr)
        return (n + 1 - 2 * np.sum(cumsum) / cumsum[-1]) / n
    
    correct_concentrations = []
    incorrect_concentrations = []
    
    for i, result in enumerate(ig_results):
        concentration = calculate_concentration(result['attribution'].cpu().detach().numpy())
        if result['sample']['correct']:
            correct_concentrations.append(concentration)
        else:
            incorrect_concentrations.append(concentration)
    
    if correct_concentrations:
        print(f"Average attribution concentration (correct): {np.mean(correct_concentrations):.3f}")
    if incorrect_concentrations:
        print(f"Average attribution concentration (incorrect): {np.mean(incorrect_concentrations):.3f}")
    
    # Analyze attribution magnitude
    correct_magnitudes = []
    incorrect_magnitudes = []
    
    for i, result in enumerate(ig_results):
        magnitude = np.mean(np.abs(result['attribution'].cpu().detach().numpy()))
        if result['sample']['correct']:
            correct_magnitudes.append(magnitude)
        else:
            incorrect_magnitudes.append(magnitude)
    
    if correct_magnitudes:
        print(f"Average attribution magnitude (correct): {np.mean(correct_magnitudes):.6f}")
    if incorrect_magnitudes:
        print(f"Average attribution magnitude (incorrect): {np.mean(incorrect_magnitudes):.6f}")

analyze_attribution_patterns()

### Comparison of XAI Techniques

In [None]:
def compare_xai_techniques():
    """Compare different XAI techniques"""
    
    print("=== XAI TECHNIQUES COMPARISON ===")
    
    techniques = {
        'Integrated Gradients': ig_results,
        'Saliency Maps': saliency_results,
        'GradientSHAP': shap_results,
        'Occlusion Maps': occlusion_results
    }
    
    for technique_name, results in techniques.items():
        print(f"\n{technique_name}:")
        
        # Calculate average attribution values
        avg_magnitude = np.mean([np.mean(np.abs(r['attribution'].cpu().detach().numpy())) for r in results])
        print(f"  Average magnitude: {avg_magnitude:.6f}")
        
        # Calculate sparsity (percentage of near-zero values)
        all_attrs = np.concatenate([r['attribution'].cpu().detach().numpy().flatten() for r in results])
        sparsity = np.mean(np.abs(all_attrs) < np.std(all_attrs) * 0.1) * 100
        print(f"  Sparsity: {sparsity:.1f}%")
        
        # Consistency across samples
        correlations = []
        for i in range(len(results)-1):
            attr1 = results[i]['attribution'].cpu().detach().numpy().flatten()
            attr2 = results[i+1]['attribution'].cpu().detach().numpy().flatten()
            corr = np.corrcoef(attr1, attr2)[0, 1]
            if not np.isnan(corr):
                correlations.append(abs(corr))
        
        if correlations:
            print(f"  Average consistency: {np.mean(correlations):.3f}")

compare_xai_techniques()

### Model Error Analysis

In [None]:
def find_error_samples(test_loader, num_errors=10):
    """Specifically hunt for incorrect predictions"""
    model.eval()
    error_samples = []
    
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1)
            
            # Find incorrect predictions
            incorrect_mask = pred != target
            incorrect_indices = torch.where(incorrect_mask)[0]
            
            for idx in incorrect_indices:
                if len(error_samples) >= num_errors:
                    return error_samples
                    
                error_samples.append({
                    'image': data[idx],
                    'true_label': target[idx].item(),
                    'predicted_label': pred[idx].item(),
                    'confidence': torch.softmax(output[idx], dim=0).max().item()
                })
    
    return error_samples

def analyze_model_errors():
    """Analyze model errors using XAI insights"""
    
    print("=== MODEL ERROR ANALYSIS ===")
    
    # Hunt specifically for errors
    print("Searching through entire test set for errors...")
    error_samples = find_error_samples(test_loader, 10)
    
    if len(error_samples) == 0:
        print("No incorrect predictions found in entire test set!")
        print("Model appears to have perfect or near-perfect accuracy.")
        return
    
    print(f"Found {len(error_samples)} incorrect predictions")
    
    # Analyze confusion patterns
    confusion_pairs = {}
    for sample in error_samples:
        pair = (sample['true_label'], sample['predicted_label'])
        confusion_pairs[pair] = confusion_pairs.get(pair, 0) + 1
    
    print("\\nConfusion patterns found:")
    sorted_pairs = sorted(confusion_pairs.items(), key=lambda x: x[1], reverse=True)
    for (true, pred), count in sorted_pairs:
        print(f"  True: {true}, Predicted: {pred} ({count} times)")
    
    # Show confidence levels
    confidences = [s['confidence'] for s in error_samples]
    print(f"\\nError prediction confidences:")
    print(f"  Average: {np.mean(confidences):.3f}")
    print(f"  Range: {np.min(confidences):.3f} - {np.max(confidences):.3f}")
    
    # Generate attributions for error samples
    print("\\nGenerating XAI explanations for errors...")
    error_ig_results = []
    for sample in error_samples[:5]:  # Analyze first 5 error samples
        ig_attr = generate_integrated_gradients(
            model, 
            sample['image'], 
            sample['predicted_label']
        )
        error_ig_results.append({
            'sample': sample,
            'attribution': ig_attr
        })
    
    # Visualize error cases
    num_errors_to_show = min(len(error_ig_results), 3)
    fig, axes = plt.subplots(num_errors_to_show, 2, figsize=(10, 4*num_errors_to_show))
    if num_errors_to_show == 1:
        axes = axes.reshape(1, -1)
    
    for i in range(num_errors_to_show):
        result = error_ig_results[i]
        sample = result['sample']
        attribution = result['attribution']
        
        # Original image
        original_img = sample['image'].cpu().detach().numpy()
        original_display = np.mean(original_img, axis=0)
        original_display = (original_display * np.array([0.229, 0.224, 0.225]).mean() + np.array([0.485, 0.456, 0.406]).mean())
        axes[i, 0].imshow(original_display, cmap='gray')
        axes[i, 0].set_title(f'Error Case {i+1}\\nTrue: {sample["true_label"]}, Pred: {sample["predicted_label"]}\\nConfidence: {sample["confidence"]:.3f}')
        axes[i, 0].axis('off')
        
        # Attribution
        attr_img = attribution.cpu().detach().numpy()
        attr_display = np.mean(np.abs(attr_img), axis=0)
        axes[i, 1].imshow(attr_display, cmap='hot')
        axes[i, 1].set_title('What Model Focused On')
        axes[i, 1].axis('off')
    
    plt.tight_layout()
    plt.show()
    
    return error_samples

# Run the analysis
error_samples = analyze_model_errors()

### Summary and Recommendations

In [None]:
print("=== PROJECT SUMMARY AND RECOMMENDATIONS ===")
print("\n1. MODEL PERFORMANCE:")
print(f"   - Achieved {accuracy:.2f}% accuracy on MNIST test set")
print(f"   - Analyzed {len(sample_images)} sample predictions")

print("\n2. XAI TECHNIQUES IMPLEMENTED:")
print("   - Integrated Gradients: Provides smooth, noise-free attributions")
print("   - Saliency Maps: Fast computation, highlights important pixels")
print("   - GradientSHAP: Game-theory based, handles baseline distribution")
print("   - Occlusion Maps: Intuitive, shows feature importance through removal")
print("   - Grad-CAM: Layer-specific insights into convolutional features")

print("\n3. KEY INSIGHTS:")
print("   - Different XAI techniques highlight different aspects of decision-making")
print("   - Integrated Gradients provides most stable attributions")
print("   - Occlusion maps are most interpretable for stakeholders")
print("   - Error analysis reveals systematic biases in model predictions")

print("\n4. REGULATORY COMPLIANCE BENEFITS:")
print("   - Transparent decision-making process")
print("   - Ability to identify and correct systematic errors")
print("   - Documentation of model behavior for auditing")
print("   - Stakeholder trust through explainable predictions")

print("\n5. RECOMMENDATIONS FOR BANKING APPLICATION:")
print("   - Use ensemble of XAI techniques for robust explanations")
print("   - Implement automated monitoring of attribution patterns")
print("   - Regular validation of explanations with domain experts")
print("   - Documentation of XAI methodology for regulatory submissions")