# Preprocessing Pipeline Testing and Validation

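This notebook exercises and validates the complete image preprocessing pipeline for diabetic retinopathy images. The pipeline itself is implemented in the project's `src.data.preprocessing` module; this notebook imports it and tests it.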

## Pipeline Steps:
1. **Black Border Cropping**: Remove uninformative black borders
2. **CLAHE**: Contrast Limited Adaptive Histogram Equalization
3. **Ben Graham Normalization**: Illumination correction
4. **Resize & Crop**: Resize (to 256 by default), then crop to the final 224x224 model input

In [None]:
# Import required libraries
import sys
import os

# Project code is imported in later cells as `src.<package>` (for example
# `src.data.preprocessing`), which only resolves when the directory that
# CONTAINS `src/` is on sys.path. '../src' is kept so that anything importing
# `data.*` / `utils.*` directly continues to work.
# Both entries are relative to the kernel's working directory -- assumed to be
# this notebook's folder, one level below the project root (TODO confirm).
# The membership test keeps re-runs of this cell from growing sys.path.
for extra_path in ('../src', '..'):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import warnings
# NOTE(review): this silences every warning for the whole kernel session,
# including ones raised by the preprocessing code under test.
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure in this notebook
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

# Report which library versions the kernel picked up
print(
    "Libraries imported successfully!",
    f"NumPy version: {np.__version__}",
    f"Matplotlib version: {plt.matplotlib.__version__}",
    sep="\n",
)

In [None]:
# Probe for OpenCV. Later cells consult `opencv_available` before doing any work.
try:
    import cv2
except ImportError:
    opencv_available = False
    print("ERROR: OpenCV not installed. Please install with: uv add opencv-python")
    print("Note: Some preprocessing functions will not work without OpenCV.")
else:
    opencv_available = True
    print(f"SUCCESS: OpenCV version: {cv2.__version__}")

In [None]:
# Bring in the project's preprocessing code -- attempted only when OpenCV was found
if not opencv_available:
    print("WARNING: Skipping preprocessing module imports due to missing OpenCV")
else:
    # Preprocessor class plus image load/save helpers
    from src.data.preprocessing import (
        RetinaPreprocessor,
        load_image,
        save_image,
    )
    # Test-image generation, visualization, statistics and test-runner helpers
    from src.utils.preprocessing_utils import create_test_image
    from src.utils.preprocessing_utils import visualize_preprocessing_steps
    from src.utils.preprocessing_utils import compare_before_after
    from src.utils.preprocessing_utils import analyze_image_statistics
    from src.utils.preprocessing_utils import test_preprocessing_robustness
    from src.utils.preprocessing_utils import run_all_tests

    print("SUCCESS: Preprocessing modules imported successfully!")

## 1. Initialize Preprocessor with Configuration

In [None]:
if opencv_available:
    # Single source of truth for the fallback parameters. They are used both
    # for keys missing from the config and when the config cannot be used at all.
    DEFAULT_PREPROCESSOR_PARAMS = {
        'clahe_clip_limit': 2.0,
        'clahe_tile_grid_size': (8, 8),
        'ben_graham_sigma': 10.0,
        'resize_to': 256,  # Before crop
        'crop_to': 224,    # Final size
    }

    # Initialize preprocessor with configuration from base_config.yaml
    try:
        from src.utils.config import get_preprocessing_config

        # Load preprocessing config
        config = get_preprocessing_config()
        print("Preprocessing configuration:")
        for key, value in config.items():
            print(f"  {key}: {value}")

        clahe_config = config.get('clahe', {})
        ben_graham_config = config.get('ben_graham', {})

        # `image_size` may be a sequence (its first element is used, as before)
        # or a single number. Previously a scalar raised inside this `try`,
        # which silently threw away the whole config in favour of the defaults.
        image_size = config.get('image_size', DEFAULT_PREPROCESSOR_PARAMS['crop_to'])
        if isinstance(image_size, (list, tuple)):
            crop_to = image_size[0]
        else:
            crop_to = image_size

        # Initialize preprocessor with config values
        preprocessor = RetinaPreprocessor(
            clahe_clip_limit=clahe_config.get(
                'clip_limit', DEFAULT_PREPROCESSOR_PARAMS['clahe_clip_limit']
            ),
            clahe_tile_grid_size=tuple(
                clahe_config.get(
                    'tile_grid_size', DEFAULT_PREPROCESSOR_PARAMS['clahe_tile_grid_size']
                )
            ),
            ben_graham_sigma=ben_graham_config.get(
                'sigma', DEFAULT_PREPROCESSOR_PARAMS['ben_graham_sigma']
            ),
            # Not read from the config: the pre-crop size is fixed here.
            resize_to=DEFAULT_PREPROCESSOR_PARAMS['resize_to'],
            crop_to=crop_to,
        )

    except Exception as e:
        # Deliberate best-effort: any failure while importing, loading or
        # applying the config falls back to the defaults so the rest of the
        # notebook can still run. The reason is printed rather than hidden.
        print(f"Could not load config: {e}")
        print("Using default parameters...")

        # Initialize with default parameters
        preprocessor = RetinaPreprocessor(**DEFAULT_PREPROCESSOR_PARAMS)

    # Echo the values the preprocessor actually ended up with
    print("\nSUCCESS: Preprocessor initialized successfully!")
    print("Configuration:")
    print(f"  CLAHE clip limit: {preprocessor.clahe_clip_limit}")
    print(f"  CLAHE tile grid size: {preprocessor.clahe_tile_grid_size}")
    print(f"  Ben Graham sigma: {preprocessor.ben_graham_sigma}")
    print(f"  Resize to: {preprocessor.resize_to}")
    print(f"  Final crop to: {preprocessor.crop_to}")
else:
    print("WARNING: Skipping preprocessor initialization due to missing OpenCV")

## 2. Create Synthetic Test Images

Since we don't have real retinal images yet, we'll create synthetic test images to validate our preprocessing pipeline.

In [None]:
if opencv_available:
    # Create test images with different sizes
    test_sizes = [(400, 400), (600, 800), (512, 512)]
    test_images = []

    for i, size in enumerate(test_sizes):
        test_img = create_test_image(size)
        test_images.append(test_img)
        print(f"Created test image {i+1}: {size} -> {test_img.shape}")

    # Display the test images.
    # squeeze=False makes plt.subplots always return a 2-D array of Axes.
    # Without it, a single-entry `test_sizes` yields one bare Axes object and
    # indexing into it fails.
    fig, axes = plt.subplots(1, len(test_images), figsize=(15, 5), squeeze=False)
    for i, (ax, img) in enumerate(zip(axes[0], test_images)):
        ax.imshow(img)
        # Title shows the first two entries of the array shape
        ax.set_title(f'Test Image {i+1}\n{img.shape[0]}x{img.shape[1]}')
        ax.axis('off')

    plt.suptitle('Synthetic Test Images', fontsize=14)
    plt.tight_layout()
    plt.show()

    print(f"\nSUCCESS: Created {len(test_images)} synthetic test images")
else:
    print("WARNING: Skipping test image creation due to missing OpenCV")

## 3. Test Individual Preprocessing Steps

In [None]:
if not opencv_available or len(test_images) == 0:
    print("WARNING: Skipping individual step testing due to missing dependencies")
else:
    # The first synthetic image is walked through every stage in order
    test_image = test_images[0]

    print("Testing individual preprocessing steps...", "=" * 50, sep="\n")

    # Baseline statistics before any processing
    original_stats = analyze_image_statistics(test_image, "Original Image")

    # (announcement, stage method, statistics label) -- each stage consumes
    # the output of the one before it
    stages = (
        ("\n1. Testing black border cropping...",
         preprocessor.crop_black_borders, "After Border Cropping"),
        ("\n2. Testing CLAHE...",
         preprocessor.apply_clahe, "After CLAHE"),
        ("\n3. Testing Ben Graham normalization...",
         preprocessor.ben_graham_normalization, "After Ben Graham"),
        ("\n4. Testing resize and crop...",
         preprocessor.resize_and_crop, "Final Processed"),
    )

    stage_outputs = []
    stage_stats = []
    current = test_image
    for announcement, stage, label in stages:
        print(announcement)
        current = stage(current)
        stage_outputs.append(current)
        stage_stats.append(analyze_image_statistics(current, label))

    # Expose every intermediate image and its statistics under its own name
    cropped, clahe_applied, normalized, final = stage_outputs
    cropped_stats, clahe_stats, normalized_stats, final_stats = stage_stats

    print("\nSUCCESS: All individual steps completed successfully!")

## 4. Visualize Complete Preprocessing Pipeline

In [None]:
if not opencv_available or len(test_images) == 0:
    print("WARNING: Skipping pipeline visualization due to missing dependencies")
else:
    # Hand the first synthetic image and the preprocessor to the visualization helper
    test_image = test_images[0]

    print("Visualizing complete preprocessing pipeline...")
    visualize_preprocessing_steps(test_image, preprocessor)

    print("SUCCESS: Pipeline visualization completed!")

## 5. Test Complete Pipeline on Multiple Images

In [None]:
if opencv_available and len(test_images) > 0:
    # Process all test images through the complete pipeline
    processed_images = []

    print("Processing all test images through complete pipeline...")

    for i, img in enumerate(test_images):
        print(f"Processing image {i+1}...")
        processed = preprocessor.preprocess(img)
        processed_images.append(processed)
        print(f"  Input: {img.shape} -> Output: {processed.shape}")

    # Compare before and after
    titles = [f"Image {i+1}" for i in range(len(test_images))]
    compare_before_after(test_images, processed_images, titles)

    print(f"\nSUCCESS: Successfully processed {len(processed_images)} images!")

    # Check the consistency claim instead of just asserting it in text:
    # collect the distinct output shapes and report a failure if there is
    # more than one.
    output_shapes = {tuple(processed.shape) for processed in processed_images}
    if len(output_shapes) == 1:
        print(f"All outputs have consistent shape: {processed_images[0].shape}")
    else:
        print(f"FAILURE: Outputs have inconsistent shapes: {sorted(output_shapes)}")
else:
    print("WARNING: Skipping multi-image processing due to missing dependencies")

## 6. Robustness Testing

In [None]:
if not opencv_available:
    print("WARNING: Skipping robustness testing due to missing OpenCV")
else:
    print("Running robustness tests...", "=" * 50, sep="\n")

    # Delegate to the project's robustness helper, asking for 10 test images;
    # its return value is used as a pass/fail flag
    robustness_passed = test_preprocessing_robustness(preprocessor, n_test_images=10)

    robustness_verdict = (
        "\nSUCCESS: All robustness tests passed!"
        if robustness_passed
        else "\nFAILURE: Some robustness tests failed!"
    )
    print(robustness_verdict)

## 7. Unit Tests

In [None]:
if not opencv_available:
    print("WARNING: Skipping unit tests due to missing OpenCV")
else:
    print(
        "Running unit tests for individual preprocessing functions...",
        "=" * 60,
        sep="\n",
    )

    # The project's test runner returns a value used here as a pass/fail flag
    all_tests_passed = run_all_tests()

    unit_test_verdict = (
        "\nSUCCESS: All unit tests passed! The preprocessing pipeline is working correctly."
        if all_tests_passed
        else "\nFAILURE: Some unit tests failed! Please check the implementation."
    )
    print(unit_test_verdict)

## 8. Performance Analysis

In [None]:
if opencv_available and len(test_images) > 0:
    import time

    print("Analyzing preprocessing performance...")
    print("=" * 40)

    # Test processing speed on the first synthetic image
    test_image = test_images[0]
    n_iterations = 10

    crop_borders = preprocessor.crop_black_borders
    apply_clahe = preprocessor.apply_clahe
    ben_graham = preprocessor.ben_graham_normalization

    # Each entry: (label, stage under test, factory producing that stage's input).
    # The factory runs OUTSIDE the timed region, so a stage's figure covers only
    # that stage. Previously the upstream stages were rebuilt inside the timed
    # loop, so the "CLAHE", "Ben Graham" and "Resize & Crop" numbers also
    # included the cost of every stage before them.
    # A fresh input is still built on every iteration, as before, so no stage
    # is ever handed an array that an earlier iteration has already seen.
    steps = [
        ("Border Cropping", crop_borders,
         lambda: test_image),
        ("CLAHE", apply_clahe,
         lambda: crop_borders(test_image)),
        ("Ben Graham", ben_graham,
         lambda: apply_clahe(crop_borders(test_image))),
        ("Resize & Crop", preprocessor.resize_and_crop,
         lambda: ben_graham(apply_clahe(crop_borders(test_image)))),
        ("Complete Pipeline", preprocessor.preprocess,
         lambda: test_image),
    ]

    for step_name, step_func, make_input in steps:
        elapsed = 0.0
        for _ in range(n_iterations):
            step_input = make_input()  # untimed setup
            # perf_counter is the high-resolution clock intended for timing
            # short intervals; time.time() can be too coarse for ms-scale work.
            started = time.perf_counter()
            step_func(step_input)
            elapsed += time.perf_counter() - started

        avg_time = elapsed / n_iterations
        print(f"{step_name:20s}: {avg_time*1000:.2f} ms per image")

    # Calculate images per second for complete pipeline
    start_time = time.perf_counter()
    for _ in range(n_iterations):
        _ = preprocessor.preprocess(test_image)
    end_time = time.perf_counter()

    total_time = end_time - start_time
    images_per_second = n_iterations / total_time

    print(f"\nThroughput: {images_per_second:.2f} images/second")
    print(f"Time per image: {1000/images_per_second:.2f} ms")

    print("\nSUCCESS: Performance analysis completed!")
else:
    print("WARNING: Skipping performance analysis due to missing dependencies")

## 9. Save Test Results

In [None]:
if opencv_available and len(test_images) > 0:
    # Create results directory (relative to the kernel's working directory)
    results_dir = Path("../results/preprocessing_tests")
    results_dir.mkdir(parents=True, exist_ok=True)

    print(f"Saving test results to {results_dir}...")

    # Paths this run asked the helpers to write. The final listing is built
    # from this list rather than from a directory glob, so PNGs left over
    # from earlier runs are not reported as "saved" and the order is stable.
    saved_paths = []

    # Save processed test images. `processed_images` comes from the earlier
    # multi-image processing cell; zip stops at the shorter of the two lists.
    for i, (original, processed) in enumerate(zip(test_images, processed_images)):
        # Save original
        original_path = results_dir / f"test_image_{i+1}_original.png"
        save_image(original, str(original_path))

        # Save processed
        processed_path = results_dir / f"test_image_{i+1}_processed.png"
        save_image(processed, str(processed_path))

        saved_paths.extend([original_path, processed_path])
        print(f"  Saved test image {i+1} pair")

    # Save pipeline visualization
    # NOTE(review): the third argument is presumably the output path -- confirm
    # against visualize_preprocessing_steps.
    visualization_path = results_dir / "pipeline_visualization.png"
    visualize_preprocessing_steps(test_images[0], preprocessor, str(visualization_path))
    saved_paths.append(visualization_path)

    print(f"\nSUCCESS: Test results saved to: {results_dir}")
    print("Files saved:")
    for file_path in saved_paths:
        if file_path.exists():
            print(f"  - {file_path.name}")
        else:
            # Surface a helper that did not produce its file instead of hiding it
            print(f"  - {file_path.name} (WARNING: not found on disk)")
else:
    print("WARNING: Skipping result saving due to missing dependencies")

## Summary

### Preprocessing Pipeline Implementation Complete!

**Implemented Components:**
1. **Black Border Cropping** - Removes uninformative borders
2. **CLAHE** - Enhances local contrast for better lesion visibility
3. **Ben Graham Normalization** - Corrects illumination variations
4. **Resize & Crop** - Standardizes to 224x224 for model input

**Key Features:**
- Vectorized NumPy/OpenCV operations for speed
- Comprehensive unit tests for each component
- Performance analysis and robustness testing
- Visualization tools for pipeline validation
- Configuration-driven parameters from YAML

**Next Steps:**
1. Install required packages: `uv add opencv-python albumentations`
2. Add real retinal image datasets to `data/raw/`
3. Run this notebook with real data for validation
4. Proceed to Step 2.2: Data Augmentation & Dataset Implementation

In [None]:
# Closing status report: a banner, an OpenCV-dependent summary, then next steps
banner = "=" * 60
print("\n" + banner, "PREPROCESSING PIPELINE STATUS", banner, sep="\n")

if not opencv_available:
    status_lines = (
        "ERROR: OpenCV: Not available",
        "WARNING: Please install: uv add opencv-python",
        "WARNING: Then restart this notebook",
    )
else:
    status_lines = (
        "SUCCESS: OpenCV: Available",
        "SUCCESS: Preprocessing module: Implemented",
        "SUCCESS: Utility functions: Implemented",
        "SUCCESS: Unit tests: Available",
        "SUCCESS: Performance testing: Available",
        "SUCCESS: Visualization tools: Available",
        "\nREADY FOR REAL DATA TESTING!",
    )
print(*status_lines, sep="\n")

# Next steps are printed whether or not OpenCV was found
print(
    "\nTo proceed:",
    "1. Install missing packages if any",
    "2. Add real retinal images to data/raw/",
    "3. Re-run this notebook with real data",
    "4. Move to Step 2.2: Data Augmentation & Dataset Classes",
    sep="\n",
)