# Data Exploration: Rotterdam Dataset

This notebook explores the Rotterdam collaboration dataset for pipeline development and feasibility testing.

**Note:** This data is for exploratory purposes only and will not be used for publication results.

In [None]:
import sys
sys.path.append('..')

import numpy as np
import matplotlib.pyplot as plt
import nibabel as nib
from pathlib import Path
import pandas as pd
from IPython.display import display

# Set plotting style
# NOTE(review): the 'seaborn-v0_8-*' style names appear to require
# Matplotlib >= 3.6 — confirm the environment's version.
plt.style.use('seaborn-v0_8-darkgrid')
# IPython line magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

## 1. Dataset Overview

Let's start by examining the Rotterdam dataset structure and contents.

In [None]:
# Location of the Rotterdam data, relative to this notebook
data_dir = Path('../data/rotterdam')

if data_dir.exists():
    print(f"✓ Data directory found: {data_dir}")

    # Walk the tree recursively and keep only regular files, in sorted order
    print("\nDirectory structure:")
    file_paths = [entry for entry in sorted(data_dir.rglob('*')) if entry.is_file()]
    for file_path in file_paths:
        # Show each path relative to the data directory root
        print(f"  {file_path.relative_to(data_dir)}")
else:
    # Nothing to list; tell the user where the data is expected
    print(f"⚠️  Data directory not found: {data_dir}")
    print("Please place the Rotterdam data in the data/rotterdam/ directory.")

## 2. Load and Inspect Sample Volume

Load a sample CT volume and inspect its properties.

In [None]:
# Collect every NIfTI volume (.nii.gz and .nii) under the data directory.
# rglob() yields paths in filesystem order, which differs between machines,
# so sort the list: the "first" file, and every later cell that iterates
# over image_files, is then reproducible.
image_files = sorted(
    list(data_dir.rglob('*.nii.gz')) + list(data_dir.rglob('*.nii'))
)

if len(image_files) == 0:
    print("No NIfTI files found in the data directory.")
else:
    sample_file = image_files[0]
    print(f"Loading: {sample_file.name}")

    # Load volume
    img = nib.load(str(sample_file))
    volume = img.get_fdata()

    # Display properties
    print(f"\nVolume properties:")
    print(f"  Shape: {volume.shape}")
    # get_fdata() always returns floating point (float64 by default), so
    # volume.dtype alone says nothing about the file; report the stored
    # dtype from the header alongside the in-memory one.
    print(f"  Data type: {img.get_data_dtype()} (on disk), {volume.dtype} (in memory)")
    print(f"  Voxel dimensions: {img.header.get_zooms()} mm")
    print(f"  Intensity range: [{volume.min():.1f}, {volume.max():.1f}]")
    print(f"  Mean intensity: {volume.mean():.1f}")
    print(f"  Std intensity: {volume.std():.1f}")

## 3. Visualize Sample Slices

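Visualize the middle axial, sagittal, and coronal slices of the sample CT volume. The view names assume the array axes are ordered (left–right, anterior–posterior, inferior–superior); check the image orientation if a view looks wrong.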

In [None]:
if len(image_files) > 0:
    # Voxel spacing (mm) along array axes 0, 1, 2. Read it from the sample
    # file's header so it always corresponds to `volume`, whatever `img`
    # currently points to. nib.load() does not read the image data, so this
    # is cheap.
    spacing_x, spacing_y, spacing_z = (
        float(s) for s in nib.load(str(sample_file)).header.get_zooms()[:3]
    )

    # Get middle slices
    axial_slice = volume[:, :, volume.shape[2] // 2]
    sagittal_slice = volume[volume.shape[0] // 2, :, :]
    coronal_slice = volume[:, volume.shape[1] // 2, :]

    # Each entry: (title, slice, mm per image row, mm per image column).
    # After the transpose applied below, image rows follow the slice's second
    # axis and image columns follow its first axis.
    views = [
        ('Axial View', axial_slice, spacing_y, spacing_x),
        ('Sagittal View', sagittal_slice, spacing_z, spacing_y),
        ('Coronal View', coronal_slice, spacing_z, spacing_x),
    ]

    # Create figure
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))

    for ax, (title, plane, row_mm, col_mm) in zip(axes, views):
        # imshow draws square pixels by default, which stretches or squashes
        # views whose voxels are anisotropic. Scale pixel height/width by the
        # physical spacing; fall back to square pixels if the header spacing
        # is not usable.
        if row_mm > 0 and col_mm > 0:
            aspect = row_mm / col_mm
        else:
            aspect = 'equal'
        ax.imshow(plane.T, cmap='gray', origin='lower', aspect=aspect)
        ax.set_title(title)
        ax.axis('off')

    plt.tight_layout()
    plt.show()

## 4. Intensity Distribution Analysis

Analyze the intensity distribution (Hounsfield Units) in the CT volume.

In [None]:
if len(image_files) > 0:
    # Plot histogram
    fig, ax = plt.subplots(1, 1, figsize=(10, 5))

    # 1-D view of the voxels. Element order is irrelevant for a histogram, so
    # order='K' lets NumPy return a view for any contiguous layout instead of
    # copying the whole volume the way flatten() always does.
    flat_volume = volume.ravel(order='K')

    # Restrict the plotted range to the 1st-99th percentiles so a few extreme
    # values do not compress the bulk of the distribution.
    p1, p99 = np.percentile(flat_volume, [1, 99])

    # Mean over the full volume (not just the plotted range); computed once
    # and reused for both the marker line and its label.
    mean_intensity = volume.mean()

    ax.hist(flat_volume, bins=100, range=(p1, p99), alpha=0.7, color='blue')
    ax.axvline(x=mean_intensity, color='red', linestyle='--', label=f'Mean: {mean_intensity:.1f}')
    ax.set_xlabel('Intensity (HU)')
    ax.set_ylabel('Frequency')
    ax.set_title('CT Intensity Distribution')
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 5. Dataset Summary

Create a summary table of the available cases (limited to the first 10 for quick exploration).

In [None]:
if len(image_files) > 0:
    # Upper bound on the number of volumes loaded for the summary; each one is
    # read fully into memory, so keep this small for quick exploration.
    MAX_CASES = 10

    # Collect metadata for all volumes
    metadata = []

    # sorted() makes the selected subset independent of directory listing order.
    for case_path in sorted(image_files)[:MAX_CASES]:
        # Loop-local names on purpose: `img` holds the sample image loaded
        # earlier and must stay paired with `volume`, so it is not rebound here.
        case_img = nib.load(str(case_path))
        case_vol = case_img.get_fdata()
        # Read the voxel spacing from the header once per file.
        spacing = case_img.header.get_zooms()

        metadata.append({
            'filename': case_path.name,
            'shape_x': case_vol.shape[0],
            'shape_y': case_vol.shape[1],
            'shape_z': case_vol.shape[2],
            'spacing_x': spacing[0],
            'spacing_y': spacing[1],
            'spacing_z': spacing[2],
            'min_intensity': case_vol.min(),
            'max_intensity': case_vol.max(),
            'mean_intensity': case_vol.mean(),
        })

    # Create DataFrame
    df = pd.DataFrame(metadata)

    print(f"Dataset Summary (showing {len(df)} cases):")
    display(df)

    print("\nSummary Statistics:")
    display(df.describe())

## 6. Next Steps

Based on this exploration, the next steps are:

1. **Data preprocessing pipeline**: Define appropriate intensity windowing (HU range)
2. **Resampling strategy**: Determine target spacing for consistent voxel dimensions
3. **Patch extraction**: Design patch size and sampling strategy for training
4. **Augmentation**: Plan data augmentation techniques suitable for CT angiography

These insights will inform the preprocessing pipeline for the main ImageCAS dataset.