# 3D Medical Image Data Analysis

This notebook analyzes the 3D NIfTI dataset to extract all necessary information for pipeline configuration.
We will determine:
- Number of images and labels
- Image dimensions (depth, height, width)
- Number of channels
- Number of classes and label mapping
- Data types and value ranges
- Memory requirements
- Consistency checks

In [None]:
# ========================
# Import Libraries
# ========================
# nibabel loads the NIfTI volumes; numpy/pandas hold the per-file statistics.
import nibabel as nib
import numpy as np
import pandas as pd
from pathlib import Path
import os
from collections import Counter
import warnings
# NOTE: this silences every warning category for the rest of the session,
# not just the ones emitted while loading images.
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

In [None]:
# ========================
# Setup Paths
# ========================
# Dataset root and the nested image/label folders read by the later cells.
BASE_DIR = Path('C:/FeatureEx')
IMAGES_DIR = BASE_DIR.joinpath('imagesTr', 'imagesTr')
LABELS_DIR = BASE_DIR.joinpath('labelsTr', 'labelsTr')

# Echo both locations first, then whether each one is present on disk.
for title, folder in (("Images", IMAGES_DIR), ("Labels", LABELS_DIR)):
    print(f"{title} directory: {folder}")

print()
for title, folder in (("Images", IMAGES_DIR), ("Labels", LABELS_DIR)):
    print(f"{title} exist: {folder.exists()}")

In [None]:
# ========================
# Get File Lists
# ========================
# Collect every path whose name contains ".nii" (covers .nii and .nii.gz),
# sorted so that later cells see a stable order.
image_files = sorted(IMAGES_DIR.glob('*.nii*'))
label_files = sorted(LABELS_DIR.glob('*.nii*'))

print(f"Found {len(image_files)} image files")
print(f"Found {len(label_files)} label files")

# Preview the first few names from each list
for heading, paths in (("images", image_files), ("labels", label_files)):
    print(f"\nFirst 5 {heading}:")
    for path in paths[:5]:
        print(f"  {path.name}")

In [None]:
# ========================
# Analyze First Image
# ========================
# Inspect one volume in detail: file size, header metadata (stored dtype,
# voxel spacing, axis orientation) and intensity statistics of the loaded
# array. Sets num_channels/depth/height/width from this first file.
print("Analyzing first image file...\n")

if not image_files:
    raise FileNotFoundError(f"No NIfTI images found in {IMAGES_DIR}")

first_image_path = image_files[0]
print(f"File: {first_image_path.name}")
print(f"Size: {first_image_path.stat().st_size / (1024**2):.2f} MB")

# Load with nibabel
img_nib = nib.load(first_image_path)
# get_fdata() always returns a floating-point array (float64 by default),
# so the stored dtype has to be read from the header instead.
img_data = img_nib.get_fdata()

print(f"\nNIfTI Header Information:")
print(f"  Shape: {img_nib.shape}")
print(f"  Data type: {img_nib.get_data_dtype()}")
print(f"  Affine matrix shape: {img_nib.affine.shape}")
# Voxel spacing comes from the header zooms (first three = spatial axes).
# aff2axcodes() returns axis orientation codes, reported on its own line.
voxel_size = tuple(float(z) for z in img_nib.header.get_zooms()[:3])
print(f"  Voxel size: {voxel_size}")
print(f"  Orientation: {nib.aff2axcodes(img_nib.affine)}")

print(f"\nData Array Information:")
print(f"  Array shape: {img_data.shape}")
print(f"  Array dtype: {img_data.dtype}")
print(f"  Min value: {img_data.min():.6f}")
print(f"  Max value: {img_data.max():.6f}")
print(f"  Mean value: {img_data.mean():.6f}")
print(f"  Std value: {img_data.std():.6f}")

# Determine dimensions
# NOTE(review): the array axes are in the file's voxel (i, j, k) order, so
# whether axis 0 really is depth/Z depends on the orientation printed above —
# confirm against the layout the pipeline expects.
if img_data.ndim == 3:
    depth, height, width = img_data.shape
    num_channels = 1
elif img_data.ndim == 4:
    # NIfTI keeps the three spatial axes first; the 4th axis indexes
    # volumes/channels, so it is the LAST element of the shape.
    depth, height, width, num_channels = img_data.shape

if img_data.ndim in (3, 4):
    print(f"\nImage Configuration:")
    print(f"  Channels: {num_channels}")
    print(f"  Depth (Z): {depth}")
    print(f"  Height (Y): {height}")
    print(f"  Width (X): {width}")
else:
    print(f"  Unexpected shape: {img_data.shape}")

In [None]:
# ========================
# Analyze All Images
# ========================
# Load every image once and collect per-file shape, stored dtype, intensity
# range and file size into df_images. Files that fail to load are reported
# and skipped.
print(f"Analyzing all {len(image_files)} image files...\n")

image_info = []

for idx, img_path in enumerate(image_files, 1):
    try:
        img_nib = nib.load(img_path)
        img_data = img_nib.get_fdata()

        info = {
            'filename': img_path.name,
            'shape': img_data.shape,
            # get_fdata() always yields a floating-point array, so its dtype
            # says nothing about the file; record the stored dtype instead.
            'dtype': img_nib.get_data_dtype(),
            'min': img_data.min(),
            'max': img_data.max(),
            'mean': img_data.mean(),
            'size_mb': img_path.stat().st_size / (1024**2)
        }
        image_info.append(info)

        if idx % 10 == 0:
            print(f"  Processed {idx}/{len(image_files)} images")
    except Exception as e:
        # Best-effort scan: report the failure and continue with the rest.
        print(f"  ERROR loading {img_path.name}: {e}")

print(f"\nSuccessfully analyzed {len(image_info)} images\n")

# Convert to DataFrame for analysis. Naming the columns keeps them present
# even when no file could be loaded, so later column lookups do not KeyError.
df_images = pd.DataFrame(
    image_info,
    columns=['filename', 'shape', 'dtype', 'min', 'max', 'mean', 'size_mb'],
)
print("Image Statistics:")
print(df_images.head(10))

In [None]:
# ========================
# Check Shape Consistency
# ========================
# Tally how many images share each array shape and derive the pipeline's
# num_channels/depth/height/width from the most frequent one.
print("\nShape Consistency Check:")
print("="*70)

if df_images.empty:
    raise ValueError("No images were analyzed; cannot determine a common shape.")

# One pass over the column; most_common() orders by descending count.
shape_counts = Counter(df_images['shape']).most_common()
unique_shapes = [shape for shape, _ in shape_counts]
print(f"Number of unique shapes: {len(unique_shapes)}")
print(f"Shapes found:")
for shape, count in shape_counts:
    print(f"  {shape}: {count} images")

# Most common shape
most_common_shape = shape_counts[0][0]
print(f"\nMost common shape: {most_common_shape}")

# Get dimensions from most common shape
# NOTE(review): axes are in the file's voxel (i, j, k) order; confirm that
# axis 0 is the depth axis the pipeline expects.
if len(most_common_shape) == 3:
    depth, height, width = most_common_shape
    num_channels = 1
elif len(most_common_shape) == 4:
    # NIfTI stores the spatial axes first; volumes/channels are the last axis.
    depth, height, width, num_channels = most_common_shape
else:
    # Fail loudly instead of silently reusing values left over from an
    # earlier cell.
    raise ValueError(
        f"Unsupported image rank {len(most_common_shape)} for shape {most_common_shape}"
    )

print(f"\n3D Configuration:")
print(f"  Channels: {num_channels}")
print(f"  Depth (Z): {depth}")
print(f"  Height (Y): {height}")
print(f"  Width (X): {width}")

In [None]:
# ========================
# Analyze All Labels
# ========================
# Load every label volume once, collect per-file statistics into df_labels
# and accumulate the set of label values seen anywhere in the dataset.
print(f"\nAnalyzing all {len(label_files)} label files...\n")

label_info = []
all_unique_labels = set()

for idx, label_path in enumerate(label_files, 1):
    try:
        label_nib = nib.load(label_path)
        label_data = label_nib.get_fdata()

        # Get unique labels in this file
        unique_labels = np.unique(label_data)
        # tolist() stores plain Python floats rather than NumPy scalars.
        all_unique_labels.update(unique_labels.tolist())

        info = {
            'filename': label_path.name,
            'shape': label_data.shape,
            # get_fdata() always yields a floating-point array; record the
            # stored dtype from the header instead.
            'dtype': label_nib.get_data_dtype(),
            'min': label_data.min(),
            'max': label_data.max(),
            'unique_labels': len(unique_labels),
            'size_mb': label_path.stat().st_size / (1024**2)
        }
        label_info.append(info)

        if idx % 10 == 0:
            print(f"  Processed {idx}/{len(label_files)} labels")
    except Exception as e:
        # Best-effort scan: report the failure and continue with the rest.
        print(f"  ERROR loading {label_path.name}: {e}")

print(f"\nSuccessfully analyzed {len(label_info)} label files\n")

# Convert to DataFrame. Naming the columns keeps them present even when no
# file could be loaded, so later column lookups do not KeyError.
df_labels = pd.DataFrame(
    label_info,
    columns=['filename', 'shape', 'dtype', 'min', 'max', 'unique_labels', 'size_mb'],
)
print("Label Statistics:")
print(df_labels.head(10))

In [None]:
# ========================
# Label Analysis
# ========================
# List the label values found across the dataset and count, for each class,
# how many label files contain it.
print("\nLabel Classes Found:")
print("="*70)

all_unique_labels = sorted(all_unique_labels)
print(f"Unique labels across all files: {all_unique_labels}")
print(f"Number of classes: {len(all_unique_labels)}")

# Label mapping
print(f"\nLabel Mapping:")
for label in all_unique_labels:
    if label == 0:
        print(f"  {label} = Background")
    else:
        print(f"  {label} = Class {int(label)}")

# Count frequency
print(f"\nLabel Distribution Across Dataset:")
label_counter = Counter()

for label_path in label_files:
    try:
        label_nib = nib.load(label_path)
        label_data = label_nib.get_fdata()
        # A set ensures each class is counted at most once per file, even if
        # several raw values truncate to the same integer.
        classes_in_file = {int(value) for value in np.unique(label_data)}
        label_counter.update(classes_in_file)
    except Exception as e:
        # Report unreadable files instead of silently dropping them from the
        # distribution.
        print(f"  ERROR loading {label_path.name}: {e}")

for label in sorted(label_counter):
    count = label_counter[label]
    pct = (count / len(label_files)) * 100
    if label == 0:
        print(f"  Background (0): {count}/{len(label_files)} files ({pct:.1f}%)")
    else:
        print(f"  Class {label}: {count}/{len(label_files)} files ({pct:.1f}%)")

In [None]:
# ========================
# Check Image-Label Matching
# ========================
# Pair images with labels by file name (ignoring the .nii / .nii.gz
# extension) and report files on either side that have no partner.
print("\nImage-Label Matching:")
print("="*70)

print(f"Number of images: {len(image_files)}")
print(f"Number of labels: {len(label_files)}")


def _nifti_basename(path):
    """Return the file name with a trailing .nii.gz or .nii removed."""
    name = path.name
    for suffix in ('.nii.gz', '.nii'):
        if name.lower().endswith(suffix):
            return name[:-len(suffix)]
    # Anything else matched by the glob: fall back to dropping one suffix.
    return path.stem


# Extract base names for matching. Path.stem alone would leave "case.nii"
# for "case.nii.gz", so a .nii.gz image could never pair with a .nii label.
image_basenames = {_nifti_basename(f): f for f in image_files}
label_basenames = {_nifti_basename(f): f for f in label_files}

print(f"\nImage basenames (first 5):")
for name in list(image_basenames)[:5]:
    print(f"  {name}")

print(f"\nLabel basenames (first 5):")
for name in list(label_basenames)[:5]:
    print(f"  {name}")

# Check if names match
matching_pairs = set(image_basenames) & set(label_basenames)
print(f"\nMatching image-label pairs: {len(matching_pairs)}/{len(image_files)}")

# Sorted so the printed examples are deterministic.
missing_in_labels = sorted(set(image_basenames) - set(label_basenames))
missing_in_images = sorted(set(label_basenames) - set(image_basenames))

if len(matching_pairs) != len(image_files):
    print(f"\n⚠️  WARNING: Not all images have matching labels!")
    if missing_in_labels:
        print(f"   Images without labels: {len(missing_in_labels)}")
        print(f"   Examples: {missing_in_labels[:5]}")
else:
    print(f"✓ All images have matching labels!")

# Orphaned labels are reported as well.
if missing_in_images:
    print(f"\n⚠️  WARNING: Not all labels have matching images!")
    print(f"   Labels without images: {len(missing_in_images)}")
    print(f"   Examples: {missing_in_images[:5]}")

In [None]:
# ========================
# Memory Analysis
# ========================
# Estimate the in-memory footprint of one volume, the whole dataset and a
# small batch, assuming every voxel is held as a 4-byte float32.
print("\nMemory Analysis:")
print("="*70)

# Single image memory: a channel count of 1 leaves the product unchanged,
# so one formula covers single- and multi-channel volumes.
bytes_per_voxel = 4  # assuming float32
single_image_bytes = num_channels * depth * height * width * bytes_per_voxel

single_image_mb = single_image_bytes / (1024**2)
total_dataset_mb = single_image_mb * len(image_files)
total_dataset_gb = total_dataset_mb / 1024

batch_size = 2
batch_mb = single_image_mb * batch_size

print(f"Per-image memory (float32):")
print(f"  Bytes: {single_image_bytes:,}")
print(f"  MB: {single_image_mb:.2f}")

print(f"\nTotal dataset memory:")
print(f"  MB: {total_dataset_mb:.2f}")
print(f"  GB: {total_dataset_gb:.2f}")

print(f"\nEstimated batch memory (batch_size=2):")
print(f"  MB: {batch_mb:.2f}")

In [None]:
# ========================
# CONFIGURATION SUMMARY
# ========================
# Gather the values computed by the earlier cells into one nested dict and
# print it section by section.
print("\n" + "="*70)
print("3D PIPELINE CONFIGURATION SUMMARY")
print("="*70 + "\n")

# Multi-channel volumes are described channels-first; single-channel ones
# omit the channel axis.
if num_channels > 1:
    shape_text = f"({num_channels}, {depth}, {height}, {width})"
else:
    shape_text = f"({depth}, {height}, {width})"

dataset_section = {
    'num_images': len(image_files),
    'num_labels': len(label_files),
    'matching_pairs': len(matching_pairs),
}
dimension_section = {
    'channels': num_channels,
    'depth': depth,
    'height': height,
    'width': width,
    'shape': shape_text,
}
label_section = {
    'unique_labels': list(all_unique_labels),
    'num_classes': len(all_unique_labels),
    'has_background': 0 in all_unique_labels,
}
dtype_section = {
    'image_dtype': str(df_images['dtype'].iloc[0]),
    'label_dtype': str(df_labels['dtype'].iloc[0]),
}
memory_section = {
    'per_image_mb': round(single_image_mb, 2),
    'total_dataset_mb': round(total_dataset_mb, 2),
    'total_dataset_gb': round(total_dataset_gb, 2),
}

config = {
    'dataset': dataset_section,
    'image_dimensions': dimension_section,
    'labels': label_section,
    'data_types': dtype_section,
    'memory': memory_section,
}

for section in config:
    print(f"{section.upper()}:")
    for key, value in config[section].items():
        print(f"  {key}: {value}")
    print()

In [None]:
# ========================
# Export Configuration
# ========================
# Write the dataset summary as JSON next to the data, casting every value to
# a plain Python type so the json module can serialize it.
import json

dataset_info = {
    'num_images': int(len(image_files)),
    'num_labels': int(len(label_files)),
    'matching_pairs': int(len(matching_pairs)),
}
image_dimensions = {
    'channels': int(num_channels),
    'depth': int(depth),
    'height': int(height),
    'width': int(width),
}
labels_summary = {
    'unique_labels': [int(x) for x in all_unique_labels],
    'num_classes': int(len(all_unique_labels)),
    # Stored as 0/1 rather than a JSON boolean.
    'has_background': int(0 in all_unique_labels),
}
data_types = {
    'image_dtype': str(df_images['dtype'].iloc[0]),
    'label_dtype': str(df_labels['dtype'].iloc[0]),
}
memory_mb = {
    'per_image': round(single_image_mb, 2),
    'total_dataset': round(total_dataset_mb, 2),
}

config_output = {
    'dataset_info': dataset_info,
    'image_dimensions': image_dimensions,
    'labels': labels_summary,
    'data_types': data_types,
    'memory_mb': memory_mb,
}

# Save as JSON
config_path = BASE_DIR / '3d_dataset_config.json'
with open(config_path, 'w') as f:
    json.dump(config_output, f, indent=2)

print(f"Configuration saved to: {config_path}")
print(f"\nConfiguration (JSON):")
print(json.dumps(config_output, indent=2))

In [None]:
# ========================
# Sample a Few Images
# ========================
# Print detailed statistics for the first three images and, when a label
# with the same case name exists, for that label as well.
print("\nDetailed Sample Analysis:")
print("="*70)

for i, img_path in enumerate(image_files[:3], 1):
    print(f"\nImage {i}: {img_path.name}")

    img_nib = nib.load(img_path)
    img_data = img_nib.get_fdata()

    print(f"  Shape: {img_data.shape}")
    print(f"  DType: {img_data.dtype}")
    print(f"  Range: [{img_data.min():.4f}, {img_data.max():.4f}]")
    print(f"  Mean: {img_data.mean():.4f}, Std: {img_data.std():.4f}")

    # Check for matching label.
    # Path.stem only drops the final suffix ("case.nii.gz" -> "case.nii"),
    # which made the lookup probe "case.nii.nii[.gz]" and never find the
    # label. Strip the full NIfTI extension instead.
    label_name = img_path.name
    for suffix in ('.nii.gz', '.nii'):
        if label_name.lower().endswith(suffix):
            label_name = label_name[:-len(suffix)]
            break

    # Same probing order as before: uncompressed first, then gzipped.
    matching_label = LABELS_DIR / f"{label_name}.nii"
    if not matching_label.exists():
        matching_label = LABELS_DIR / f"{label_name}.nii.gz"

    if matching_label.exists():
        label_nib = nib.load(matching_label)
        label_data = label_nib.get_fdata()
        print(f"  Label file: {matching_label.name}")
        print(f"  Label shape: {label_data.shape}")
        print(f"  Label DType: {label_data.dtype}")
        print(f"  Unique labels: {sorted(np.unique(label_data).astype(int).tolist())}")
    else:
        print(f"  Label file: NOT FOUND")