In [None]:
import nibabel as nib
import numpy as np
import pandas as pd
import os
import shutil
import json
from pathlib import Path

# --- GLOBAL CONFIGURATION ---
DATASET_NAME = "Dataset501_BrainMets"  # Must follow 'DatasetXXX_Name' format
SOURCE_DIR = 'data/brainmetshare-3'
OUTPUT_DIR = os.getenv('nnUNet_raw', "data/nnUNet_raw")

# Mapping modalities to nnU-Net suffixes
# 0000: T1 Gd, 0001: T1 Pre, 0002: FLAIR, 0003: BRAVO
MODALITY_MAP = {
    "t1_gd.nii.gz": "0000",
    "t1_pre.nii.gz": "0001",
    "flair.nii.gz": "0002",
    "bravo.nii.gz": "0003"
}

# Define Paths
target_base = Path(OUTPUT_DIR) / DATASET_NAME
imagesTr = target_base / "imagesTr"
labelsTr = target_base / "labelsTr"
imagesTs = target_base / "imagesTs"

# Create directories
for p in [imagesTr, labelsTr, imagesTs]:
    p.mkdir(parents=True, exist_ok=True)

print(f"Setup complete. Target: {target_base}")

Setup complete. Target: data/nnUNet_raw/Dataset501_BrainMets


In [7]:
# --- VALIDATION CONFIGURATION ---
CHECK_REGISTRATION = True  # Compares affine matrices and shapes
REQ_MODALITIES = list(MODALITY_MAP.keys())

results = []

print(f"--- Starting Dataset Audit: {SOURCE_DIR} ---")

for subset in ['train', 'test']:
    subset_path = Path(SOURCE_DIR) / subset
    if not subset_path.exists():
        continue

    cases = [d for d in subset_path.iterdir() if d.is_dir()]

    for case_dir in cases:
        case_id = case_dir.name
        report = {"case_id": case_id, "subset": subset,
                  "missing": [], "issue": "None"}

        # 1. Check for Missing Files
        found_files = [f.name for f in case_dir.glob("*.nii.gz")]
        for req in REQ_MODALITIES:
            if req not in found_files:
                report["missing"].append(req)

        # 2. Check Segmentation (for train)
        if subset == 'train' and "seg.nii.gz" not in found_files:
            report["issue"] = "Missing Label"

        # 3. Registration & Geometry Check
        if CHECK_REGISTRATION and not report["missing"]:
            try:
                # Load first modality as reference
                ref_path = case_dir / REQ_MODALITIES[0]
                ref_img = nib.load(str(ref_path))
                ref_affine = ref_img.affine
                ref_shape = ref_img.shape

                for other in REQ_MODALITIES[1:]:
                    other_img = nib.load(str(case_dir / other))

                    # Check if shapes match
                    if not np.array_equal(other_img.shape, ref_shape):
                        report["issue"] = f"Shape Mismatch ({other})"
                        break

                    # Check if affines match (tolerance for floating point)
                    if not np.allclose(other_img.affine, ref_affine, atol=1e-3):
                        report["issue"] = f"Registration Drift ({other})"
                        break
            except Exception as e:
                report["issue"] = f"Error loading NIfTI: {str(e)}"

        results.append(report)

# --- Summary Report ---
df_audit = pd.DataFrame(results)
issues_df = df_audit[(df_audit['missing'].str.len() > 0)
                     | (df_audit['issue'] != "None")]

print(f"\nAudit Finished.")
print(f"Total Cases: {len(df_audit)}")
print(f"Clean Cases: {len(df_audit) - len(issues_df)}")
print(f"Cases with Issues: {len(issues_df)}")

if not issues_df.empty:
    print("\n--- Summary of Pitfalls Found ---")
    print(issues_df[['case_id', 'subset', 'missing', 'issue']].to_string())
else:
    print("\nAll cases passed geometry and modality checks!")

--- Starting Dataset Audit: data/brainmetshare-3 ---

Audit Finished.
Total Cases: 156
Clean Cases: 156
Cases with Issues: 0

All cases passed geometry and modality checks!


In [2]:
def convert_subset(subset_name, target_img_dir, target_lab_dir=None):
    subset_path = Path(SOURCE_DIR) / subset_name
    case_dirs = [d for d in subset_path.iterdir() if d.is_dir()]

    print(f"Processing {subset_name} ({len(case_dirs)} cases)...")

    for case_dir in case_dirs:
        case_id = case_dir.name  # e.g., Mets_005

        # 1. Process Images
        for filename, suffix in MODALITY_MAP.items():
            src_file = case_dir / filename
            if src_file.exists():
                # nnU-Net format: CaseName_000X.nii.gz
                dest_file = target_img_dir / f"{case_id}_{suffix}.nii.gz"
                shutil.copy(src_file, dest_file)

        # 2. Process Segmentation (Labels) - only if in train and file exists
        if target_lab_dir:
            seg_file = case_dir / "seg.nii.gz"
            if seg_file.exists():
                dest_seg = target_lab_dir / f"{case_id}.nii.gz"
                shutil.copy(seg_file, dest_seg)


# Execute conversion
convert_subset("train", imagesTr, labelsTr)
convert_subset("test", imagesTs, None)

print("File copying complete.")

Processing train (105 cases)...
Processing test (51 cases)...
File copying complete.


In [3]:
dataset_json = {
    "channel_names": {
        "0": "T1_Gd",
        "1": "T1_pre",
        "2": "FLAIR",
        "3": "BRAVO"
    },
    "labels": {
        "background": 0,
        "metastasis": 1
    },
    "numTraining": len(list(labelsTr.glob("*.nii.gz"))),
    "file_ending": ".nii.gz",
    "overwrite_image_reader_writer": "NibabelIOWithCast"  # Optional, standard for NIfTI
}

with open(target_base / "dataset.json", "w") as f:
    json.dump(dataset_json, f, indent=4)

print(f"dataset.json generated at {target_base / 'dataset.json'}")

dataset.json generated at data/nnUNet_raw/Dataset501_BrainMets/dataset.json


In [4]:
print(f"Summary for {DATASET_NAME}:")
print(f"Training images: {len(list(imagesTr.glob('*_0000.nii.gz')))}")
print(f"Training labels: {len(list(labelsTr.glob('*.nii.gz')))}")
print(f"Test images:     {len(list(imagesTs.glob('*_0000.nii.gz')))}")

# Final directory structure check
!ls -R {target_base} | head -n 20

Summary for Dataset501_BrainMets:
Training images: 105
Training labels: 105
Test images:     51
data/nnUNet_raw/Dataset501_BrainMets:
dataset.json
imagesTr
imagesTs
labelsTr

data/nnUNet_raw/Dataset501_BrainMets/imagesTr:
Mets_005_0000.nii.gz
Mets_005_0001.nii.gz
Mets_005_0002.nii.gz
Mets_005_0003.nii.gz
Mets_010_0000.nii.gz
Mets_010_0001.nii.gz
Mets_010_0002.nii.gz
Mets_010_0003.nii.gz
Mets_011_0000.nii.gz
Mets_011_0001.nii.gz
Mets_011_0002.nii.gz
Mets_011_0003.nii.gz
Mets_013_0000.nii.gz


--- Starting Dataset Audit: data/brainmetshare-3 ---

Audit Finished.
Total Cases: 156
Clean Cases: 156
Cases with Issues: 0

All cases passed geometry and modality checks!
