In [28]:
import nibabel as nib
import numpy as np
import pandas as pd
import os
import shutil
import json
from pathlib import Path

# --- GLOBAL CONFIGURATION ---
DATASET_NAME = "Dataset601_WMH"  # Must follow 'DatasetXXX_Name' format
SOURCE_DIR = 'data/WMH'
OUTPUT_DIR = os.getenv('nnUNet_raw', "data/nnUNet_raw")

# Define Paths
target_base = Path(OUTPUT_DIR) / DATASET_NAME
imagesTr = target_base / "imagesTr"
labelsTr = target_base / "labelsTr"
imagesTs = target_base / "imagesTs"

# Create directories
for p in [imagesTr, labelsTr, imagesTs]:
    p.mkdir(parents=True, exist_ok=True)

print(f"Setup complete. Target: {target_base}")

Setup complete. Target: /home/student/sebastian_ma/Learning-to-Look-Closer/data/nnUNet_raw/Dataset601_WMH


In [29]:
import os
import nibabel as nib
import numpy as np
import pandas as pd
from pathlib import Path

CHECK_REGISTRATION = True

REQ_MODALITIES = ["pre/FLAIR.nii.gz", "pre/T1.nii.gz"]
REQ_LABEL = "wmh.nii.gz"

results = []

print(f"--- Starting WMH Dataset Audit: {SOURCE_DIR} ---")

# The WMH structure is: Institute/Scanner/CaseID/
# We need to walk through the first two levels to get to CaseID
for inst_dir in Path(f'{SOURCE_DIR}/train').iterdir():
    if not inst_dir.is_dir():
        continue

    for case_dir in inst_dir.iterdir():
        if not case_dir.is_dir():
            continue

        case_id = case_dir.name
        report = {"case_id": case_id, "missing": [], "issue": "None"}

        # 1. Check for Missing Files
        for req in REQ_MODALITIES:
            if not (case_dir / req).exists():
                report["missing"].append(req)

        if not (case_dir / REQ_LABEL).exists():
            report["missing"].append(REQ_LABEL)

        # 2. Geometry & Registration Check
        if CHECK_REGISTRATION and not report["missing"]:
            try:
                # Reference: FLAIR (Manual standard is defined on FLAIR space)
                ref_path = case_dir / REQ_MODALITIES[0]
                ref_img = nib.load(str(ref_path))
                ref_affine = ref_img.affine
                ref_shape = ref_img.shape

                # Check T1 against FLAIR
                t1_img = nib.load(str(case_dir / REQ_MODALITIES[1]))
                if not np.array_equal(t1_img.shape, ref_shape):
                    report["issue"] = "Shape Mismatch (T1 vs FLAIR)"
                elif not np.allclose(t1_img.affine, ref_affine, atol=1e-3):
                    report["issue"] = "Registration Drift (T1 vs FLAIR)"

                # Check Label against FLAIR
                label_img = nib.load(str(case_dir / REQ_LABEL))
                if not np.array_equal(label_img.shape, ref_shape):
                    report["issue"] = "Shape Mismatch (Label vs FLAIR)"
                elif not np.allclose(label_img.affine, ref_affine, atol=1e-3):
                    report["issue"] = "Registration Drift (Label vs FLAIR)"

            except Exception as e:
                report["issue"] = f"Error loading NIfTI: {str(e)}"

        results.append(report)

# --- Summary Report ---
df_audit = pd.DataFrame(results)
issues_df = df_audit[(df_audit['missing'].str.len() > 0)
                     | (df_audit['issue'] != "None")]

print(f"\nAudit Finished.")
print(f"Total Cases: {len(df_audit)}")
print(f"Clean Cases: {len(df_audit) - len(issues_df)}")
print(f"Cases with Issues: {len(issues_df)}")

if not issues_df.empty:
    print("\n--- Summary of Pitfalls Found ---")
    print(issues_df.to_string())
else:
    print("\nAll WMH cases passed geometry and modality checks!")

--- Starting WMH Dataset Audit: data/WMH ---

Audit Finished.
Total Cases: 60
Clean Cases: 60
Cases with Issues: 0

All WMH cases passed geometry and modality checks!


In [31]:
n_cases = 0

def clean_wmh_labels(label_path, output_path):
    # Load the manual reference standard (wmh.nii.gz)
    img = nib.load(label_path)
    data = img.get_fdata()

    # Reassign label 2 (Other pathology) to 0 (Background)
    # Label 1 (WMH) remains untouched
    data[data == 2] = 0

    # Save the cleaned label as a new NIfTI file
    cleaned_img = nib.Nifti1Image(
        data.astype(np.uint8), img.affine, img.header)
    nib.save(cleaned_img, output_path)
    
    
for inst_dir in Path(f'{SOURCE_DIR}/train').iterdir():
    if not inst_dir.is_dir():
        continue

    for case_dir in inst_dir.iterdir():
        if not case_dir.is_dir():
            continue

        case_id = case_dir.name

        # Use pre-processed images: T1 aligned to FLAIR and FLAIR itself
        flair_src = f"{case_dir}/pre/FLAIR.nii.gz"
        t1_src = f"{case_dir}/pre/T1.nii.gz"
        label_src = f"{case_dir}/wmh.nii.gz"

        if os.path.exists(flair_src) and os.path.exists(label_src):
            formatted_id = f"WMH_{case_id}"

            # Copy Images with nnUNet naming convention (Case_Modality.nii.gz)
            shutil.copy(flair_src, os.path.join(
                imagesTr, f"{formatted_id}_0000.nii.gz"))
            shutil.copy(t1_src, os.path.join(
                imagesTr, f"{formatted_id}_0001.nii.gz"))

            # Copy Labels
            clean_wmh_labels(label_src, f"{labelsTr}/{formatted_id}.nii.gz")
        n_cases += 1


# --- Generate dataset.json ---
dataset_json = {
    "channel_names": {
        "0": "FLAIR",
        "1": "T1"
    },
    "labels": {
        "background": 0,
        "WMH": 1,
    },
    "numTraining": n_cases,
    "file_ending": ".nii.gz"
}

with open(os.path.join(target_base, "dataset.json"), 'w') as f:
    json.dump(dataset_json, f, indent=4)

print(f"Conversion complete. {n_cases} cases processed.")

Conversion complete. 60 cases processed.


In [15]:
print(f"Summary for {DATASET_NAME}:")
print(f"Training images: {len(list(imagesTr.glob('*_0000.nii.gz')))}")
print(f"Training labels: {len(list(labelsTr.glob('*.nii.gz')))}")
print(f"Test images:     {len(list(imagesTs.glob('*_0000.nii.gz')))}")

# Final directory structure check
!ls -R {target_base} | head -n 20

Summary for Dataset601_WMH:
Training images: 120
Training labels: 120
Test images:     0
/home/student/sebastian_ma/Learning-to-Look-Closer/data/nnUNet_raw/Dataset601_WMH:
dataset.json
imagesTr
imagesTs
labelsTr

/home/student/sebastian_ma/Learning-to-Look-Closer/data/nnUNet_raw/Dataset601_WMH/imagesTr:
WMH_000_0000.nii.gz
WMH_000_0001.nii.gz
WMH_001_0000.nii.gz
WMH_001_0001.nii.gz
WMH_002_0000.nii.gz
WMH_002_0001.nii.gz
WMH_003_0000.nii.gz
WMH_003_0001.nii.gz
WMH_004_0000.nii.gz
WMH_004_0001.nii.gz
WMH_005_0000.nii.gz
WMH_005_0001.nii.gz
WMH_006_0000.nii.gz
ls: write error: Broken pipe


--- Starting Dataset Audit: data/brainmetshare-3 ---

Audit Finished.
Total Cases: 156
Clean Cases: 156
Cases with Issues: 0

All cases passed geometry and modality checks!
