In [2]:
import random
import re
from pathlib import Path
import numpy as np
import nibabel as nib
import pydicom
from PIL import Image

In [3]:
# Source data: DICOM files and the NIfTI masks that are matched to them by file stem
dicom_dir = Path(r"M:\ElbowProject\data\_2D Lumify Elbow Data\dcm_files")
nifti_dir = Path(r"M:\ElbowProject\data\_2D Lumify Elbow Data\masks")

# Destination roots: one directory per split underneath the dataset folder
output_base = Path(r"M:\ElbowProject\data\_2D Lumify Elbow Data")
train_dir = output_base.joinpath("TRAIN")
test_dir = output_base.joinpath("TEST")

# Every split gets the same set of sub-directories (frames plus three mask variants)
subfolders = ["imgs", "effusion_masks", "humerus_masks", "effusion_humerus_masks"]

# Ensure <split>/<subfolder> exists for both splits; exist_ok makes this safe to re-run
for base in (train_dir, test_dir):
    for sub in subfolders:
        base.joinpath(sub).mkdir(parents=True, exist_ok=True)

In [5]:
# Valid subject list. This master list is never modified, so the cell can be
# re-run any number of times and always yields the same split.
keep_ids_2D = [
    '88010', '88015', '88017', '88022', '88023', '88025', '88026', '88037',
    '88055', '88058', '88068', '88072', '88074', '88075', '88077', '88078',
    '88090', '88091', '88094', '88095', '88096', '88098', '88099', '88105',
    '88106', '88112', '88114', '88118', '88119', '88122', '88132', '88133',
    '88135', '88141', '88145', '88150', '88152', '88154', '88158', '88162',
    '88170', '88172'
]

# Split the data into training and testing sets per subject (80 % / 20 %).
# A *copy* is shuffled: shuffling keep_ids_2D in place meant that re-running
# this cell re-shuffled an already shuffled list and silently produced a
# different split, which can place one subject in both TRAIN and TEST once the
# export cell is run again.
random.seed(42)
shuffled_ids = list(keep_ids_2D)
random.shuffle(shuffled_ids)

split_idx = int(len(shuffled_ids) * 0.8)
train_ids = set(shuffled_ids[:split_idx])
test_ids = set(shuffled_ids[split_idx:])

print(f"Training: {len(train_ids)} | Testing: {len(test_ids)}")

Training: 33 | Testing: 9


In [6]:
# Label values compared against in the NIfTI masks; all other labels are
# dropped from the exported PNG masks.
HUMERUS_LABEL = 1
EFFUSION_LABEL = 6


def _frame_to_uint8(frame):
    """Return one DICOM frame as a min-max normalised 2-D uint8 array.

    Parameters
    ----------
    frame : np.ndarray
        Either a 2-D single-channel frame or a 3-D frame whose last axis has
        length 3 (treated as R, G, B and collapsed with luma weights).

    Returns
    -------
    np.ndarray
        uint8 array scaled so the frame minimum maps to 0 and the maximum to
        255. A constant frame (zero value range) is returned as all zeros
        instead of dividing by zero.
    """
    if frame.ndim == 3 and frame.shape[-1] == 3:
        # NOTE(review): the weights assume the channels are R, G, B — confirm
        # pixel_array is not returning YBR data for these files.
        gray = (
            0.2989 * frame[..., 0] +
            0.5870 * frame[..., 1] +
            0.1140 * frame[..., 2]
        ).astype(np.float32)
    else:
        # Work in float so subtracting the minimum cannot overflow a narrow
        # signed integer dtype.
        gray = frame.astype(np.float64)

    # np.ptp() instead of ndarray.ptp(): the method was removed in NumPy 2.0.
    value_range = np.ptp(gray)
    if value_range == 0:
        return np.zeros(gray.shape, dtype=np.uint8)
    return ((gray - gray.min()) / value_range * 255).astype(np.uint8)


# Export every frame of every kept subject as a PNG image plus three PNG masks
# (effusion only, humerus only, combined) into the TRAIN or TEST tree.
# sorted() makes the processing/log order independent of the file system.
for dicom_file in sorted(dicom_dir.iterdir()):
    if not dicom_file.is_file():
        continue

    # The subject ID is the 5-digit (optionally "W"-prefixed) start of the name.
    match = re.match(r"(W\d{5}|\d{5})", dicom_file.name)
    if not match:
        print(f"❌ Could not extract ID from {dicom_file.name}")
        continue

    # Route by subject so all files of one subject end up in the same split;
    # subjects in neither set are skipped.
    file_id = match.group()
    if file_id in train_ids:
        subset_dir, subset_name = train_dir, "train"
    elif file_id in test_ids:
        subset_dir, subset_name = test_dir, "test"
    else:
        continue

    stem = dicom_file.stem

    # Match mask: prefer the compressed file, fall back to the plain .nii
    nii_path = nifti_dir / f"{stem}.nii.gz"
    if not nii_path.exists():
        nii_path = nifti_dir / f"{stem}.nii"
    if not nii_path.exists():
        print(f"❌ No matching NIfTI file for {stem}")
        continue

    # Load DICOM and bring it to (frames, rows, cols[, 3])
    ds = pydicom.dcmread(dicom_file)
    dicom_frames = ds.pixel_array
    is_colour = int(getattr(ds, "SamplesPerPixel", 1)) > 1

    # A 2-D array is one grayscale frame; a 3-D array of a multi-sample
    # dataset is one colour frame (rows, cols, samples), not a frame stack.
    if dicom_frames.ndim == 2 or (dicom_frames.ndim == 3 and is_colour):
        dicom_frames = dicom_frames[np.newaxis, ...]

    # Load mask and bring it to (frames, rows, cols)
    mask_data = nib.load(nii_path).get_fdata()
    if mask_data.ndim == 2:
        # Transposed so 2-D masks get the same axis order the 3-D branch
        # produces (spatial axes swapped).
        mask_data = mask_data.T[np.newaxis, ...]
    elif mask_data.ndim == 3:
        mask_data = np.transpose(mask_data, (2, 1, 0))
    else:
        # np.transpose(..., (2, 1, 0)) would raise here and abort the loop.
        print(f"⚠️ Unsupported mask dimensionality: {stem} — Mask: {mask_data.shape}")
        continue

    if dicom_frames.shape[:3] != mask_data.shape:
        print(f"⚠️ Shape mismatch: {stem} — DICOM: {dicom_frames.shape}, Mask: {mask_data.shape}")
        continue

    # Slice-wise processing
    for i, dcm_frame in enumerate(dicom_frames):
        norm_img = Image.fromarray(_frame_to_uint8(dcm_frame))

        # Masks: get_fdata() returns floats, so round before the integer cast
        # to keep a label stored as e.g. 5.9999 from being truncated to 5.
        mask_slice = np.rint(mask_data[i]).astype(np.uint8)
        humerus = (mask_slice == HUMERUS_LABEL).astype(np.uint8)
        effusion = (mask_slice == EFFUSION_LABEL).astype(np.uint8)

        # Combined mask: 0 = background, 1 = humerus, 2 = effusion
        multi_mask = np.zeros_like(mask_slice, dtype=np.uint8)
        multi_mask[humerus == 1] = 1
        multi_mask[effusion == 1] = 2

        # Save. 2-D uint8 arrays are written as 8-bit grayscale ("L") images,
        # so no explicit mode argument is needed.
        fname = f"{stem}_frame_{i:03d}.png"
        norm_img.save(subset_dir / "imgs" / fname)
        Image.fromarray(effusion).save(subset_dir / "effusion_masks" / fname)
        Image.fromarray(humerus).save(subset_dir / "humerus_masks" / fname)
        Image.fromarray(multi_mask).save(subset_dir / "effusion_humerus_masks" / fname)

    print(f"✅ Processed: {stem} ({subset_name})")

print("🎉 All done!")

✅ Processed: 88010-left-elbow-dor (train)
✅ Processed: 88010-left-elbow-vul (train)
✅ Processed: 88015-left-elbow-dor (test)
✅ Processed: 88015-left-elbow-vul (test)
✅ Processed: 88017-right-elbow-dor (train)
✅ Processed: 88017-right-elbow-dor2 (train)
✅ Processed: 88017-right-elbow-vul (train)
✅ Processed: 88022-left-elbow-dor (train)
✅ Processed: 88022-left-elbow-vul (train)
✅ Processed: 88023-right-elbow-dor (train)
✅ Processed: 88023-right-elbow-vul (train)
✅ Processed: 88025-right-elbow-vul (test)
✅ Processed: 88026-right-elbow-dor (test)
✅ Processed: 88026-right-elbow-vul (test)
✅ Processed: 88037-left-elbow-dor (test)
✅ Processed: 88055-right-elbow-dor (test)
✅ Processed: 88055-right-elbow-vul (test)
✅ Processed: 88058-right-elbow-dor (train)
✅ Processed: 88058-right-elbow-vul (train)
✅ Processed: 88068-left-elbow-dor (train)
✅ Processed: 88068-left-elbow-vul (train)
✅ Processed: 88072-left-elbow-dor (train)
✅ Processed: 88072-left-elbow-vul (train)
✅ Processed: 88074-left-elbow

In [10]:
from pathlib import Path
from PIL import Image
import numpy as np

# === Mask folders to check (TRAIN split) ===
train_root = Path(r"M:\ElbowProject\data\_2D Lumify Elbow Data\TRAIN")
train_mask_folders = [
    train_root / "humerus_masks",
    train_root / "effusion_masks",
    train_root / "effusion_humerus_masks",
]
print("TRAINING SET MASKS")

# For each folder, count the PNG masks and how many of them contain no
# foreground pixel at all.
for folder in train_mask_folders:
    empty_count = 0
    total = 0

    for mask_file in folder.glob("*.png"):
        # Context manager closes the file handle right after the pixels are read.
        with Image.open(mask_file) as img:
            arr = np.array(img.convert("L"))
        total += 1
        if not arr.any():
            empty_count += 1

    print(f"\nFolder: {folder.name}")
    print(f"Total masks:       {total}")
    print(f"Empty (all zero): {empty_count}")
    # A missing or empty folder gives total == 0; report it instead of
    # failing with ZeroDivisionError.
    if total:
        print(f"% Empty:          {empty_count / total * 100:.2f}%")
    else:
        print("% Empty:          n/a (no PNG masks found)")


TRAINING SET MASKS

Folder: humerus_masks
Total masks:       7115
Empty (all zero): 854
% Empty:          12.00%

Folder: effusion_masks
Total masks:       7115
Empty (all zero): 4034
% Empty:          56.70%

Folder: effusion_humerus_masks
Total masks:       7115
Empty (all zero): 854
% Empty:          12.00%


In [11]:
# === Mask folders to check (TEST split) ===
test_root = Path(r"M:\ElbowProject\data\_2D Lumify Elbow Data\TEST")
test_mask_folders = [
    test_root / "humerus_masks",
    test_root / "effusion_masks",
    test_root / "effusion_humerus_masks",
]
print("TESTING SET MASKS")

# For each folder, count the PNG masks and how many of them contain no
# foreground pixel at all.
for folder in test_mask_folders:
    empty_count = 0
    total = 0

    for mask_file in folder.glob("*.png"):
        # Context manager closes the file handle right after the pixels are read.
        with Image.open(mask_file) as img:
            arr = np.array(img.convert("L"))
        total += 1
        if not arr.any():
            empty_count += 1

    print(f"\nFolder: {folder.name}")
    print(f"Total masks:       {total}")
    print(f"Empty (all zero): {empty_count}")
    # A missing or empty folder gives total == 0; report it instead of
    # failing with ZeroDivisionError.
    if total:
        print(f"% Empty:          {empty_count / total * 100:.2f}%")
    else:
        print("% Empty:          n/a (no PNG masks found)")

TESTING SET MASKS

Folder: humerus_masks
Total masks:       1600
Empty (all zero): 193
% Empty:          12.06%

Folder: effusion_masks
Total masks:       1600
Empty (all zero): 1055
% Empty:          65.94%

Folder: effusion_humerus_masks
Total masks:       1600
Empty (all zero): 193
% Empty:          12.06%
