In [None]:
# 1. Handle datasets
import io
import os
import cv2
import pydicom
import dicomsdl
import numpy as np
import pandas as pd
from PIL import Image
from glob import glob
import tifffile as tiff
import SimpleITK as sitk
from pathlib import Path
from tqdm.auto import tqdm
import multiprocessing as mp
from collections import Counter
from joblib import Parallel, delayed
from pydicom.pixel_data_handlers.util import apply_voi_lut

In [35]:
# Root folder of the local copy of the competition data.
parent_dir = r"E:\rsna-breast-cancer-detection"

# Load the training metadata table and preview its first rows.
df = pd.read_csv(Path(parent_dir) / "train.csv")
df.head()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True


In [45]:
# Output size handed to cv2.resize as its target size.
RESIZE_TO = (512, 512)
# Folder (created under parent_dir) that receives the converted PNGs.
SAVE_DIR = f"train_image_processed_cv2_{RESIZE_TO[0]}"

# Recursively collect every .dcm file below the train_images folder.
all_dcm_files = [*Path(parent_dir, "train_images").rglob("*.dcm")]
# Tally of failed conversions, stored under the "fail" key.
fail_counter = Counter()

def dicom_file_to_array(path):
    """Read a DICOM file with pydicom and return it as a resized 8-bit image.

    The pixel data is inverted when the photometric interpretation is
    MONOCHROME1, min-max scaled to [0, 1], resized to the module-level
    ``RESIZE_TO`` with ``cv2.resize`` and finally scaled to 0-255.

    Args:
        path: Location of the ``.dcm`` file to read.

    Returns:
        ``np.uint8`` array holding the resized image.
    """
    dicom = pydicom.dcmread(path)
    # Work in float64 so the subtractions below can neither wrap around nor
    # overflow in whatever integer dtype the file stores its pixels in.
    data = dicom.pixel_array.astype(np.float64)

    if dicom.get("PhotometricInterpretation") == "MONOCHROME1":
        data = data.max() - data

    lowest = data.min()
    value_range = data.max() - lowest
    if value_range > 0:
        data = (data - lowest) / value_range
    else:
        # Constant image: the plain formula would compute 0/0 and feed NaN
        # into the uint8 cast, so emit an all-black image instead.
        data = np.zeros_like(data)

    data = cv2.resize(data, RESIZE_TO)
    return (data * 255).astype(np.uint8)

def process(path):
    """Convert one DICOM file to a PNG under ``parent_dir/SAVE_DIR/<folder>``.

    ``<folder>`` is the name of the directory that contains ``path``; the PNG
    is named after the stem of ``path``. Any exception is caught, reported on
    stdout and recorded in ``fail_counter``.

    Args:
        path: ``pathlib.Path`` of the source ``.dcm`` file.

    Returns:
        ``True`` if the PNG was written, ``False`` if the conversion failed.
    """
    try:
        parent_folder = path.parent.name
        save_subdir = os.path.join(parent_dir, SAVE_DIR, parent_folder)
        os.makedirs(save_subdir, exist_ok=True)

        processed_img = dicom_file_to_array(path)
        save_path = os.path.join(save_subdir, f"{path.stem}.png")
        # cv2.imwrite signals an unsuccessful write through its return value,
        # so turn a False result into an error that is counted as a failure.
        if not cv2.imwrite(save_path, processed_img):
            raise IOError(f"cv2.imwrite could not write {save_path}")
        return True

    except Exception as e:
        print(f"[ERROR] Failed: {path} — {e}")
        fail_counter["fail"] += 1
        return False

# Process with tqdm and joblib.
# NOTE: because backend="loky" is given explicitly, the tasks run in worker
# processes (prefer="threads" is only a hint for when no backend is chosen).
# Increments of fail_counter made inside a worker therefore never reach this
# process, so failures are counted from the statuses the workers return.
statuses = Parallel(n_jobs=16, backend="loky", prefer="threads")(
    delayed(process)(path) for path in tqdm(all_dcm_files, 
                                            total=len(all_dcm_files))
)

# Keep fail_counter as the single source of truth for the summary below.
fail_counter["fail"] = sum(1 for ok in statuses if not ok)

print(f"✅ Done! Processed {len(all_dcm_files)} files, "
      f"with {fail_counter['fail']} failures.")

  0%|          | 0/54706 [00:00<?, ?it/s]

✅ Done! Processed 54706 files,with 0 failures.


Next, we try dicomsdl (instead of pydicom) on the 'test' data.

In [55]:
# Output size handed to cv2.resize as its target size.
RESIZE_TO = (512, 512)
# Folder (created under parent_dir) that receives the converted PNGs.
SAVE_DIR = f"test_image_processed_cv2_{RESIZE_TO[0]}"

# Recursively collect every .dcm file below the test_images folder.
all_dcm_files = [*Path(parent_dir, "test_images").rglob("*.dcm")]
# Tally of failed conversions, stored under the "fail" key.
fail_counter = Counter()

def dicom_file_to_array(path):
    """Read a DICOM file with dicomsdl and return it as a resized 8-bit image.

    The pixel data is inverted when the photometric interpretation is
    MONOCHROME1, min-max scaled to [0, 1], resized to the module-level
    ``RESIZE_TO`` with ``cv2.resize`` and finally scaled to 0-255.

    Args:
        path: Location of the ``.dcm`` file to read (converted with ``str``).

    Returns:
        ``np.uint8`` array holding the resized image.
    """
    dicom = dicomsdl.open(str(path))
    # Work in float64 so the subtractions below can neither wrap around nor
    # overflow in whatever integer dtype the file stores its pixels in.
    data = dicom.pixelData().astype(np.float64)
    photometric = dicom.getPixelDataInfo()['PhotometricInterpretation']

    if photometric == "MONOCHROME1":
        # Invert around the maximum. `1 - data` is only a valid inversion for
        # values already scaled to [0, 1]; on raw pixel values it relies on
        # integer wrap-around and misplaces pixels equal to 0 or 1.
        data = data.max() - data

    lowest = data.min()
    value_range = data.max() - lowest
    if value_range > 0:
        data = (data - lowest) / value_range
    else:
        # Constant image: the plain formula would compute 0/0 and feed NaN
        # into the uint8 cast, so emit an all-black image instead.
        data = np.zeros_like(data)

    data = cv2.resize(data, RESIZE_TO)
    return (data * 255).astype(np.uint8)

def process(path):
    """Convert one DICOM file to a PNG under ``parent_dir/SAVE_DIR/<folder>``.

    ``<folder>`` is the name of the directory that contains ``path``; the PNG
    is named after the stem of ``path``. Any exception is caught, reported on
    stdout and recorded in ``fail_counter``.

    Args:
        path: ``pathlib.Path`` of the source ``.dcm`` file.

    Returns:
        ``True`` if the PNG was written, ``False`` if the conversion failed.
    """
    try:
        parent_folder = path.parent.name
        save_subdir = os.path.join(parent_dir, SAVE_DIR, parent_folder)
        os.makedirs(save_subdir, exist_ok=True)

        processed_img = dicom_file_to_array(path)
        save_path = os.path.join(save_subdir, f"{path.stem}.png")
        # cv2.imwrite signals an unsuccessful write through its return value,
        # so turn a False result into an error that is counted as a failure.
        if not cv2.imwrite(save_path, processed_img):
            raise IOError(f"cv2.imwrite could not write {save_path}")
        return True

    except Exception as e:
        print(f"[ERROR] Failed: {path} — {e}")
        fail_counter["fail"] += 1
        return False

# Process with tqdm and joblib.
# NOTE: because backend="loky" is given explicitly, the tasks run in worker
# processes (prefer="threads" is only a hint for when no backend is chosen).
# Increments of fail_counter made inside a worker therefore never reach this
# process, so failures are counted from the statuses the workers return.
statuses = Parallel(n_jobs=16, backend="loky", prefer="threads")(
    delayed(process)(path) for path in tqdm(all_dcm_files, 
                                            total=len(all_dcm_files))
)

# Keep fail_counter as the single source of truth for the summary below.
fail_counter["fail"] = sum(1 for ok in statuses if not ok)

print(f"✅ Done! Processed {len(all_dcm_files)} files, "
      f"with {fail_counter['fail']} failures.")

  0%|          | 0/4 [00:00<?, ?it/s]

✅ Done! Processed 4 files,with 0 failures.


Well, the test set only has 4 files. Let's apply the same dicomsdl approach to the training dataset to see if it is faster (training set + pydicom took ~100 minutes).

In [None]:
# Target length in pixels; dicom_file_to_array applies it to the width when
# the image is wider than tall and to the height otherwise.
RESIZE_TO = 1024
# Folder (created under parent_dir) that receives the converted PNGs.
SAVE_DIR = f"train_image_processed_dicomsdl_{RESIZE_TO}"

# Recursively collect every .dcm file below the train_images folder.
all_dcm_files = [*Path(parent_dir, "train_images").rglob("*.dcm")]
# Tally of failed conversions, stored under the "fail" key.
fail_counter = Counter()

def image_resize(image, width = None, height = None, inter = cv2.INTER_LINEAR):
    """Resize ``image`` to a given width or height, keeping its aspect ratio.

    Args:
        image: Array whose first two ``shape`` entries are (rows, columns).
        width: Desired output width. Takes precedence over ``height`` when
            both are supplied.
        height: Desired output height; only used when ``width`` is None.
        inter: Interpolation flag forwarded to ``cv2.resize``.

    Returns:
        The resized image, or ``image`` itself when neither ``width`` nor
        ``height`` is given.
    """
    src_h, src_w = image.shape[:2]

    if width is not None:
        # Fit to the requested width and scale the height by the same factor.
        scale = width / float(src_w)
        target_size = (width, int(src_h * scale))
    elif height is not None:
        # Fit to the requested height and scale the width by the same factor.
        scale = height / float(src_h)
        target_size = (int(src_w * scale), height)
    else:
        # Nothing requested: hand the input back untouched.
        return image

    return cv2.resize(image, target_size, interpolation=inter)

def dicom_file_to_array(path):
    """Read a DICOM file with dicomsdl and return an aspect-preserving 8-bit image.

    The pixel data is inverted when the photometric interpretation is
    MONOCHROME1, min-max scaled to [0, 1], resized with ``image_resize`` so
    that its longer side equals the module-level ``RESIZE_TO``, and finally
    scaled to 0-255.

    Args:
        path: Location of the ``.dcm`` file to read (converted with ``str``).

    Returns:
        ``np.uint8`` array holding the resized image.
    """
    dicom = dicomsdl.open(str(path))
    # Work in float64 so the subtractions below can neither wrap around nor
    # overflow in whatever integer dtype the file stores its pixels in.
    data = dicom.pixelData().astype(np.float64)
    photometric = dicom.getPixelDataInfo()['PhotometricInterpretation']

    if photometric == "MONOCHROME1":
        # Invert around the maximum. `1 - data` is only a valid inversion for
        # values already scaled to [0, 1]; on raw pixel values it relies on
        # integer wrap-around and misplaces pixels equal to 0 or 1.
        data = data.max() - data

    lowest = data.min()
    value_range = data.max() - lowest
    if value_range > 0:
        data = (data - lowest) / value_range
    else:
        # Constant image: the plain formula would compute 0/0 and feed NaN
        # into the uint8 cast, so emit an all-black image instead.
        data = np.zeros_like(data)

    # Unpacking into two names assumes a 2-D (single-channel) image.
    h, w = data.shape
    if w > h:
        data = image_resize(data, width=RESIZE_TO)
    else:
        data = image_resize(data, height=RESIZE_TO)

    return (data * 255).astype(np.uint8)

def process(path):
    """Convert one DICOM file to a PNG under ``parent_dir/SAVE_DIR/<folder>``.

    ``<folder>`` is the name of the directory that contains ``path``; the PNG
    is named after the stem of ``path``. Any exception is caught, reported on
    stdout and recorded in ``fail_counter``.

    Args:
        path: ``pathlib.Path`` of the source ``.dcm`` file.

    Returns:
        ``True`` if the PNG was written, ``False`` if the conversion failed.
    """
    try:
        parent_folder = path.parent.name
        save_subdir = os.path.join(parent_dir, SAVE_DIR, parent_folder)
        os.makedirs(save_subdir, exist_ok=True)

        processed_img = dicom_file_to_array(path)
        save_path = os.path.join(save_subdir, f"{path.stem}.png")
        # cv2.imwrite signals an unsuccessful write through its return value,
        # so turn a False result into an error that is counted as a failure.
        if not cv2.imwrite(save_path, processed_img):
            raise IOError(f"cv2.imwrite could not write {save_path}")
        return True

    except Exception as e: 
        print(f"[ERROR] Failed: {path} — {e}")
        fail_counter["fail"] += 1
        return False

# Process with tqdm and joblib.
# NOTE: because backend="loky" is given explicitly, the tasks run in worker
# processes (prefer="threads" is only a hint for when no backend is chosen).
# Increments of fail_counter made inside a worker therefore never reach this
# process, so failures are counted from the statuses the workers return.
statuses = Parallel(n_jobs=16, backend="loky", prefer="threads")(
    delayed(process)(path) for path in tqdm(all_dcm_files, 
                                            total=len(all_dcm_files))
)

# Keep fail_counter as the single source of truth for the summary below.
fail_counter["fail"] = sum(1 for ok in statuses if not ok)

print(f"✅ Done! Processed {len(all_dcm_files)} files, "
      f"with {fail_counter['fail']} failures.")

  0%|          | 0/54706 [00:00<?, ?it/s]

✅ Done! Processed 54706 files,with 0 failures.


### So even with 1024, this code runs remarkably faster