In [2]:
import glob
import os
import re
import shutil

import matplotlib.pyplot as plt
import nrrd
import numpy as np
import pandas as pd
import pydicom as dicom
from PIL import Image

In [3]:
# Dataset locations.
# NOTE: the downloaded dataset should be put into "[root]/raw"
# Point the NCI_ISBI_ROOT environment variable at your copy of the NCI-ISBI
# dataset, or edit the fallback path below.
root = os.environ.get("NCI_ISBI_ROOT", "/home/yesindeed/Desktop/NCI-ISBI-2013")

# Output folders for the exported DICOM slices and their binary PNG masks.
img_path = os.path.join(root, "images")
mask_path = os.path.join(root, "masks")

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(img_path, exist_ok=True)
os.makedirs(mask_path, exist_ok=True)

# Process Training Set

In [4]:
# Export every annotated training slice: copy its DICOM file into images/,
# write a binary (0/255) PNG mask into masks/, and list the ids in train.txt.
SPLIT_SEED = 0  # fixed seed so the shuffled order of train.txt is reproducible

img_ids = []

train_raw_root = os.path.join(root, "raw", "ISBI-Prostate-Training", "manifest-ZqaK9xEy8795217829022780222")
train_seg_root = os.path.join(root, "raw", "NCI-ISBI-2013-Prostate-Challenge-Training")

df_train_raw = pd.read_csv(os.path.join(train_raw_root, "metadata.csv"))

for case_path, num_slices in zip(df_train_raw["File Location"].tolist(), df_train_raw["Number of Images"].tolist()):
    # The case id is taken from the third "/"-separated component of the
    # "File Location" column.
    case = case_path.split("/")[2]

    seg_files = sorted(glob.glob(os.path.join(train_seg_root, f"{case}*.nrrd")))
    if not seg_files:
        raise FileNotFoundError(f"no .nrrd segmentation found for case {case} in {train_seg_root}")
    mask_vol = nrrd.read(seg_files[0])[0]

    # Skip cases whose segmentation does not have one slice per DICOM image.
    if mask_vol.shape[-1] != num_slices:
        print(f"case {case}: mask shape {mask_vol.shape} while num slices {num_slices}")
        continue

    # Index the DICOM files by the integer that ends their file name. A glob
    # such as "*1.dcm" would also match "...11.dcm" and "...21.dcm", and glob
    # order is arbitrary, so the wrong slice could be picked.
    # Assumes file names end in the 1-based slice number (e.g. "1-07.dcm") -- TODO confirm.
    dcm_by_number = {}
    for dcm_path in sorted(glob.glob(os.path.join(train_raw_root, case_path, "*.dcm"))):
        number = re.search(r"(\d+)\.dcm$", os.path.basename(dcm_path))
        if number:
            dcm_by_number[int(number.group(1))] = dcm_path

    for i in range(num_slices):
        mask = np.transpose(mask_vol[:, :, i])

        # Only slices that contain at least one labelled pixel are exported.
        if np.max(mask) > 0:
            slice_id = f"{case}_{i+1}"

            dcm_file_name = dcm_by_number.get(i + 1)
            if dcm_file_name is None:
                raise FileNotFoundError(f"no DICOM file numbered {i+1} found for case {case}")
            shutil.copy(dcm_file_name, os.path.join(img_path, f"{slice_id}.dcm"))

            # Merge all labels >= 1 into a single foreground class. uint8 is
            # used directly because int8 cannot hold 255 (NumPy 2 raises on
            # `int8_array * 255`); a 2-D uint8 array is saved as 8-bit grayscale.
            binary_mask = (mask >= 1).astype(np.uint8) * 255
            Image.fromarray(binary_mask).save(os.path.join(mask_path, f"{slice_id}.png"))

            img_ids.append(slice_id)

# Shuffle with a local, seeded generator so re-running the cell yields the same file.
np.random.default_rng(SPLIT_SEED).shuffle(img_ids)

df_train = pd.DataFrame(img_ids)
df_train.to_csv(os.path.join(root, "train.txt"), header=None, index=False, sep=" ")

case ProstateDx-01-0055: mask shape (400, 400, 23) while num slices 34


# Process Testing Set

In [5]:
# Export every annotated testing slice: copy its DICOM file into images/,
# write a binary (0/255) PNG mask into masks/, and list the ids in test.txt.
img_ids = []

test_raw_root = os.path.join(root, "raw", "ISBI-Prostate-Testing", "manifest-WTWyB8IJ8830296727402453766")
test_seg_root = os.path.join(root, "raw", "NCI-ISBI-2013-Prostate-Challenge-Testing")

df_test_raw = pd.read_csv(os.path.join(test_raw_root, "metadata.csv"))

for case_path, num_slices in zip(df_test_raw["File Location"].tolist(), df_test_raw["Number of Images"].tolist()):
    # The case id is taken from the third "/"-separated component of the
    # "File Location" column.
    case = case_path.split("/")[2]

    seg_files = sorted(glob.glob(os.path.join(test_seg_root, f"{case}*.nrrd")))
    if not seg_files:
        raise FileNotFoundError(f"no .nrrd segmentation found for case {case} in {test_seg_root}")
    mask_vol = nrrd.read(seg_files[0])[0]

    # Skip cases whose segmentation does not have one slice per DICOM image.
    if mask_vol.shape[-1] != num_slices:
        print(f"case {case}: mask shape {mask_vol.shape} while num slices {num_slices}")
        continue

    # Index the DICOM files by the integer that ends their file name. A glob
    # such as "*1.dcm" would also match "...11.dcm" and "...21.dcm", and glob
    # order is arbitrary, so the wrong slice could be picked.
    # Assumes file names end in the 1-based slice number (e.g. "1-07.dcm") -- TODO confirm.
    dcm_by_number = {}
    for dcm_path in sorted(glob.glob(os.path.join(test_raw_root, case_path, "*.dcm"))):
        number = re.search(r"(\d+)\.dcm$", os.path.basename(dcm_path))
        if number:
            dcm_by_number[int(number.group(1))] = dcm_path

    for i in range(num_slices):
        mask = np.transpose(mask_vol[:, :, i])

        # Only slices that contain at least one labelled pixel are exported.
        if np.max(mask) > 0:
            slice_id = f"{case}_{i+1}"

            dcm_file_name = dcm_by_number.get(i + 1)
            if dcm_file_name is None:
                raise FileNotFoundError(f"no DICOM file numbered {i+1} found for case {case}")
            shutil.copy(dcm_file_name, os.path.join(img_path, f"{slice_id}.dcm"))

            # Merge all labels >= 1 into a single foreground class. uint8 is
            # used directly because int8 cannot hold 255 (NumPy 2 raises on
            # `int8_array * 255`); a 2-D uint8 array is saved as 8-bit grayscale.
            binary_mask = (mask >= 1).astype(np.uint8) * 255
            Image.fromarray(binary_mask).save(os.path.join(mask_path, f"{slice_id}.png"))

            img_ids.append(slice_id)

df_test = pd.DataFrame(img_ids)
df_test.to_csv(os.path.join(root, "test.txt"), header=None, index=False, sep=" ")