In [1]:
import os
import SimpleITK as sitk
from rt_utils import RTStructBuilder
import numpy as np
from tqdm import tqdm

In [2]:
# Function to read the DICOM series into a SimpleITK image
def read_dicom_image_to_sitk(img_path):
    """Load the DICOM series stored in a directory as one SimpleITK image.

    Args:
        img_path: Directory containing the DICOM series; converted with
            ``str()`` before being handed to SimpleITK, so path-like objects
            are accepted.

    Returns:
        The image produced by ``sitk.ImageSeriesReader.Execute()``.
    """
    series_reader = sitk.ImageSeriesReader()
    # Ask the reader which files make up the series in this directory,
    # then feed that list back to it before executing the read.
    series_reader.SetFileNames(series_reader.GetGDCMSeriesFileNames(str(img_path)))
    return series_reader.Execute()

# Function to load an RT Struct and return the 3D mask of the named ROI (prostate by default)
def extract_prostate_contour(rtstruct_path, dicom_path, ROI_name='Prostate'):
    """Return the mask of one ROI from an RT Struct file.

    Args:
        rtstruct_path: Path to the RT Struct DICOM file.
        dicom_path: Directory of the DICOM image series the RT Struct refers to.
        ROI_name: Name of the region of interest to extract.

    Returns:
        Whatever ``get_roi_mask_by_name`` yields for ``ROI_name``; callers in
        this notebook index it as ``mask[:, :, slice_index]``.
    """
    # Pair the structure set with its image series, then pull the ROI mask.
    structure_set = RTStructBuilder.create_from(
        dicom_series_path=dicom_path,
        rt_struct_path=rtstruct_path,
    )
    return structure_set.get_roi_mask_by_name(ROI_name)



In [3]:
# Function to calculate the number of patients, slices, and prostate slices
def calculate_patient_statistics(dataset_path, contour_names=['contouring', 'structure sets'], image_names=['pelvis', 'prostate', 'images']):
    """Count total slices and prostate-containing slices for every patient.

    The dataset is traversed as ``dataset_path/<patient>/<study>/<series>``:
    inside each study folder, a sub-folder whose lower-cased name contains one
    of ``contour_names`` supplies the RT Struct file, and a sub-folder whose
    name contains one of ``image_names`` supplies the image series.

    Args:
        dataset_path: Root directory holding one folder per patient.
        contour_names: Substrings identifying the RT Struct folder. The
            default list is only read, never mutated.
        image_names: Substrings identifying the image-series folder. The
            default list is only read, never mutated.

    Returns:
        A tuple ``(total_patients, total_slices_per_patient,
        prostate_slices_per_patient)``. Both dicts are keyed by patient folder
        name; ``total_patients`` is the number of patients present in them.
        If a patient has several usable studies, the last one processed wins.

    Studies that fail to load are reported with ``print`` and skipped, so a
    single bad study does not abort the scan.
    """
    total_slices_per_patient = {}
    prostate_slices_per_patient = {}

    # Sorted so that the result order (and any table built from it) is stable
    # across runs and platforms.
    patient_folders = sorted(
        name for name in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, name))
    )

    for folder in tqdm(patient_folders):
        first_level_folder_path = os.path.join(dataset_path, folder)

        study_folders = sorted(
            name for name in os.listdir(first_level_folder_path)
            if os.path.isdir(os.path.join(first_level_folder_path, name))
        )

        for subfolder in study_folders:
            second_level_folder_path = os.path.join(first_level_folder_path, subfolder)

            contour_path, image_path = _find_contour_and_image_paths(
                second_level_folder_path, contour_names, image_names
            )
            if not (contour_path and image_path):
                continue

            try:
                mask_3d = extract_prostate_contour(contour_path, image_path)

                # Number of slices is the Z extent of the image volume.
                image_data = read_dicom_image_to_sitk(image_path)
                total_slices = image_data.GetSize()[2]

                prostate_slices = _count_slices_with_roi(mask_3d, total_slices)

                total_slices_per_patient[folder] = total_slices
                prostate_slices_per_patient[folder] = prostate_slices

            except Exception as e:
                # Deliberate best-effort: report the broken study and carry on.
                print(f"Error processing folder {second_level_folder_path}: {e}")

    # Count patients, not studies: a patient with several usable studies must
    # not be counted more than once.
    total_patients = len(total_slices_per_patient)
    return total_patients, total_slices_per_patient, prostate_slices_per_patient


def _find_contour_and_image_paths(study_dir, contour_names, image_names):
    """Locate the RT Struct file and the image-series directory in one study folder."""
    contour_path = None
    image_path = None

    for entry_name in sorted(os.listdir(study_dir)):
        entry_path = os.path.join(study_dir, entry_name)
        # Both the contour container and the image series are directories;
        # stray files with matching names must not be mistaken for a series.
        if not os.path.isdir(entry_path):
            continue

        lowered = entry_name.lower()
        if any(name in lowered for name in contour_names):
            contour_files = sorted(os.listdir(entry_path))
            if contour_files:
                # Take the first entry in sorted order so the pick is deterministic.
                contour_path = os.path.normpath(os.path.join(entry_path, contour_files[0]))
        elif any(name in lowered for name in image_names):
            image_path = os.path.normpath(entry_path)

    return contour_path, image_path


def _count_slices_with_roi(mask_3d, total_slices):
    """Count how many of the first ``total_slices`` slices (axis 2) contain any mask voxel."""
    if mask_3d.shape[2] < total_slices:
        raise ValueError(
            f"ROI mask has {mask_3d.shape[2]} slices but the image has {total_slices}"
        )
    slice_has_roi = np.any(mask_3d[:, :, :total_slices] > 0, axis=(0, 1))
    return int(np.count_nonzero(slice_has_roi))

In [4]:
# Example usage: scan the whole dataset once and report per-patient counts.
# calculate_patient_statistics returns the patient count plus two dicts keyed
# by patient folder name (total slices, slices containing the prostate).
dataset_path = '../Dataset'  # Path to the dataset directory
total_patients, total_slices, prostate_slices = calculate_patient_statistics(dataset_path)

# Output results: one summary line, then one line per successfully processed patient
print(f"Total number of patients: {total_patients}")
for patient in total_slices:
    print(f"Patient {patient}: {total_slices[patient]} slices, {prostate_slices[patient]} slices with prostate")

 55%|█████▍    | 69/126 [04:51<03:01,  3.18s/it]

Error processing folder ../Dataset\Prostate-AEC-072\01-20-2004-NA-RX SIMULATION-39006: 'Dataset' object has no attribute 'ContourSequence'


 73%|███████▎  | 92/126 [06:13<01:24,  2.48s/it]

Error processing folder ../Dataset\Prostate-AEC-100\02-08-1994-NA-RX SIMULATION-49255: Loaded RTStruct references image(s) that are not contained in input series data. Problematic image has SOP Instance Id: 1.3.6.1.4.1.14519.5.2.1.318018974687115193529751995200758167181


 75%|███████▍  | 94/126 [06:19<01:25,  2.68s/it]

Error processing folder ../Dataset\Prostate-AEC-102\09-22-1992-NA-RX SIMULATION-12336: Loaded RTStruct references image(s) that are not contained in input series data. Problematic image has SOP Instance Id: 1.3.6.1.4.1.14519.5.2.1.99974205625178499890576907774251751574


100%|██████████| 126/126 [08:50<00:00,  4.21s/it]

Total number of patients: 123
Patient Prostate-AEC-001: 179 slices, 24 slices with prostate
Patient Prostate-AEC-002: 105 slices, 13 slices with prostate
Patient Prostate-AEC-003: 163 slices, 24 slices with prostate
Patient Prostate-AEC-004: 179 slices, 31 slices with prostate
Patient Prostate-AEC-005: 166 slices, 25 slices with prostate
Patient Prostate-AEC-006: 161 slices, 29 slices with prostate
Patient Prostate-AEC-007: 193 slices, 43 slices with prostate
Patient Prostate-AEC-008: 178 slices, 34 slices with prostate
Patient Prostate-AEC-009: 189 slices, 26 slices with prostate
Patient Prostate-AEC-010: 159 slices, 21 slices with prostate
Patient Prostate-AEC-011: 150 slices, 24 slices with prostate
Patient Prostate-AEC-012: 158 slices, 23 slices with prostate
Patient Prostate-AEC-013: 171 slices, 23 slices with prostate
Patient Prostate-AEC-014: 162 slices, 40 slices with prostate
Patient Prostate-AEC-015: 167 slices, 34 slices with prostate
Patient Prostate-AEC-016: 149 slices, 24




In [5]:
import os
import pandas as pd

def save_results_to_excel(total_patients, total_slices, prostate_slices, output_file):
    """Write the per-patient slice counts to an Excel workbook.

    Args:
        total_patients: Number of patients. Accepted so existing calls keep
            working; it is not written to the workbook.
        total_slices: Dict mapping patient ID to total number of slices.
        prostate_slices: Dict mapping patient ID to number of slices that
            contain the prostate. Must have exactly the same keys as
            ``total_slices``.
        output_file: Destination path of the ``.xlsx`` file. Missing parent
            directories are created.

    Raises:
        ValueError: If the two dicts do not describe the same set of patients.
            Nothing is written in that case.
    """
    # Validate before touching the filesystem so a bad call leaves no traces.
    mismatched_ids = set(total_slices) ^ set(prostate_slices)
    if mismatched_ids:
        raise ValueError(
            "total_slices and prostate_slices must have the same patient IDs; "
            f"mismatched: {sorted(map(str, mismatched_ids))}"
        )

    # exist_ok avoids the check-then-create race of a separate exists() test.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Look both counts up by patient ID so every row is guaranteed to belong
    # to one patient, regardless of the insertion order of the two dicts.
    patient_ids = list(total_slices)
    df = pd.DataFrame({
        'Patient ID': patient_ids,
        'Total Slices': [total_slices[pid] for pid in patient_ids],
        'Slices with Prostate': [prostate_slices[pid] for pid in patient_ids],
    })

    df.to_excel(output_file, index=False)
    print(f"Results saved to {output_file}")

In [7]:
# Example usage: compute the statistics and persist them as a spreadsheet.
# NOTE(review): this repeats the full dataset scan performed by the earlier
# example cell (several minutes per run according to the recorded progress
# bars). If that cell has already run in the same kernel, its total_patients /
# total_slices / prostate_slices could be passed to save_results_to_excel
# directly — confirm before removing the second scan.
dataset_path = '../Dataset'  # Path to the dataset directory
output_excel_file = '../patient_statistics.xlsx'  # Output Excel file path

total_patients, total_slices, prostate_slices = calculate_patient_statistics(dataset_path)

# Save the results to an Excel file (one row per successfully processed patient)
save_results_to_excel(total_patients, total_slices, prostate_slices, output_excel_file)

  0%|          | 0/126 [00:00<?, ?it/s]

 55%|█████▍    | 69/126 [04:16<02:09,  2.28s/it]

Error processing folder ../Dataset\Prostate-AEC-072\01-20-2004-NA-RX SIMULATION-39006: 'Dataset' object has no attribute 'ContourSequence'


 73%|███████▎  | 92/126 [05:32<01:15,  2.22s/it]

Error processing folder ../Dataset\Prostate-AEC-100\02-08-1994-NA-RX SIMULATION-49255: Loaded RTStruct references image(s) that are not contained in input series data. Problematic image has SOP Instance Id: 1.3.6.1.4.1.14519.5.2.1.318018974687115193529751995200758167181


 75%|███████▍  | 94/126 [05:40<01:41,  3.16s/it]

Error processing folder ../Dataset\Prostate-AEC-102\09-22-1992-NA-RX SIMULATION-12336: Loaded RTStruct references image(s) that are not contained in input series data. Problematic image has SOP Instance Id: 1.3.6.1.4.1.14519.5.2.1.99974205625178499890576907774251751574


100%|██████████| 126/126 [07:46<00:00,  3.70s/it]


Results saved to ../patient_statistics.xlsx
