In [1]:
import os
from tqdm import tqdm
import pydicom
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed

DICOM Archive Path Containing all CTH and CTP Maps - Note: CTP raw files are not included here

In [2]:
# Root of the DICOM archive; every scan below walks this directory tree recursively.
dir_path = r"D:\CTH_archive\DICOM"

Search for all Series Descriptions

In [None]:
def process_file(file_path):
    """Return the SeriesDescription of a single DICOM file.

    Only the header is read (``stop_before_pixels=True``). Any error while
    reading the file or accessing the attribute results in ``None`` instead
    of an exception, so non-DICOM files in the tree are simply ignored.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The file's ``SeriesDescription`` value, or ``None`` on any failure.
    """
    try:
        header = pydicom.dcmread(file_path, stop_before_pixels=True)
        description = header.SeriesDescription
    except Exception:
        description = None
    return description

def collect_unique_series_descriptions(dir_path):
    """Print every distinct SeriesDescription found below ``dir_path``.

    All files under ``dir_path`` are listed first, then handed to
    ``process_file`` on a thread pool. Results that are ``None`` (files that
    could not be read) are dropped; each remaining value is printed once,
    in set iteration order.

    Args:
        dir_path: Root directory to walk recursively.

    Returns:
        None. The unique descriptions are written to stdout.
    """
    # List every file up front so the progress bar knows its total.
    all_paths = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(dir_path)
        for name in names
    ]

    seen_descriptions = set()
    with ThreadPoolExecutor() as pool:
        pending = [pool.submit(process_file, path) for path in all_paths]

        # Consume results as they finish rather than in submission order.
        for done in tqdm(as_completed(pending), total=len(pending), desc="Processing files"):
            description = done.result()
            if description is None:
                continue
            seen_descriptions.add(description)

    for description in seen_descriptions:
        print(description)

# Scan the whole archive and print each distinct SeriesDescription once.
collect_unique_series_descriptions(dir_path)

In [None]:
# Sequentially walk the archive and record the path of every file whose
# SeriesDescription is "routine head" (case-insensitive). The resulting
# `matching_files` list is consumed by the copy cell further down.
# NOTE(review): only "routine head" is collected here, while the validation
# cell below lists five axial-head series descriptions - confirm this is intended.
matching_files = []
for root, dirs, files in tqdm(os.walk(dir_path)):
    for file in files:
        # Construct the full file path
        file_path = os.path.join(root, file)
        try:
            # Only the header is needed to read SeriesDescription, so skip the
            # pixel data instead of loading every image into memory.
            dicom_file = pydicom.dcmread(file_path, stop_before_pixels=True)
            # Check the SeriesDescription
            if dicom_file.SeriesDescription.lower() == "routine head":
                matching_files.append(file_path)
        except Exception:
            # Best effort: files that are not DICOM or lack SeriesDescription
            # are skipped. `Exception` (not a bare `except`) keeps
            # KeyboardInterrupt working, so the scan can still be interrupted.
            continue

Search for all standard axial CT heads in dataset

In [5]:
def process_file(file_path, series_descriptions_to_match):
    """Read one DICOM header and classify its series against a list of descriptions.

    Note: this redefines the one-argument ``process_file`` from the earlier
    "Search for all Series Descriptions" cell. Re-run that cell before calling
    ``collect_unique_series_descriptions`` again.

    Args:
        file_path: Path of the file to inspect.
        series_descriptions_to_match: Container of lower-case series
            descriptions; membership is tested with ``in``.

    Returns:
        ``(patient_id, series_description)`` when the lower-cased
        SeriesDescription is in ``series_descriptions_to_match``;
        ``(patient_id, None)`` when the file is readable but its series is
        not in the list; ``(None, None)`` when the file cannot be read or
        lacks PatientID / SeriesDescription.
    """
    try:
        # Read the header only; pixel data is not needed for this check.
        dicom_file = pydicom.dcmread(file_path, stop_before_pixels=True)
        series_description = dicom_file.SeriesDescription.lower()
        patient_id = dicom_file.PatientID
    except Exception:
        # Best effort: non-DICOM files and files missing either attribute are
        # ignored. `Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return None, None

    if series_description in series_descriptions_to_match:
        return patient_id, series_description
    # Still report the patient so that patients without any matching series
    # can be detected by the caller.
    return patient_id, None


def report_issues(patient_series_map):
    """Print patients whose number of matching series types is not exactly one.

    Args:
        patient_series_map: Mapping of patient ID to the set of matching
            series descriptions found for that patient (possibly empty).
    """
    for patient_id, series_set in patient_series_map.items():
        if len(series_set) == 0:
            print(f"No matching series found for patient {patient_id}")
        elif len(series_set) > 1:
            print(f"More than one type of matching series found for patient {patient_id}: {series_set}")


def collect_matching_series(dir_path, series_descriptions_to_match):
    """Check that every patient under ``dir_path`` has exactly one matching series type.

    Every file below ``dir_path`` is inspected with ``process_file`` on a
    thread pool. Each patient seen in a readable file gets an entry in the
    map, even when none of that patient's files match, so ``report_issues``
    can flag patients with zero matches as well as patients with several.

    Args:
        dir_path: Root directory to walk recursively.
        series_descriptions_to_match: Series descriptions to look for;
            compared case-insensitively.

    Returns:
        None. Findings are printed by ``report_issues``.
    """
    # Normalize to lowercase; a set also drops duplicates.
    wanted = {desc.lower() for desc in series_descriptions_to_match}

    # Collect all file paths up front so the progress bar knows its total.
    file_paths = [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(dir_path)
        for name in files
    ]

    patient_series_map = {}
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_file, file_path, wanted) for file_path in file_paths]

        for future in tqdm(as_completed(futures), total=len(futures), desc="Processing files"):
            patient_id, series_description = future.result()
            if not patient_id:
                # Unreadable file or no usable patient ID.
                continue
            # Register the patient even without a match so that a patient with
            # zero matching series ends up with an empty set and is reported.
            matched_series = patient_series_map.setdefault(patient_id, set())
            if series_description:
                matched_series.add(series_description)

    # Report any issues found during validation
    report_issues(patient_series_map)

# Series descriptions treated as a standard axial CT head. Matching is an exact
# comparison against the lower-cased SeriesDescription, so the repeated spaces
# inside some entries are significant.
# NOTE(review): "axial" is very generic - confirm it only occurs on head series in this archive.
series_descriptions_to_match = [
    "routine head",
    "head  5.0  j45s  1",
    "head  5.0  mpr  ax",
    "axial head 5.0",
    "axial"
]

# Validate the archive: prints a line for each patient flagged by report_issues.
collect_matching_series(dir_path, series_descriptions_to_match)

Copy all CTH standard axial studies to their own directories

In [None]:
# Destination root; the source root is `dir_path`, and the files to copy come
# from `matching_files` (both defined in earlier cells).
dst_dir_path = r"D:\CTH_archive\CTH_DICOM"

# Mirror each matching file under the destination root, keeping its path
# relative to the archive root so the folder layout is preserved.
for src_file_path in matching_files:
    relative_path = os.path.relpath(src_file_path, start=dir_path)
    dst_file_path = os.path.join(dst_dir_path, relative_path)

    # Make sure the parent folder exists before copying.
    os.makedirs(os.path.split(dst_file_path)[0], exist_ok=True)

    # copy2 copies the file contents and attempts to preserve file metadata.
    shutil.copy2(src_file_path, dst_file_path)