In [None]:
import os
from tqdm import tqdm
import pydicom
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed

DICOM archive path containing all CTH and CTP maps. Note: CTP raw files are not included here.

In [3]:
# Root folder that the cells below walk recursively when looking for DICOM files.
dir_path = r"D:\CTH_archive\DICOM"

Search for all Series Descriptions

In [5]:
def process_file(file_path):
    """Return the SeriesDescription stored in one file, or None.

    The file is read with pydicom without its pixel data. Any exception
    raised while reading it or while accessing the attribute yields None.
    """
    try:
        header = pydicom.dcmread(file_path, stop_before_pixels=True)
        description = header.SeriesDescription
    except Exception:
        description = None
    return description

def collect_unique_series_descriptions(dir_path):
    """Print every distinct series description found beneath ``dir_path``.

    All file paths are gathered first, then each one is handed to
    ``process_file`` on a thread pool. Results that are None are ignored.
    Nothing is returned; the distinct values are printed one per line.
    """
    # Gather the full list of candidate files before dispatching any work.
    all_paths = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(dir_path)
        for name in names
    ]

    found = set()
    with ThreadPoolExecutor() as pool:
        pending = [pool.submit(process_file, path) for path in all_paths]

        # Consume results in completion order so the progress bar advances steadily.
        for done in tqdm(as_completed(pending), total=len(pending), desc="Processing files"):
            description = done.result()
            if description is not None:
                found.add(description)

    for description in found:
        print(description)

# Walk dir_path and print each distinct SeriesDescription that was found.
collect_unique_series_descriptions(dir_path)

Processing files: 100%|██████████| 310342/310342 [07:55<00:00, 652.38it/s] 


VPCT  DynMulti4D  1.5  Hr35 26
COR SOFT
VIZ CTP AT BOLUS TIME
CTP                                                        PN:11
CTP  1.5  Hr38 24
Head  0.6  MPR  ax
CTP                                                        PN:06
CT PERFUSION                                               PN:30
SAG SOFT
Bone
AX BONE
Patient Protocol
CTP, iDose (1)
CTP                                                        PN:08
CTP                                                        PN:13
VPCT   PERFUSION  1.5  Hr35 16
VPCT  DynMulti4D  1.5  Hr35 20
CTP  5.0  Hr38
VPCT  DynMulti4D  1.5  Hr35 13
CTP                                                        PN:14
VPCT   PERFUSION  1.5  Hr35 25
PDF_REPORT_OLEA_ANALYSIS
VIZ SUMMARY
VPCT   PERFUSION  5.0  Hr35
CT PERFUSION                                               PN:24
VPCT  DynMulti4D  1.5  Hr35 21
Viz MTT
AP, LAT HEAD SCOUT STROKE
CTP  1.5  Hr38 7
CTP  1.5  Hr38 1
VPCT  DynMulti4D  1.5  Hr35 12
DOC SCAN
ISOTROPIC
VIZ CBF
LVO Back to PACS
VPCT   PERFUSI

Search for all standard axial CT heads in dataset

In [None]:
def process_file(file_path, series_descriptions_to_match):
    """Check whether one file is a DICOM file with a wanted series description.

    Args:
        file_path: Path of the file to inspect.
        series_descriptions_to_match: Container of lowercase series
            descriptions. The file's SeriesDescription is lowercased and
            tested for exact membership in it.

    Returns:
        ``(patient_id, series_description, file_path)`` when the file
        matches; otherwise ``(None, None, None)``, which covers both "no
        match" and "the file could not be read or lacks the attributes".
    """
    try:
        # Read the file as a DICOM file without loading pixel data
        dicom_file = pydicom.dcmread(file_path, stop_before_pixels=True)
        series_description = dicom_file.SeriesDescription.lower()
        patient_id = dicom_file.PatientID
        # Check if the series description matches any in the list
        if series_description in series_descriptions_to_match:
            return patient_id, series_description, file_path
    except Exception:
        # Best effort: unreadable files are skipped. Catching Exception
        # rather than using a bare ``except:`` lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        pass
    return None, None, None

def report_issues(patient_series_map):
    """Print a per-patient summary of matched series and flag anomalies.

    Args:
        patient_series_map: Mapping of patient ID to a collection of
            ``(series_description, file_path)`` tuples.

    For each patient one summary line is printed, followed by a warning
    when there are no matching series or more than one distinct series
    description.
    """
    for patient_id, series_data in patient_series_map.items():
        series_set = {sd[0] for sd in series_data}  # Extract series descriptions
        # The file count is the number of entries in series_data, not the
        # number of distinct descriptions in series_set.
        print(f"Patient {patient_id} has matching series: {series_set} and {len(series_data)} matching files.")
        if len(series_set) == 0:
            print(f"No matching series found for patient {patient_id}")
        elif len(series_set) > 1:
            print(f"More than one type of matching series found for patient {patient_id}: {series_set}")

def collect_matching_series(dir_path, series_descriptions_to_match):
    """Find files beneath ``dir_path`` whose series description is wanted.

    The wanted descriptions are lowercased, every file under ``dir_path`` is
    handed to ``process_file`` on a thread pool, and matches are grouped per
    patient ID as ``(series_description, file_path)`` tuples. The grouping is
    passed to ``report_issues`` before the matching paths are returned.

    Returns:
        A list of the file paths of all matching files.
    """
    # Normalize to lowercase
    wanted = [text.lower() for text in series_descriptions_to_match]

    # Collect all file paths
    candidates = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(dir_path)
        for name in names
    ]

    patient_series_map = {}
    with ThreadPoolExecutor() as pool:
        pending = [pool.submit(process_file, path, wanted) for path in candidates]

        for done in tqdm(as_completed(pending), total=len(pending), desc="Processing files"):
            patient_id, series_description, file_path = done.result()
            if not (patient_id and series_description):
                continue
            patient_series_map.setdefault(patient_id, set()).add((series_description, file_path))

    # Report any issues found during validation
    report_issues(patient_series_map)

    # Flatten the per-patient groups into a single list of paths
    return [path for matches in patient_series_map.values() for _, path in matches]

# Series descriptions that identify a standard axial CT head series.
# Whitespace inside each entry is significant: matching is by exact
# (lowercased) string membership.
series_descriptions_to_match = [
    "routine head",
    "head  5.0  j45s  1",
    "head  5.0  mpr  ax",  # was listed twice; one entry is sufficient
    # NOTE(review): the leading space is kept from the original entry;
    # confirm the stored description really starts with one.
    " axial head 5.0",
    "axial",
    "axial (stroke alert)",
    # A missing comma here used to fuse this entry with the next one into
    # "axial 1.2x1.2ax5st.", so neither description could match.
    "axial 1.2x1.2",
    "ax5st.",
    "ax soft",
    "axial stnd",
    "soft",
]

# Run the search beneath dir_path and report how many matching files it returned.
matching_files = collect_matching_series(dir_path, series_descriptions_to_match)
print(f"Found {len(matching_files)} matching files.")

In [10]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import pydicom
import os

def process_file(file_path, series_descriptions_to_match):
    """Check whether one file is a DICOM file with a wanted series description.

    Args:
        file_path: Path of the file to inspect.
        series_descriptions_to_match: Container of lowercase series
            descriptions. The file's SeriesDescription is lowercased and
            tested for exact membership in it.

    Returns:
        ``(patient_id, file_path)`` when the file matches; otherwise
        ``(None, None)``, which covers both "no match" and "the file could
        not be read or lacks the attributes".
    """
    try:
        # Read the file as a DICOM file without loading pixel data
        dicom_file = pydicom.dcmread(file_path, stop_before_pixels=True)
        series_description = dicom_file.SeriesDescription.lower()
        patient_id = dicom_file.PatientID
        # Check if the series description matches any in the list
        if series_description in series_descriptions_to_match:
            return patient_id, file_path  # Only return the patient ID and file path if there's a match
    except Exception:
        # Best effort: unreadable files are skipped. Catching Exception
        # rather than using a bare ``except:`` lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        pass
    return None, None

def report_issues(patient_series_map):
    """Print how many patient IDs have at least one matching entry.

    Args:
        patient_series_map: Mapping of patient ID to a collection of
            matches; patients whose collection is empty are not counted.
    """
    unique_patient_ids_with_matches = {
        patient_id
        for patient_id, series_data in patient_series_map.items()
        if series_data
    }

    print(f"Number of unique patient IDs with matching series: {len(unique_patient_ids_with_matches)}")

def collect_matching_series(dir_path, series_descriptions_to_match):
    """Find files beneath ``dir_path`` whose series description is wanted.

    The wanted descriptions are lowercased and every file under ``dir_path``
    is submitted to ``process_file`` on a thread pool while the tree is being
    walked. Matching file paths are grouped per patient ID, the grouping is
    passed to ``report_issues``, and the paths are returned.

    Returns:
        A list of the file paths of all matching files.
    """
    # Normalize to lowercase
    wanted = [text.lower() for text in series_descriptions_to_match]
    patient_series_map = {}

    with ThreadPoolExecutor() as pool:
        # Submit one task per file as the directory walk produces it.
        pending = []
        for folder, _subdirs, names in os.walk(dir_path):
            for name in names:
                pending.append(pool.submit(process_file, os.path.join(folder, name), wanted))

        for done in tqdm(as_completed(pending), total=len(pending), desc="Processing files"):
            patient_id, file_path = done.result()
            if patient_id:
                patient_series_map.setdefault(patient_id, set()).add(file_path)

    # Report the number of unique patient IDs with matching series
    report_issues(patient_series_map)

    # Flatten the per-patient groups into a single list of paths
    return [path for paths in patient_series_map.values() for path in paths]

# Lowercase series descriptions selecting the series to collect in this pass.
# Whitespace inside each entry is significant: matching is by exact
# (lowercased) string membership.
series_descriptions_to_match = [
    "ctp                                                        pn:01",
    "ct perfusion                                               pn:01",
    "vpct   perfusion  1.5  hr35 1",
    "ctp  1.5  hr38 1",
    "vpct  dynmulti4d  1.5  hr35 1",
]

# Run the search beneath dir_path and report how many matching files it returned.
matching_files = collect_matching_series(dir_path, series_descriptions_to_match)
print(f"Found {len(matching_files)} matching files.")


Processing files: 100%|██████████| 310342/310342 [08:08<00:00, 634.81it/s] 


Number of unique patient IDs with matching series: 70
Found 4101 matching files.


Copy all CTH standard axial studies to their own directories

In [None]:
# Destination root; the source root is dir_path, defined earlier in the notebook.
# NOTE(review): matching_files is assigned by more than one search cell above;
# confirm it holds the intended series before running this copy.
dst_dir_path = r"D:\CTH_archive\CTP_DICOM"

# Mirror each matching file beneath the destination root, preserving its
# location relative to dir_path.
for src_file_path in matching_files:
    relative_path = os.path.relpath(src_file_path, start=dir_path)
    dst_file_path = os.path.join(dst_dir_path, relative_path)

    # Ensure the parent folder exists before copying into it.
    dst_parent = os.path.dirname(dst_file_path)
    os.makedirs(dst_parent, exist_ok=True)

    shutil.copy2(src_file_path, dst_file_path)
    

Search Series Descriptions for a Single Patient

In [None]:
import os
import pydicom

def list_dicom_series_descriptions(directory):
    """Print the SeriesDescription of every ``.dcm`` file beneath ``directory``.

    The tree is walked recursively. Files that cannot be read are reported
    and skipped; readable files without a SeriesDescription print nothing.

    Args:
        directory: Root folder to search.
    """
    for root, dirs, files in os.walk(directory):
        for file in files:
            # Case-insensitive so that names such as ``IMG001.DCM`` are not skipped.
            if not file.lower().endswith('.dcm'):
                continue
            try:
                filepath = os.path.join(root, file)
                # Only a header attribute is printed, so the pixel data is
                # not read (as in the other readers in this notebook).
                dicom_file = pydicom.dcmread(filepath, stop_before_pixels=True)
                if hasattr(dicom_file, 'SeriesDescription'):
                    print(f"File: {filepath} - Series Description: {dicom_file.SeriesDescription}")
            except Exception as e:
                print(f"Could not read {file}: {e}")

# Folder of the single patient to inspect.
# NOTE(review): this folder name appears to contain a patient name and ID;
# confirm it is de-identified before sharing or committing the notebook.
directory = "D:\\CTH_archive\\DICOM\\PLAZA_RAUL A100582\\"
list_dicom_series_descriptions(directory)


In [None]:
import os
import pydicom

def print_dicom_metadata(directory):
    """Print the full dataset of every ``.dcm`` file beneath ``directory``.

    The tree is walked recursively. For each candidate file either its
    dataset or a "Could not read" message is printed, followed by a
    separator line.

    Args:
        directory: Root folder to search.
    """
    for root, dirs, files in os.walk(directory):
        for file in files:
            # Case-insensitive so that names such as ``IMG001.DCM`` are not skipped.
            if not file.lower().endswith('.dcm'):
                continue
            try:
                filepath = os.path.join(root, file)
                # Read the complete file so the printed dataset is unchanged.
                dicom_file = pydicom.dcmread(filepath)
                print(f"Metadata for {filepath}:")
                print(dicom_file)
            except Exception as e:
                print(f"Could not read {file}: {e}")
            print('-' * 80)  # Just a separator for readability

# Folder of the single patient whose metadata is dumped.
# NOTE(review): this folder name appears to contain a patient name and ID;
# confirm it is de-identified before sharing or committing the notebook.
directory = "D:\\CTH_archive\\DICOM\\LASUSA_PETER_ROBERT E710770\\"
print_dicom_metadata(directory)
