# Investigating DICOM metadata attributes

In [1]:
from utils.common_imports import *
from tqdm import tqdm
import re
from glob import glob
from utils.utils import get_scan_directory_path_by_patient_id

In [None]:
# Read a single example DICOM file; the bare expression on the last line makes
# the notebook display the loaded dataset.
sample_dicom_path = "/Users/newuser/Documents/ITU/master_thesis/data/lung_data/manifest-1725363397135/LIDC-IDRI/LIDC-IDRI-0001/01-01-2000-NA-NA-30178/3000566.000000-NA-03192/1-057.dcm"
dicom_image = pydicom.dcmread(sample_dicom_path)
dicom_image

In [3]:
# a FileDataset object is a dictionary-like object
# see class documentation: https://pydicom.github.io/pydicom/stable/reference/generated/pydicom.dataset.FileDataset.html#pydicom.dataset.FileDataset

def get_dicom_vals(dicom_file: pydicom.dataset.FileDataset):
    """Return one cleaned value per entry of ``dicom_file.to_json_dict()``.

    Each entry of the JSON dict is expected to be a dict itself:

    * a ``"Value"`` list with exactly one item is unwrapped to that item,
    * any other ``"Value"`` list (empty or multi-valued) is returned as is,
    * an entry without a ``"Value"`` key yields ``None``.

    Raises:
        ValueError: if an entry of the JSON dict is not a dict.
    """
    cleaned = []
    for entry in dicom_file.to_json_dict().values():
        if not isinstance(entry, dict):
            raise ValueError(f"Value is not a dict: {type(entry)}")
        if "Value" not in entry:
            cleaned.append(None)
            continue
        payload = entry["Value"]
        # Single-valued attributes are stored as one-element lists; unwrap them.
        cleaned.append(payload[0] if len(payload) == 1 else payload)

    return cleaned

def get_dicom_keys(dicom_file: pydicom.dataset.FileDataset):
    """Return a cleaned key for every element of ``dicom_file``.

    The key is derived from the string form of each element yielded by
    ``dicom_file.items()``: the text between the first ``")"`` and the first
    following ``":"`` is taken, and runs of whitespace are collapsed to a
    single space.

    NOTE(review): this assumes ``str(element)`` looks like
    ``"(gggg, eeee) Name   VR: value"``, which would give keys of the form
    ``"Name VR"`` -- confirm against the installed pydicom version.

    Args:
        dicom_file: the dataset whose elements are inspected. Keys are returned
            in the iteration order of ``dicom_file.items()``.

    Returns:
        A list with one key string per element.

    Raises:
        IndexError: if an element's string form contains no ``")"``.
    """

    def extract_key(element_text: str) -> str:
        # Drop everything up to and including the first ")" plus the single
        # character after it, then keep the part before the first ":".
        label = element_text.split(")")[1][1:].split(":")[0]
        return re.sub(r'\s+', ' ', label)

    # Iterate the dataset passed in, not a module-level variable, so the keys
    # always belong to the same file as the values they are paired with.
    return [extract_key(str(element)) for _, element in dicom_file.items()]

def parse_dicom_to_dict(dicom_file: pydicom.dataset.FileDataset):
    """Pair the cleaned keys of ``dicom_file`` with its cleaned values.

    Keys come from ``get_dicom_keys`` and values from ``get_dicom_vals``; they
    are matched by position. If a key occurs more than once, the last value
    wins; surplus keys or values are silently dropped by ``zip``.
    """
    field_names = get_dicom_keys(dicom_file)
    field_values = get_dicom_vals(dicom_file)
    return {name: value for name, value in zip(field_names, field_values)}

In [22]:
# Load all dicom files in a patient scan directory
# and check if the meta keys are consistent across all files
def collect_meta_fields(patient_scan_dir: str) -> dict:
    """Collect the meta fields of every ``.dcm`` file in a scan directory.

    Args:
        patient_scan_dir: directory that directly contains the ``*.dcm`` files
            of one scan. Files are processed in sorted path order.

    Returns:
        A dict mapping each meta field key (as produced by
        ``parse_dicom_to_dict``) to the list of values seen for that key, one
        entry per file that contains the key. An empty dict is returned when
        the directory holds no ``.dcm`` files.
    """
    patient_scan_paths = sorted(glob(f"{patient_scan_dir}/*.dcm"))
    collected_meta_fields = {}
    for scan_path in patient_scan_paths:
        with pydicom.dcmread(fp=scan_path, force=True) as dicom_image:
            # NOTE: accessing pixel_array (or printing the dataset) forces the
            # image to be read into memory before it is parsed. Do not remove.
            try:
                dicom_image.pixel_array
            except Exception:
                # Deliberately broad: whatever goes wrong while decoding the
                # pixels, fall back to printing the dataset and keep going.
                print(dicom_image)
            meta_fields = parse_dicom_to_dict(dicom_image)

        for key, value in meta_fields.items():
            collected_meta_fields.setdefault(key, []).append(value)

    return collected_meta_fields

# Collect the meta fields of one example scan and remember its field names.
patient_scan_dir = "/Users/newuser/Documents/ITU/master_thesis/data/lung_data/manifest-1725363397135/LIDC-IDRI/LIDC-IDRI-0001/01-01-2000-NA-NA-30178/3000566.000000-NA-03192"
collected_meta_fields = collect_meta_fields(patient_scan_dir=patient_scan_dir)

# Field names in the order they were first encountered in this scan.
res_keys = [*collected_meta_fields]

In [23]:
# check that all meta data for all dicom images in the scan are equal
def all_equal(lst: list) -> bool:
    """Check whether every element of ``lst`` equals its first element.

    Args:
        lst: the values to compare; elements are compared with ``==``.

    Returns:
        ``True`` if all elements are equal to the first one, or if ``lst`` is
        empty (there is nothing that could differ); ``False`` otherwise.
    """
    if not lst:
        # An empty list has no first element to compare against.
        return True
    first_element = lst[0]
    # A generator lets ``all`` stop at the first mismatch.
    return all(first_element == x for x in lst)

def all_equal_dict(d: dict) -> list[bool]:
    """Apply ``all_equal`` to every value of ``d``.

    Assumes each value of ``d`` is a list. The result holds one boolean per
    entry, in the dict's iteration order, telling whether all elements of that
    entry's list are equal to each other.
    """
    return list(map(all_equal, d.values()))

In [None]:
# For every patient, record per meta field whether its value is identical across
# all DICOM files of that patient's scan. The rows are turned into a table
# (and optionally exported) in the next cell.
all_patient_ids = sorted(d for d in os.listdir(config.DATA_DIR) if "LIDC-IDRI" in d)

df_dicts = []

# For a quick smoke test, iterate over all_patient_ids[:10] instead.
for pid in tqdm(all_patient_ids):
    pid_scan_file_dir = get_scan_directory_path_by_patient_id(patient_id_dir=pid)
    collected_meta_fields = collect_meta_fields(pid_scan_file_dir)
    equal_results = all_equal_dict(collected_meta_fields)
    # Label the results with this patient's own field names. Using the field
    # names of a different scan would mislabel or drop results whenever the two
    # scans do not share exactly the same fields in the same order.
    df_dicts.append(dict(zip(collected_meta_fields, equal_results)))

In [28]:
# One row per patient, one column per meta field.
df = pd.DataFrame(data=df_dicts, index=all_patient_ids)
# df.to_excel("out/meta_data_check.xlsx")