# Investigating DICOM metadata attributes

In [1]:
from utils.common_imports import *
import re
import ast
from glob import glob

In [53]:
# Load one .dcm file of the LIDC-IDRI-0001 scan so its metadata can be inspected.
# NOTE(review): absolute, machine-specific path; it will not resolve on another machine.
dicom_image = pydicom.dcmread("/Users/newuser/Documents/ITU/master_thesis/data/lung_data/manifest-1725363397135/LIDC-IDRI/LIDC-IDRI-0001/01-01-2000-NA-NA-30178/3000566.000000-NA-03192/1-057.dcm")
# parse_dicom_to_dict(dicom_image)

# dicom_image.to_json_dict() # another function that is available in pydicom. Does not return proper keys though
# Bare expression: the notebook renders the dataset as this cell's output.
dicom_image

Dataset.file_meta -------------------------------
(0002, 0000) File Meta Information Group Length  UL: 206
(0002, 0001) File Meta Information Version       OB: b'\x00\x01'
(0002, 0002) Media Storage SOP Class UID         UI: CT Image Storage
(0002, 0003) Media Storage SOP Instance UID      UI: 1.3.6.1.4.1.14519.5.2.1.6279.6001.313544823773855097029348077255
(0002, 0010) Transfer Syntax UID                 UI: Explicit VR Little Endian
(0002, 0012) Implementation Class UID            UI: 1.3.6.1.4.1.22213.1.143
(0002, 0013) Implementation Version Name         SH: '0.5'
(0002, 0016) Source Application Entity Title     AE: 'POSDA'
-------------------------------------------------
(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0008) Image Type                          CS: ['ORIGINAL', 'PRIMARY', 'AXIAL']
(0008, 0016) SOP Class UID                       UI: CT Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.3.6.1.4.1.14519.5.2.1.6279.6001.31354

In [73]:
# a FileDataset object is a dictionary-like object
# see class documentation: https://pydicom.github.io/pydicom/stable/reference/generated/pydicom.dataset.FileDataset.html#pydicom.dataset.FileDataset

def get_dicom_vals(dicom_file: pydicom.dataset.FileDataset):
    """Collect one value per element of ``dicom_file``, in ``to_json_dict()`` order.

    Each entry of ``dicom_file.to_json_dict()`` is inspected:

    * no ``"Value"`` key        -> ``None``
    * ``"Value"`` of length one -> that single item, unwrapped
    * any other ``"Value"``     -> the list as-is

    Raises:
        ValueError: if an entry of the JSON dict is not itself a dict.
    """
    extracted = []
    for entry in dicom_file.to_json_dict().values():
        # Guard first: every entry is expected to be a {"vr": ..., "Value": ...} mapping.
        if not isinstance(entry, dict):
            raise ValueError(f"Value is not a dict: {type(entry)}")

        if "Value" not in entry:
            extracted.append(None)
            continue

        payload = entry["Value"]
        # Unwrap single-item lists so scalar attributes come back as scalars.
        extracted.append(payload[0] if len(payload) == 1 else payload)

    return extracted

def get_dicom_keys(dicom_file: pydicom.dataset.FileDataset):
    """Return a readable ``"<element name> <VR>"`` key for every element of ``dicom_file``.

    The key is parsed from the string form of each element, which looks like
    ``"(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'"``:
    the leading tag is dropped, everything up to the first ``:`` is kept, and
    runs of whitespace are collapsed to a single space.

    Keys are produced in ``dicom_file.keys()`` order so they can be zipped with
    the values returned by ``get_dicom_vals``.

    Args:
        dicom_file: the dataset whose elements are described.

    Returns:
        list of str, one key per element.

    Raises:
        IndexError: if an element's string form contains no ``)``.
    """
    return_keys = []
    # Iterate the dataset that was passed in (not a notebook-level global), so
    # the keys always describe the same file as the values they are paired with.
    for tag in dicom_file.keys():
        # Indexing by tag returns a parsed DataElement; items() may still hold
        # raw, not-yet-parsed elements whose str() is not in the format above.
        element_repr = str(dicom_file[tag])
        # Split only on the first ")" (end of the tag) so names that contain
        # parentheses themselves, e.g. "Image Position (Patient)", stay intact.
        after_tag = element_repr.split(")", 1)[1]
        name_and_vr = after_tag[1:].split(":")[0]
        return_keys.append(re.sub(r'\s+', ' ', name_and_vr))
    return return_keys

def parse_dicom_to_dict(dicom_file: pydicom.dataset.FileDataset):
    """Build a ``{key: value}`` dict of the metadata in ``dicom_file``.

    Keys come from ``get_dicom_keys`` and values from ``get_dicom_vals``; the
    two lists are paired positionally. If the lists differ in length the
    surplus is dropped by ``zip``, and a repeated key keeps only its last value.
    """
    names = get_dicom_keys(dicom_file)
    values = get_dicom_vals(dicom_file)
    return {name: value for name, value in zip(names, values)}

# Bare expression: the parsed metadata dict of the example file is shown as this cell's output.
parse_dicom_to_dict(dicom_image)

{'Specific Character Set CS': 'ISO_IR 100',
 'Image Type CS': ['ORIGINAL', 'PRIMARY', 'AXIAL'],
 'SOP Class UID UI': '1.2.840.10008.5.1.4.1.1.2',
 'SOP Instance UID UI': '1.3.6.1.4.1.14519.5.2.1.6279.6001.313544823773855097029348077255',
 'Study Date DA': '20000101',
 'Series Date DA': '20000101',
 'Acquisition Date DA': '20000101',
 'Content Date DA': '20000101',
 'Overlay Date DA': '20000101',
 'Curve Date DA': '20000101',
 'Acquisition DateTime DT': '20000101',
 'Study Time TM': None,
 'Acquisition Time TM': None,
 'Content Time TM': None,
 'Accession Number SH': None,
 'Modality CS': 'CT',
 'Manufacturer LO': 'GE MEDICAL SYSTEMS',
 "Referring Physician's Name PN": None,
 "Manufacturer's Model Name LO": 'LightSpeed Plus',
 'Referenced SOP Instance UID UI': '1.3.6.1.4.1.14519.5.2.1.6279.6001.258745335770133974769679057210',
 "Patient's Name PN": None,
 'Patient ID LO': 'LIDC-IDRI-0001',
 "Patient's Birth Date DA": None,
 "Patient's Sex CS": None,
 "Patient's Age AS": None,
 'Last Men

In [35]:
# a FileDataset object is a dictionary-like object
# see class documentation: https://pydicom.github.io/pydicom/stable/reference/generated/pydicom.dataset.FileDataset.html#pydicom.dataset.FileDataset


def extract_meta_key_val_pair(val: str) -> tuple:
    """Split the string form of a DICOM element into a ``(key, value)`` pair.

    ``val`` is expected to look like
    ``"(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'"``.

    Args:
        val: string form of one data element.

    Returns:
        A 2-tuple ``(key, value)`` of strings. ``key`` is the text between the
        tag and the first ``:`` with whitespace runs collapsed (element name
        followed by its VR); ``value`` is everything after that first ``:``,
        stripped of surrounding whitespace.

    Raises:
        IndexError: if ``val`` contains no ``)`` or no ``:`` after the tag.
    """
    # Split on the FIRST ")" only: it closes the tag. Names such as
    # "Image Position (Patient)" contain further parentheses that must be kept.
    after_tag = val.split(")", 1)[1]
    # Split on the FIRST ":" only: it separates the VR from the value. Values
    # may contain colons themselves and must not be cut short.
    parts = after_tag.split(":", 1)
    key = re.sub(r'\s+', ' ', parts[0][1:])
    value = parts[1].strip()
    return key, value


def parse_dicom_to_dict_OLD(dicom_image: pydicom.dataset.FileDataset) -> dict:
    """Earlier, string-parsing variant of the metadata parser.

    Every element of ``dicom_image`` is converted to its string form, split
    into key and value text by ``extract_meta_key_val_pair``, and the value
    text is coerced:

    * empty string                -> ``None``
    * all digits                  -> ``float``
    * text wrapped in ``[`` ``]`` -> parsed with ``ast.literal_eval``
    * anything else               -> kept as the raw string
    """

    def _coerce(text):
        # Order matters: the empty check must run before the other string tests.
        if text == '':
            return None
        if text.isdigit():
            return float(text)
        # List-like text such as "['ORIGINAL', 'PRIMARY', 'AXIAL']" is
        # evaluated as a literal only, never as arbitrary code.
        if text.startswith('[') and text.endswith(']'):
            return ast.literal_eval(text)
        return text

    parsed = {}
    for _, element in dicom_image.items():
        name, text = extract_meta_key_val_pair(str(element))
        parsed[name] = _coerce(text)
    return parsed

# Re-load the example file of the LIDC-IDRI-0001 scan.
# NOTE(review): absolute, machine-specific path; it will not resolve on another machine.
dicom_image = pydicom.dcmread("/Users/newuser/Documents/ITU/master_thesis/data/lung_data/manifest-1725363397135/LIDC-IDRI/LIDC-IDRI-0001/01-01-2000-NA-NA-30178/3000566.000000-NA-03192/1-057.dcm")
# parse_dicom_to_dict_OLD(dicom_image)

# The JSON form keys each element by its 8-character tag string (e.g. '00080005')
# instead of a readable name, which is why the custom parsers above exist.
dicom_image.to_json_dict() # another function that is available in pydicom. Does not return proper keys though

{'00080005': {'vr': 'CS', 'Value': ['ISO_IR 100']},
 '00080008': {'vr': 'CS', 'Value': ['ORIGINAL', 'PRIMARY', 'AXIAL']},
 '00080016': {'vr': 'UI', 'Value': ['1.2.840.10008.5.1.4.1.1.2']},
 '00080018': {'vr': 'UI',
  'Value': ['1.3.6.1.4.1.14519.5.2.1.6279.6001.313544823773855097029348077255']},
 '00080020': {'vr': 'DA', 'Value': ['20000101']},
 '00080021': {'vr': 'DA', 'Value': ['20000101']},
 '00080022': {'vr': 'DA', 'Value': ['20000101']},
 '00080023': {'vr': 'DA', 'Value': ['20000101']},
 '00080024': {'vr': 'DA', 'Value': ['20000101']},
 '00080025': {'vr': 'DA', 'Value': ['20000101']},
 '0008002A': {'vr': 'DT', 'Value': ['20000101']},
 '00080030': {'vr': 'TM'},
 '00080032': {'vr': 'TM'},
 '00080033': {'vr': 'TM'},
 '00080050': {'vr': 'SH'},
 '00080060': {'vr': 'CS', 'Value': ['CT']},
 '00080070': {'vr': 'LO', 'Value': ['GE MEDICAL SYSTEMS']},
 '00080090': {'vr': 'PN'},
 '00081090': {'vr': 'LO', 'Value': ['LightSpeed Plus']},
 '00081155': {'vr': 'UI',
  'Value': ['1.3.6.1.4.1.14519.

In [74]:
# Load all dicom files in a patient scan directory
# and check if the meta keys are consistent across all files

def collect_meta_fields(patient_scan_dir: str) -> dict:
    """Gather the metadata of every ``.dcm`` file directly inside ``patient_scan_dir``.

    Files are processed in sorted path order. Each file is parsed with
    ``parse_dicom_to_dict`` and its values are appended to a per-key list.

    Args:
        patient_scan_dir: directory that contains the ``.dcm`` files of one scan.

    Returns:
        dict mapping each metadata key to the list of values seen for it, in
        file order. A key that is absent from some files gets a shorter list.
        An empty dict is returned when the directory holds no ``.dcm`` files.
    """
    collected_meta_fields = {}
    for scan_path in sorted(glob(f"{patient_scan_dir}/*.dcm")):
        with pydicom.dcmread(fp=scan_path, force=True) as dicom_image:
            # NOTE: the print statement forces the image to be read into memory. Do not remove.
            print(dicom_image)
            # Named meta_fields (not `dict`) so the builtin is not shadowed.
            meta_fields = parse_dicom_to_dict(dicom_image)

        for key, value in meta_fields.items():
            collected_meta_fields.setdefault(key, []).append(value)

    return collected_meta_fields

# Directory with the .dcm files of one scan of patient LIDC-IDRI-0001.
# NOTE(review): absolute, machine-specific path; it will not resolve on another machine.
patient_scan_dir = "/Users/newuser/Documents/ITU/master_thesis/data/lung_data/manifest-1725363397135/LIDC-IDRI/LIDC-IDRI-0001/01-01-2000-NA-NA-30178/3000566.000000-NA-03192"
collected_meta_fields = collect_meta_fields(patient_scan_dir)

# Metadata keys in insertion order; the per-key equality check in the following cell indexes into this list.
res_keys = list(collected_meta_fields.keys())

Dataset.file_meta -------------------------------
(0002, 0000) File Meta Information Group Length  UL: 206
(0002, 0001) File Meta Information Version       OB: b'\x00\x01'
(0002, 0002) Media Storage SOP Class UID         UI: CT Image Storage
(0002, 0003) Media Storage SOP Instance UID      UI: 1.3.6.1.4.1.14519.5.2.1.6279.6001.262721256650280657946440242654
(0002, 0010) Transfer Syntax UID                 UI: Explicit VR Little Endian
(0002, 0012) Implementation Class UID            UI: 1.3.6.1.4.1.22213.1.143
(0002, 0013) Implementation Version Name         SH: '0.5'
(0002, 0016) Source Application Entity Title     AE: 'POSDA'
-------------------------------------------------
(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0008) Image Type                          CS: ['ORIGINAL', 'PRIMARY', 'AXIAL']
(0008, 0016) SOP Class UID                       UI: CT Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.3.6.1.4.1.14519.5.2.1.6279.6001.26272

In [75]:
# check that all meta data for all dicom images in the scan are equal
def all_equal(lst: list) -> bool:
    """Return True when every element of ``lst`` equals the first one.

    Args:
        lst: list of values comparable with ``==``.

    Returns:
        True if all elements are equal, or if ``lst`` is empty (there is
        nothing to disagree); False otherwise.
    """
    # An empty list has no first element to compare against.
    if not lst:
        return True
    first_element = lst[0]
    # Generator instead of a list: stops at the first mismatch.
    return all(first_element == x for x in lst)

# Report, per metadata key, whether its value is identical in every file of the scan.
for i in range(len(collected_meta_fields)):
    field_name = res_keys[i]
    is_consistent = all_equal(collected_meta_fields[field_name])
    print(f"{field_name} ({i}) : {is_consistent}")

Specific Character Set CS (0) : True
Image Type CS (1) : True
SOP Class UID UI (2) : True
SOP Instance UID UI (3) : False
Study Date DA (4) : True
Series Date DA (5) : True
Acquisition Date DA (6) : True
Content Date DA (7) : True
Overlay Date DA (8) : True
Curve Date DA (9) : True
Acquisition DateTime DT (10) : True
Study Time TM (11) : True
Acquisition Time TM (12) : True
Content Time TM (13) : True
Accession Number SH (14) : True
Modality CS (15) : True
Manufacturer LO (16) : True
Referring Physician's Name PN (17) : True
Manufacturer's Model Name LO (18) : True
Referenced SOP Instance UID UI (19) : False
Patient's Name PN (20) : True
Patient ID LO (21) : True
Patient's Birth Date DA (22) : True
Patient's Sex CS (23) : True
Patient's Age AS (24) : True
Last Menstrual Date DA (25) : True
Patient Identity Removed CS (26) : True
De-identification Method LO (27) : True
Private Creator LO (28) : True
Private tag data LO (29) : True
Contrast/Bolus Agent LO (30) : True
Body Part Examined C

In [None]:
# TODO: make a .csv file with all the metadata, checking that the fields are equal across all patient scans
