## DICOM Standard harvesting of Part 3 
### Extract relationships between tags and CIDs

[Useful Online XML Viewer](https://jsonformatter.org/xml-viewer)

Links to the XML Objects
- [DICOM Part 3](https://dicom.nema.org/medical/dicom/current/source/docbook/part03/part03.xml)
- [DICOM Part 6](https://dicom.nema.org/medical/dicom/current/source/docbook/part06/part06.xml)
- [DICOM Part 16](https://dicom.nema.org/medical/dicom/current/source/docbook/part16/part16.xml)


### Extract one IOD Module Table: A.36 Enhanced MR

In [7]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Fetch XML content from the URL
url = 'https://dicom.nema.org/medical/dicom/current/source/docbook/part03/part03.xml'
response = requests.get(url)
xml_content = response.content

# Parse the XML content
soup = BeautifulSoup(xml_content, 'xml')

# Find the table with label 'A.36-1'
table = soup.find('table', {'label': 'A.36-1'})

# Find the section containing the table and extract its title
section_title = soup.find('section', {'xml:id': table.parent['xml:id']}).find('title').text.strip()

# Extract the part of the title before "IOD"
iod_index = section_title.find(" IOD")
iod = section_title[:iod_index]

# Extract table headers
headers = [th.text.strip() for th in table.find_all('th')]

# Extract table rows
rows = []
current_ie = None
for tr in table.find_all('tr')[1:]:  # Skip the header row
    cells = tr.find_all(['td', 'th'])

    # Extract cell values
    cell_values = [cell.text.strip() for cell in cells]

    # Check if the cell values are complete (some cells may be missing due to rowspan)
    if len(cell_values) < len(headers):
        # Fill in the missing values based on the previous row
        for i in range(len(headers) - len(cell_values)):
            cell_values.insert(0, current_ie)

    # Store the IE value for the next iteration
    current_ie = cell_values[0]

    # Extract the reference value if available
    reference = None
    for cell in cells:
        xref = cell.find('xref', {'xrefstyle': 'select: labelnumber'})
        if xref:
            reference = xref['linkend']
            break

    # Insert the reference value into the correct position
    if reference:
        # Find the index of the cell containing the reference
        reference_index = next((idx for idx, val in enumerate(cell_values) if val == ''), None)
        if reference_index is not None:
            cell_values[reference_index] = reference

    # Insert the title of the table as the first element
    cell_values.insert(0, iod)

    rows.append(cell_values)

# Create a DataFrame
df = pd.DataFrame(rows, columns=['IOD'] + headers)


In [81]:
df

Unnamed: 0,IOD,IE,Module,Reference,Usage
0,Enhanced MR Image,Patient,Patient,sect_C.7.1.1,M
1,Enhanced MR Image,Patient,Clinical Trial Subject,sect_C.7.1.3,U
2,Enhanced MR Image,Study,General Study,sect_C.7.2.1,M
3,Enhanced MR Image,Study,Patient Study,sect_C.7.2.2,U
4,Enhanced MR Image,Study,Clinical Trial Study,sect_C.7.2.3,U
5,Enhanced MR Image,Series,General Series,sect_C.7.3.1,M
6,Enhanced MR Image,Series,Clinical Trial Series,sect_C.7.3.2,U
7,Enhanced MR Image,Series,MR Series,sect_C.8.13.6,M
8,Enhanced MR Image,Frame of Reference,Frame of Reference,sect_C.7.4.1,M
9,Enhanced MR Image,Frame of Reference,Synchronization,sect_C.7.4.2,C - Required if time synchronization was applied.


### Extract Module Tables from all IODs

In [110]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

def extract_module_table_data(section):
    # Search for the "Module Table" within subsections
    for sub_section in section.find_all('section', recursive=True):
        module_table = sub_section.find('table', {'caption': re.compile('IOD Modules')})
        if module_table:
            # Extract data from the "Module Table"
            table_data = []
            current_ie = None
            for tr in module_table.find_all('tr')[1:]:  # Skip the header row
                cells = tr.find_all(['td', 'th'])

                # Extract cell values
                cell_values = [cell.text.strip() for cell in cells]

                # Check if the cell values are complete (some cells may be missing due to rowspan)
                if len(cell_values) < 4:
                    # Fill in the missing values based on the previous row
                    for i in range(4 - len(cell_values)):
                        cell_values.insert(0, current_ie)

                # Store the IE value for the next iteration
                current_ie = cell_values[0]

                # Extract the reference value if available
                reference = None
                for cell in cells:
                    xref = cell.find('xref', {'xrefstyle': 'select: labelnumber'})
                    if xref:
                        reference = xref['linkend']
                        break

                # Insert the reference value into the correct position
                if reference:
                    # Find the index of the cell containing the reference
                    reference_index = next((idx for idx, val in enumerate(cell_values) if val == ''), None)
                    if reference_index is not None:
                        cell_values[reference_index] = reference

                table_data.append(cell_values)

            return table_data

    return None

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Fetch XML content from the URL
url = 'https://dicom.nema.org/medical/dicom/current/source/docbook/part03/part03.xml'
response = requests.get(url)
xml_content = response.content

# Parse the XML content
soup = BeautifulSoup(xml_content, 'xml')

# List to store rows of data
rows = []

# Iterate over sections in the range A.2 to A.90
for section_label in range(2, 91):
    section_id = f"sect_A.{section_label}"
    section = soup.find('section', {'xml:id': section_id})
    if section:
        # Extract data from the "Module Table" section
        module_table_data = extract_module_table_data(section)
        if module_table_data:
            # Append the extracted data to the rows list
            rows.extend([[section.find('title').text.strip()] + row for row in module_table_data])

In [112]:
# Create a DataFrame from the list of rows
df = pd.DataFrame(rows, columns=['IOD', 'IE', 'Module', 'Reference', 'Usage'])

# Print the DataFrame
print(df)

[]

### Extract Section C

In [9]:
from bs4 import BeautifulSoup

# Fetch XML content from the URL
url = 'https://dicom.nema.org/medical/dicom/current/source/docbook/part03/part03.xml'
response = requests.get(url)
xml_content = response.content

# Parse the XML content
soup = BeautifulSoup(xml_content, 'xml')

In [90]:
# Find the mapping table between reference module from IOD table to reference table
rows = []

df_reference = df[df['Usage']=="M"]

for section_id in df_reference['Reference']:
    section_element = soup.find('section', {'xml:id': section_id})
    
    if section_element:
        tables = section_element.findChildren('table', recursive=False)
        # For each table, create a dictionary with section_id and table XML id and add it to the rows list
        for table in tables:
            if table.has_attr('xml:id'):
                row = {'section_id': section_id, 'table_xml_id': table['xml:id']}
                rows.append(row)
# Create a DataFrame from the rows list
table_df = pd.DataFrame(rows)
table_df

Unnamed: 0,section_id,table_xml_id
0,sect_C.7.1.1,table_C.7-1
1,sect_C.7.2.1,table_C.7-3
2,sect_C.7.3.1,table_C.7-5a
3,sect_C.8.13.6,table_C.8-101
4,sect_C.7.4.1,table_C.7-6
5,sect_C.7.5.1,table_C.7-8
6,sect_C.7.5.2,table_C.7-8b
7,sect_C.7.6.3,table_C.7-11a
8,sect_C.7.6.16,table_C.7.6.16-1
9,sect_C.7.6.17,table_C.7.6.17-1


In [78]:
import pandas as pd
from bs4 import BeautifulSoup

# Find the table
table = soup.find('table', {'xml:id': 'table_C.7-1'})
table_xml_id = table.parent['xml:id']

# Initialize lists to store extracted data
attribute_names = []
tags = []
types = []
attribute_descriptions = []
cid = []

# Extract data from the table rows
for row in table.find_all('tr'):
    columns = row.find_all('td')
    if len(columns) == 4:
        # Extract attribute name, tag, type, and description
        attribute_name = columns[0].text.strip()
        
        # Skip rows with "attribute name" starting with ">"
        if attribute_name.startswith('>'):
            continue
        # Extract attribute name, tag, type, and description
        tag = columns[1].text.strip()
        type_ = columns[2].text.strip()
        description = columns[3].find_all('variablelist')

        # Extract defined terms from each variablelist
        defined_terms_dict = {}
        if description:
            for variablelist in description:
                title = variablelist.title.text.strip()
                defined_terms = [term.text.strip() for term in variablelist.find_all('term')]
                defined_terms_dict[title] = defined_terms
            
        # Append the extracted data to the lists
        attribute_names.append(attribute_name)
        tags.append(tag)
        types.append(type_)
        attribute_descriptions.append(defined_terms_dict)
        cid.append('')

    elif len(columns) == 2:
        # Insert the value in the second column into the previous row's Attribute Description
        olink = columns[1].find('olink')
        if olink:
            targetptr = olink.get('targetptr', '')
            if 'CID_' in targetptr:
                cid_value = targetptr.split('CID_')[-1]
                cid[-1] += cid_value

# Create a DataFrame from the extracted data
data = {
    'section_id': table_xml_id,
    'Attribute Name': attribute_names,
    'Tag': tags,
    'Type': types,
    'Attribute Description': attribute_descriptions,
    'CID': cid
}
df_test = pd.DataFrame(data)



In [79]:
df_test

Unnamed: 0,section_id,Attribute Name,Tag,Type,Attribute Description,CID
0,sect_C.7.1.1,Patient's Name,"(0010,0010)",2,{},
1,sect_C.7.1.1,Patient ID,"(0010,0020)",2,{},
2,sect_C.7.1.1,Type of Patient ID,"(0010,0022)",3,"{'Defined Terms:': ['TEXT', 'RFID', 'BARCODE']}",
3,sect_C.7.1.1,Patient's Birth Date,"(0010,0030)",2,{},
4,sect_C.7.1.1,Patient's Birth Date in Alternative Calendar,"(0010,0033)",3,{},
5,sect_C.7.1.1,Patient's Death Date in Alternative Calendar,"(0010,0034)",3,{},
6,sect_C.7.1.1,Patient's Alternative Calendar,"(0010,0035)",1C,{},
7,sect_C.7.1.1,Patient's Sex,"(0010,0040)",2,"{'Enumerated Values:': ['M', 'F', 'O']}",
8,sect_C.7.1.1,Referenced Patient Photo Sequence,"(0010,1100)",3,{},
9,sect_C.7.1.1,Quality Control Subject,"(0010,0200)",3,"{'Enumerated Values:': ['YES', 'NO']}",


In [97]:
import pandas as pd
from bs4 import BeautifulSoup

def process_table(soup, table_xml_id):
    # Find the table using table_xml_id
    table = soup.find('table', {'xml:id': table_xml_id})
    table_xml_id = table.parent['xml:id']

    # Initialize lists to store extracted data
    attribute_names = []
    tags = []
    types = []
    attribute_descriptions = []
    cid = []

    # Extract data from the table rows
    for row in table.find_all('tr'):
        columns = row.find_all('td')
        if len(columns) == 4:
            attribute_name = columns[0].text.strip()
            if attribute_name.startswith('>'):
                continue
            tag = columns[1].text.strip()
            type_ = columns[2].text.strip()
            description = columns[3].find_all('variablelist')

            defined_terms_dict = {}
            if description:
                for variablelist in description:
                    title = variablelist.title.text.strip()
                    defined_terms = [term.text.strip() for term in variablelist.find_all('term')]
                    defined_terms_dict[title] = defined_terms
            
            attribute_names.append(attribute_name)
            tags.append(tag)
            types.append(type_)
            attribute_descriptions.append(defined_terms_dict)
            cid.append('')

        elif len(columns) == 2:
            first_col = columns[0].text.strip()
            if first_col.startswith('>>'):
                continue
            olink = columns[1].find('olink')
            if olink:
                targetptr = olink.get('targetptr', '')
                if 'CID_' in targetptr:
                    cid_value = targetptr.split('CID_')[-1]
                    cid[-1] += cid_value

    # Create a DataFrame from the extracted data
    data = {
        'section_id': table_xml_id,
        'Attribute Name': attribute_names,
        'Tag': tags,
        'Type': types,
        'Attribute Description': attribute_descriptions,
        'CID': cid
    }
    return pd.DataFrame(data)

In [98]:
reference_table_data = []
for table_xml_id in table_df['table_xml_id']:
    df = process_table(soup, table_xml_id)
    reference_table_data.append(df)

# combine all DataFrames into one
reference_tables = pd.concat(reference_table_data, ignore_index=True)

In [113]:
# subset of tags based on type
reference_tables[reference_tables['Type']=='1']

Unnamed: 0,section_id,Attribute Name,Tag,Type,Attribute Description,CID
33,sect_C.7.2.1,Study Instance UID,"(0020,000D)",1,{},
53,sect_C.7.3.1,Modality,"(0008,0060)",1,{},
54,sect_C.7.3.1,Series Instance UID,"(0020,000E)",1,{},
74,sect_C.8.13.6,Modality,"(0008,0060)",1,{'Enumerated Values:': ['MR']},
76,sect_C.7.4.1,Frame of Reference UID,"(0020,0052)",1,{},
97,sect_C.7.5.2,Manufacturer,"(0008,0070)",1,{},
98,sect_C.7.5.2,Manufacturer's Model Name,"(0008,1090)",1,{},
99,sect_C.7.5.2,Device Serial Number,"(0018,1000)",1,{},
100,sect_C.7.5.2,Software Versions,"(0018,1020)",1,{},
106,sect_C.7.6.16,Shared Functional Groups Sequence,"(5200,9229)",1,{},


In [114]:
# subset of tags with attribute description
reference_tables[reference_tables['Attribute Description']!= {}]

Unnamed: 0,section_id,Attribute Name,Tag,Type,Attribute Description,CID
2,sect_C.7.1.1,Type of Patient ID,"(0010,0022)",3,"{'Defined Terms:': ['TEXT', 'RFID', 'BARCODE']}",
7,sect_C.7.1.1,Patient's Sex,"(0010,0040)",2,"{'Enumerated Values:': ['M', 'F', 'O']}",
9,sect_C.7.1.1,Quality Control Subject,"(0010,0200)",3,"{'Enumerated Values:': ['YES', 'NO']}",
30,sect_C.7.1.1,Patient Identity Removed,"(0012,0062)",3,"{'Enumerated Values:': ['YES', 'NO']}",
55,sect_C.7.3.1,Laterality,"(0020,0060)",2C,"{'Enumerated Values:': ['R', 'L']}",
72,sect_C.7.3.1,Anatomical Orientation Type,"(0010,2210)",1C,"{'Enumerated Values:': ['BIPED', 'QUADRUPED']}",
74,sect_C.8.13.6,Modality,"(0008,0060)",1,{'Enumerated Values:': ['MR']},
112,sect_C.7.6.16,Stereo Pairs Present,"(0022,0028)",3,"{'Enumerated Values:': ['YES', 'NO']}",
121,sect_C.7.6.17,Dimension Organization Type,"(0020,9311)",3,"{'Defined Terms:': ['3D', '3D_TEMPORAL', 'TILE...",
133,sect_C.8.13.1,Burned In Annotation,"(0028,0301)",1C,{'Enumerated Values:': ['NO']},


In [115]:
reference_tables[reference_tables['CID']!='']

Unnamed: 0,section_id,Attribute Name,Tag,Type,Attribute Description,CID
17,sect_C.7.1.1,Patient Species Code Sequence,"(0010,2202)",1C,{},7454
19,sect_C.7.1.1,Patient Breed Code Sequence,"(0010,2293)",2C,{},7480
32,sect_C.7.1.1,De-identification Method Code Sequence,"(0012,0064)",1C,{},7050
49,sect_C.7.2.1,Requesting Service Code Sequence,"(0032,1034)",3,{},7030
51,sect_C.7.2.1,Procedure Code Sequence,"(0008,1032)",3,{},101
83,sect_C.7.5.1,Institutional Department Type Code Sequence,"(0008,1041)",3,{},7030
