## DICOM Standard harvesting of Part 3 
### Extract relationships between tags and CIDs

[Useful Online XML Viewer](https://jsonformatter.org/xml-viewer)

Links to the XML Objects
- [DICOM Part 3](https://dicom.nema.org/medical/dicom/current/source/docbook/part03/part03.xml)
- [DICOM Part 6](https://dicom.nema.org/medical/dicom/current/source/docbook/part06/part06.xml)
- [DICOM Part 16](https://dicom.nema.org/medical/dicom/current/source/docbook/part16/part16.xml)


### Extract one IOD Module Table: A.36 Enhanced MR

In [6]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Fetch XML content from the URL
url = 'https://dicom.nema.org/medical/dicom/current/source/docbook/part03/part03.xml'
response = requests.get(url)
xml_content = response.content

# Parse the XML content
soup = BeautifulSoup(xml_content, 'xml')

# Find the table with label 'A.36-1'
table = soup.find('table', {'label': 'A.36-1'})

# Find the section containing the table and extract its title
section_title = soup.find('section', {'xml:id': table.parent['xml:id']}).find('title').text.strip()

# Extract the part of the title before "IOD"
iod_index = section_title.find(" IOD")
iod = section_title[:iod_index]

# Extract table headers
headers = [th.text.strip() for th in table.find_all('th')]

# Extract table rows
rows = []
current_ie = None
for tr in table.find_all('tr')[1:]:  # Skip the header row
    cells = tr.find_all(['td', 'th'])

    # Extract cell values
    cell_values = [cell.text.strip() for cell in cells]

    # Check if the cell values are complete (some cells may be missing due to rowspan)
    if len(cell_values) < len(headers):
        # Fill in the missing values based on the previous row
        for i in range(len(headers) - len(cell_values)):
            cell_values.insert(0, current_ie)

    # Store the IE value for the next iteration
    current_ie = cell_values[0]

    # Extract the reference value if available
    reference = None
    for cell in cells:
        xref = cell.find('xref', {'xrefstyle': 'select: labelnumber'})
        if xref:
            reference = xref['linkend']
            break

    # Insert the reference value into the correct position
    if reference:
        # Find the index of the cell containing the reference
        reference_index = next((idx for idx, val in enumerate(cell_values) if val == ''), None)
        if reference_index is not None:
            cell_values[reference_index] = reference

    # Insert the title of the table as the first element
    cell_values.insert(0, iod)

    rows.append(cell_values)

# Create a DataFrame
df = pd.DataFrame(rows, columns=['IOD'] + headers)


In [7]:
df

Unnamed: 0,IOD,IE,Module,Reference,Usage
0,Enhanced MR Image,Patient,Patient,sect_C.7.1.1,M
1,Enhanced MR Image,Patient,Clinical Trial Subject,sect_C.7.1.3,U
2,Enhanced MR Image,Study,General Study,sect_C.7.2.1,M
3,Enhanced MR Image,Study,Patient Study,sect_C.7.2.2,U
4,Enhanced MR Image,Study,Clinical Trial Study,sect_C.7.2.3,U
5,Enhanced MR Image,Series,General Series,sect_C.7.3.1,M
6,Enhanced MR Image,Series,Clinical Trial Series,sect_C.7.3.2,U
7,Enhanced MR Image,Series,MR Series,sect_C.8.13.6,M
8,Enhanced MR Image,Frame of Reference,Frame of Reference,sect_C.7.4.1,M
9,Enhanced MR Image,Frame of Reference,Synchronization,sect_C.7.4.2,C - Required if time synchronization was applied.


### Extract Module Tables from all IODs

In [110]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

def extract_module_table_data(section):
    # Search for the "Module Table" within subsections
    for sub_section in section.find_all('section', recursive=True):
        module_table = sub_section.find('table', {'caption': re.compile('IOD Modules')})
        if module_table:
            # Extract data from the "Module Table"
            table_data = []
            current_ie = None
            for tr in module_table.find_all('tr')[1:]:  # Skip the header row
                cells = tr.find_all(['td', 'th'])

                # Extract cell values
                cell_values = [cell.text.strip() for cell in cells]

                # Check if the cell values are complete (some cells may be missing due to rowspan)
                if len(cell_values) < 4:
                    # Fill in the missing values based on the previous row
                    for i in range(4 - len(cell_values)):
                        cell_values.insert(0, current_ie)

                # Store the IE value for the next iteration
                current_ie = cell_values[0]

                # Extract the reference value if available
                reference = None
                for cell in cells:
                    xref = cell.find('xref', {'xrefstyle': 'select: labelnumber'})
                    if xref:
                        reference = xref['linkend']
                        break

                # Insert the reference value into the correct position
                if reference:
                    # Find the index of the cell containing the reference
                    reference_index = next((idx for idx, val in enumerate(cell_values) if val == ''), None)
                    if reference_index is not None:
                        cell_values[reference_index] = reference

                table_data.append(cell_values)

            return table_data

    return None

In [111]:
# Fetch XML content from the URL
url = 'https://dicom.nema.org/medical/dicom/current/source/docbook/part03/part03.xml'
response = requests.get(url)
xml_content = response.content

# Parse the XML content
soup = BeautifulSoup(xml_content, 'xml')

# List to store rows of data
rows = []

# Iterate over sections in the range A.2 to A.90
for section_label in range(2, 91):
    section_id = f"sect_A.{section_label}"
    section = soup.find('section', {'xml:id': section_id})
    if section:
        # Extract data from the "Module Table" section
        module_table_data = extract_module_table_data(section)
        if module_table_data:
            # Append the extracted data to the rows list
            rows.extend([[section.find('title').text.strip()] + row for row in module_table_data])

In [112]:
# Create a DataFrame from the list of rows
df = pd.DataFrame(rows, columns=['IOD', 'IE', 'Module', 'Reference', 'Usage'])

# Print the DataFrame
print(df)

[]

### Extract Section C

In [137]:
# Fetch XML content from the URL
url = 'https://dicom.nema.org/medical/dicom/current/source/docbook/part03/part03.xml'
response = requests.get(url)
xml_content = response.content

# Parse the XML content
soup = BeautifulSoup(xml_content, 'xml')

In [218]:
import pandas as pd
from bs4 import BeautifulSoup
import re

# Find the table
table = soup.find('table', {'xml:id': 'table_C.7-1'})

# Initialize lists to store extracted data
attribute_names = []
tags = []
types = []
attribute_descriptions = []

# Extract data from the table rows
for row in table.find_all('tr'):
    columns = row.find_all('td')
    if len(columns) == 4:
        # Extract attribute name, tag, type, and description
        attribute_name = columns[0].text.strip()
        
        # Skip rows with "attribute name" starting with ">"
        if attribute_name.startswith('>'):
            continue
        # Extract attribute name, tag, type, and description
        tag = columns[1].text.strip()
        type_ = columns[2].text.strip()
        description = columns[3].text.strip()
        
        # Find the <variablelist> elements with titles "Defined Terms" or "Enumerated Values"
        variablelists = re.findall(r'<variablelist[^>]*?title=["\'](Defined Terms|Enumerated Values)["\'][^>]*?>(.*?)<\/variablelist>', str(description), flags=re.DOTALL)

        # Extract defined terms from each variablelist
        defined_terms_dict = {}
        if variablelists:
            for variablelist in variablelists:
                title = variablelist.title.text.strip()
                defined_terms = [entry.term.text.strip() for entry in variablelist.find_all('varlistentry')]
                defined_terms_dict[title] = defined_terms
            
        # Append the extracted data to the lists
        attribute_names.append(attribute_name)
        tags.append(tag)
        types.append(type_)
        attribute_descriptions.append(description)

    # elif len(columns) == 2:
    #     # Insert the value in the second column into the previous row's Attribute Description
    #     attribute_descriptions[-1] += ' ' + columns[1].text.strip()

# Create a DataFrame from the extracted data
data = {
    'Attribute Name': attribute_names,
    'Tag': tags,
    'Type': types,
    'Attribute Description': attribute_descriptions
}
df_test = pd.DataFrame(data)



In [219]:
df_test

Unnamed: 0,Attribute Name,Tag,Type,Attribute Description
0,Patient's Name,"(0010,0010)",2,Patient's full name.
1,Patient ID,"(0010,0020)",2,Primary identifier for the Patient.\n\nIn the ...
2,Type of Patient ID,"(0010,0022)",3,The type of identifier in the Patient ID (0010...
3,Patient's Birth Date,"(0010,0030)",2,Birth date of the Patient.
4,Patient's Birth Date in Alternative Calendar,"(0010,0033)",3,Date of birth of the named Patient in the Alte...
5,Patient's Death Date in Alternative Calendar,"(0010,0034)",3,Date of death of the named Patient in the Alte...
6,Patient's Alternative Calendar,"(0010,0035)",1C,The Alternative Calendar used for Patient's Bi...
7,Patient's Sex,"(0010,0040)",2,Sex of the named Patient.\n\nEnumerated Values...
8,Referenced Patient Photo Sequence,"(0010,1100)",3,A photo to confirm the identity of a Patient.\...
9,Quality Control Subject,"(0010,0200)",3,Indicates whether or not the subject is a qual...


In [220]:
df_test["Attribute Description"][17]

'The taxonomic rank value (e.g., genus, subgenus, species or subspecies) of the Patient. See .\nOnly a single Item shall be included in this Sequence.\nRequired if the Patient is a non-human organism and if Patient Species Description (0010,2201) is not present. May be present otherwise. D.'

In [209]:
import requests
import xml.etree.ElementTree as ET

# URI for DICOM Standard Part 3
xml_uri = 'https://dicom.nema.org/medical/dicom/current/source/docbook/part03/part03.xml'

# Parse the XML content
response = requests.get(xml_uri)
root = ET.fromstring(response.content)

In [212]:
# Find the child node with label="6"
for child in root:
    if child.attrib.get('label') == 'C':
        selected_node = child
        break

In [210]:
import pandas as pd
import xml.etree.ElementTree as ET
import re

# Find the table with the specified xml:id
table = root.find(".//*[@xml:id='table_C.7-1']")

# Initialize lists to store extracted data
attribute_names = []
tags = []
types = []
attribute_descriptions = []

# Extract data from the table rows
for row in table.findall(".//tr"):
    columns = row.findall("./td")
    if len(columns) == 4:
        # Extract attribute name, tag, type, and description
        attribute_name = columns[0].text.strip()
        
        # Skip rows with "attribute name" starting with ">"
        if attribute_name.startswith('>'):
            continue
        
        # Extract attribute name, tag, type, and description
        tag = columns[1].text.strip()
        type_ = columns[2].text.strip()
        description = columns[3].text.strip()
        
        # Search for variablelist elements within the description
        variablelists = re.findall(r'<variablelist[^>]*?title=["\'](Defined Terms|Enumerated Values)["\'][^>]*?>(.*?)<\/variablelist>', description, flags=re.DOTALL)

        # Extract defined terms from each variablelist
        defined_terms_dict = {}
        if variablelists:
            for title, content in variablelists:
                defined_terms = [entry.strip() for entry in re.findall(r'<term>(.*?)<\/term>', content)]
                defined_terms_dict[title] = defined_terms
            
        # Append the extracted data to the lists
        attribute_names.append(attribute_name)
        tags.append(tag)
        types.append(type_)
        attribute_descriptions.append(defined_terms_dict)

# Create a DataFrame from the extracted data
data = {
    'Attribute Name': attribute_names,
    'Tag': tags,
    'Type': types,
    'Attribute Description': attribute_descriptions
}
df_test = pd.DataFrame(data)


SyntaxError: prefix 'xml' not found in prefix map (<string>)