## DICOM Standard Part 6

In [1]:
import xml.etree.ElementTree as ET
from pathlib import Path

import requests

# URI for DICOM Standard Part 6 (DocBook XML source)
xml_uri = 'https://dicom.nema.org/medical/dicom/current/source/docbook/part06/part06.xml'

# Download the document. The timeout keeps a stalled connection from hanging
# the kernel forever, and raise_for_status() stops the notebook on an HTTP
# error instead of handing an error page to the XML parser below.
response = requests.get(xml_uri, timeout=60)
response.raise_for_status()

# Parse the XML content
root = ET.fromstring(response.content)



In [2]:
# Find the first direct child of the root with label="6".
# Defaults to None when no child matches, so the name is always bound and a
# later `selected_node is not None` check reports the problem instead of
# raising NameError (or silently reusing a value from an earlier run).
selected_node = next(
    (child for child in root if child.attrib.get('label') == '6'),
    None,
)

In [3]:
import pandas as pd
from collections import defaultdict

# Get table 1 from Chapter 6
DOCBOOK_NS = '{http://docbook.org/ns/docbook}'


def _cell_text(para):
    """Return the stripped text of one <para> cell, or None when it has no text.

    Text of a nested <emphasis> element takes precedence over the paragraph's
    own text.
    """
    emphasis = para.find('.//' + DOCBOOK_NS + 'emphasis')
    if emphasis is not None and emphasis.text is not None:
        return emphasis.text.strip()
    if para.text is not None:
        return para.text.strip()
    return None


def _parse_row(tr, column_names):
    """Map the <para> cells of one <tr> onto ``column_names`` by position.

    Only cells that carry text are stored. An empty cell still consumes its
    column, so later cells are not shifted left into the wrong column. Cells
    beyond the last named column are ignored.
    """
    row_values = {}
    paras = tr.findall('.//' + DOCBOOK_NS + 'para')
    # zip() stops at the shorter sequence, so this is also safe when no
    # column names were found.
    for column, para in zip(column_names, paras):
        text = _cell_text(para)
        if text is not None:
            row_values[column] = text
    return row_values


# Bind every result name up front so a failed lookup leaves an empty table
# rather than an undefined (or stale, from a previous run) variable.
grandchild_table = thead = tbody = None
column_names = []
rows_data = []

if selected_node is None:
    print("Node with label='6' not found.")
else:
    grandchild_table = selected_node.find('.//' + DOCBOOK_NS + 'table')
    if grandchild_table is None:
        print("No table found under the node with label='6'.")
    else:
        thead = grandchild_table.find('.//' + DOCBOOK_NS + 'thead')
        tbody = grandchild_table.find('.//' + DOCBOOK_NS + 'tbody')
        if thead is None or tbody is None:
            print("Table under the node with label='6' has no thead/tbody.")
        else:
            # Store column names: the header labels are the <emphasis>
            # elements inside the header rows.
            column_names = [
                (name.text or '').strip()
                for tr in thead.findall('.//' + DOCBOOK_NS + 'tr')
                for name in tr.findall('.//' + DOCBOOK_NS + 'emphasis')
            ]

            # Store row values; keep a row only if it has more than 3 values.
            for tr in tbody.findall('.//' + DOCBOOK_NS + 'tr'):
                row_values = _parse_row(tr, column_names)
                if len(row_values) > 3:
                    rows_data.append(row_values)

# Save the output as a DataFrame (empty when the table could not be parsed)
df = pd.DataFrame(rows_data, columns=column_names)


In [4]:
# Preview the first five parsed rows (head() defaults to n=5)
df.head()

Unnamed: 0,Tag,Name,Keyword,VR,VM
0,"(0008,0001)",Length to End,Length​To​End,UL,1
1,"(0008,0005)",Specific Character Set,Specific​Character​Set,CS,1-n
2,"(0008,0006)",Language Code Sequence,Language​Code​Sequence,SQ,1
3,"(0008,0008)",Image Type,Image​Type,CS,2-n
4,"(0008,0010)",Recognition Code,Recognition​Code,SH,1


In [5]:
# VR codes to tally. Per the original note: CS is values; others are numbers.
included_VR = [
    'AT', 'CS', 'DA', 'DT', 'DS',
    'FL', 'FD', 'IS', 'SL', 'SS',
    'SV', 'TM', 'UL', 'US', 'UV',
]

# Count how many attributes use each of the selected VRs
vr_is_included = df['VR'].isin(included_VR)
df.loc[vr_is_included, 'VR'].value_counts()

VR
CS    799
DS    483
US    379
FD    309
FL    292
IS    239
UL     79
DT     64
DA     61
TM     55
AT     24
SS     18
SL     15
UV      6
SV      1
Name: count, dtype: int64

In [16]:
# Export every parsed attribute. The output folder is created first so the
# export does not fail when ./files does not exist yet.
Path('./files').mkdir(parents=True, exist_ok=True)
df.to_csv('./files/part6_attributes.csv', index=False)

In [8]:
# Output the attributes whose value representation (VR) is CS.
# The output folder is created first so the export does not fail when
# ./files does not exist yet.
cs_attributes = df[df['VR'] == "CS"]
Path('./files').mkdir(parents=True, exist_ok=True)
cs_attributes.to_csv("./files/part6_CS_attributes.csv", index=False)