# DICOM Standard Part 6

## Option 1. Extract HTML

In [None]:
import xml.etree.ElementTree as ET
import requests

# URI of the DocBook XML source for DICOM Part 6.
# The file lives in the part06/ sub-directory -- the same address the other
# cells of this notebook use.
xml_uri = 'https://dicom.nema.org/medical/dicom/current/source/docbook/part06/part06.xml'

# Fetch the XML data from the URI; the timeout keeps the cell from hanging
# forever on a stalled connection.
response = requests.get(xml_uri, timeout=60)
if response.status_code == 200:
    # Parse the XML content
    root = ET.fromstring(response.content)

    # Sanity check: show the tag of the document's root element
    print(root.tag)
else:
    print(f"Failed to retrieve XML, status code: {response.status_code}")


In [29]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the document containing the table
url = 'https://dicom.nema.org/medical/dicom/current/source/docbook/part06/part06.xml'

# Fetch the document; the timeout keeps the cell from hanging on a stalled connection
response = requests.get(url, timeout=60)
if response.status_code == 200:
    # NOTE(review): the document is XML but is parsed with the HTML parser, which
    # appears to be what triggers the parser warning in this cell's output.
    # Kept as-is because switching parsers would need an extra dependency -- confirm.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Locate the first table carrying frame="box" and rules="all" attributes
    table = soup.find('table', {'frame': 'box', 'rules': 'all'})
    thead = table.find('thead') if table is not None else None
    tbody = table.find('tbody') if table is not None else None

    if thead is None or tbody is None:
        # soup.find()/table.find() return None on a miss; report it instead of
        # crashing with an AttributeError further down.
        print("Expected table with <thead> and <tbody> was not found in the document.")
    else:
        # Extract table headers
        headers = [th.get_text(strip=True) for th in thead.find_all('th')]

        # Extract table rows: one list of cell texts per <tr>
        rows_data = [
            [cell.get_text(strip=True) for cell in row.find_all('td')]
            for row in tbody.find_all('tr')
        ]

        # Convert the data into a pandas DataFrame
        df = pd.DataFrame(rows_data, columns=headers)
else:
    print(f"Failed to retrieve page, status code: {response.status_code}")


  k = self.parse_starttag(i)


              Tag                         Name                      Keyword  \
0     (0008,0001)                Length to End                Length​To​End   
1     (0008,0005)       Specific Character Set       Specific​Character​Set   
2     (0008,0006)       Language Code Sequence       Language​Code​Sequence   
3     (0008,0008)                   Image Type                   Image​Type   
4     (0008,0010)             Recognition Code             Recognition​Code   
...           ...                          ...                          ...   
5185  (FFFA,FFFA)  Digital Signatures Sequence  Digital​Signatures​Sequence   
5186  (FFFC,FFFC)    Data Set Trailing Padding    Data​Set​Trailing​Padding   
5187  (FFFE,E000)                         Item                         Item   
5188  (FFFE,E00D)       Item Delimitation Item       Item​Delimitation​Item   
5189  (FFFE,E0DD)   Sequence Delimitation Item   Sequence​Delimitation​Item   

            VR   VM       
0           UL    1  RET

In [30]:
# Preview the first ten rows of the extracted table
df.head(10)

Unnamed: 0,Tag,Name,Keyword,VR,VM,Unnamed: 6
0,"(0008,0001)",Length to End,Length​To​End,UL,1,RET
1,"(0008,0005)",Specific Character Set,Specific​Character​Set,CS,1-n,
2,"(0008,0006)",Language Code Sequence,Language​Code​Sequence,SQ,1,
3,"(0008,0008)",Image Type,Image​Type,CS,2-n,
4,"(0008,0010)",Recognition Code,Recognition​Code,SH,1,RET
5,"(0008,0012)",Instance Creation Date,Instance​Creation​Date,DA,1,
6,"(0008,0013)",Instance Creation Time,Instance​Creation​Time,TM,1,
7,"(0008,0014)",Instance Creator UID,Instance​Creator​UID,UI,1,
8,"(0008,0015)",Instance Coercion DateTime,Instance​Coercion​Date​Time,DT,1,
9,"(0008,0016)",SOP Class UID,SOP​Class​UID,UI,1,


## Option 2. Extract XML file

In [None]:
import requests
import xml.etree.ElementTree as ET

# URI for DICOM Standard Part 6
xml_uri = 'https://dicom.nema.org/medical/dicom/current/source/docbook/part06/part06.xml'

# Fetch the document. Fail loudly on an HTTP error rather than trying to parse
# an error page, and use a timeout so a stalled connection cannot hang the cell.
response = requests.get(xml_uri, timeout=60)
response.raise_for_status()

# Parse the XML content
root = ET.fromstring(response.content)

# Find the first child node with label="6". Defaults to None when there is no
# such child, so the next cell's `selected_node is not None` check works
# instead of raising a NameError.
selected_node = next(
    (child for child in root if child.attrib.get('label') == '6'),
    None,
)

In [None]:
import pandas as pd
from collections import defaultdict
    
# Get table 1 from Chapter 6
DOCBOOK_NS = '{http://docbook.org/ns/docbook}'


def _para_text(para):
    """Return the stripped text of a <para>.

    The text of the first <emphasis> descendant is preferred; when there is no
    such element (or it has no text) the para's own leading text is used.
    Returns None when neither carries text.
    """
    emphasis = para.find(f'.//{DOCBOOK_NS}emphasis')
    if emphasis is not None and emphasis.text is not None:
        return emphasis.text.strip()
    if para.text is not None:
        return para.text.strip()
    return None


def _cell_text(cell):
    """Return the text of the first <para> in a table cell that has any, else None."""
    for para in cell.findall(f'.//{DOCBOOK_NS}para'):
        text = _para_text(para)
        if text is not None:
            return text
    return None


if selected_node is not None:
    # First <table> anywhere below the selected node
    grandchild_table = selected_node.find(f'.//{DOCBOOK_NS}table')
    thead = grandchild_table.find(f'.//{DOCBOOK_NS}thead') if grandchild_table is not None else None
    tbody = grandchild_table.find(f'.//{DOCBOOK_NS}tbody') if grandchild_table is not None else None

    if grandchild_table is None:
        print("No table found under the node with label='6'.")
    elif thead is None or tbody is None:
        print("The table under the node with label='6' has no <thead> or <tbody>.")
    else:
        # Store column names: the text of every <emphasis> in the header rows.
        # Empty <emphasis> elements are skipped instead of crashing on None.strip().
        column_names = [
            name.text.strip()
            for tr in thead.findall(f'.//{DOCBOOK_NS}tr')
            for name in tr.findall(f'.//{DOCBOOK_NS}emphasis')
            if name.text is not None
        ]

        # Loop through tbody to extract values
        cell_tags = (f'{DOCBOOK_NS}td', f'{DOCBOOK_NS}th')
        rows_data = []
        for tr in tbody.findall(f'.//{DOCBOOK_NS}tr'):
            cells = [cell for cell in tr if cell.tag in cell_tags]

            # Pair each cell with its column by position. An empty cell leaves
            # its own column blank and no longer shifts the following values
            # one column to the left. zip() also stops at the last named
            # column, so cells beyond the header names are ignored.
            row_values = {}
            for column, cell in zip(column_names, cells):
                text = _cell_text(cell)
                if text is not None:
                    row_values[column] = text

            # Append to the table only if it has more than 3 values
            if len(row_values) > 3:
                rows_data.append(row_values)

        # Save the output as a DataFrame
        df = pd.DataFrame(rows_data, columns=column_names)
else:
    print("Node with label='6' not found.")


## Explore the extracted table

In [4]:
# Preview the first five rows of the extracted table
df.head(5)

Unnamed: 0,Tag,Name,Keyword,VR,VM
0,"(0008,0001)",Length to End,Length​To​End,UL,1
1,"(0008,0005)",Specific Character Set,Specific​Character​Set,CS,1-n
2,"(0008,0006)",Language Code Sequence,Language​Code​Sequence,SQ,1
3,"(0008,0008)",Image Type,Image​Type,CS,2-n
4,"(0008,0010)",Recognition Code,Recognition​Code,SH,1


In [31]:
# Table dimensions as (rows, columns).
# Earlier note: the shape used to be (5085, 5); the row count has since increased to 5190.
df.shape

(5190, 6)

In [39]:
# VR codes of interest, followed by a per-code count of the rows that use them
included_VR = [
    'AT', 'CS', 'DA', 'DT', 'DS',
    'FL', 'FD', 'IS', 'SL', 'SS',
    'SV', 'TM', 'UL', 'US', 'UV',
]
df.loc[df['VR'].isin(included_VR), 'VR'].value_counts()

VR
CS    820
DS    526
US    379
FD    310
FL    293
IS    239
UL     79
DT     64
DA     61
TM     55
AT     24
SS     18
SL     15
UV      7
SV      1
Name: count, dtype: int64

In [40]:
# Dimensions of the subset of rows whose VR is one of the codes in included_VR
df.loc[df['VR'].isin(included_VR)].shape

(2891, 7)

In [33]:
# Add a punctuation-free copy of the tag: parentheses and the comma are
# removed, e.g. "(0008,0001)" -> "00080001". The values stay strings.
df['Tag_cleaned'] = df['Tag'].str.replace(r'[(),]', '', regex = True)
# Show the first rows to check the new column
df.head()

Unnamed: 0,Tag,Name,Keyword,VR,VM,Unnamed: 6,Tag_cleaned
0,"(0008,0001)",Length to End,Length​To​End,UL,1,RET,80001
1,"(0008,0005)",Specific Character Set,Specific​Character​Set,CS,1-n,,80005
2,"(0008,0006)",Language Code Sequence,Language​Code​Sequence,SQ,1,,80006
3,"(0008,0008)",Image Type,Image​Type,CS,2-n,,80008
4,"(0008,0010)",Recognition Code,Recognition​Code,SH,1,RET,80010


In [42]:
# Look up the row(s) whose cleaned tag is exactly "00080008"
df.loc[df['Tag_cleaned'].eq("00080008")]

Unnamed: 0,Tag,Name,Keyword,VR,VM,Unnamed: 6,Tag_cleaned
3,"(0008,0008)",Image Type,Image​Type,CS,2-n,,80008


In [43]:
# Number of distinct values in the VR column
df['VR'].nunique()

40

In [37]:
# Persist the attribute table as CSV. The target folder is created first so
# the write does not fail when ./files/DICOM Standard/ does not exist yet.
from pathlib import Path

output_path = Path('./files/DICOM Standard/part6_attributes.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)