In [None]:
import xml.etree.ElementTree as ET



In [2]:
# Load the XML file
tree = ET.parse('uniprot.xml')
root = tree.getroot()

In [3]:
root

<Element '{http://www.w3.org/2001/XMLSchema}schema' at 0x00000151CCEB3650>

In [4]:

# Define the XML namespace
namespace = {'xs': "http://www.w3.org/2001/XMLSchema"}

# Extract all elements defined in the schema
elements = root.findall(".//xs:element", namespace)

# Store results
element_info = []

# Loop through the elements
for elem in elements:
    name = elem.get("name")  # Element name
    elem_type = elem.get("type")  # Element type (if available)
    
    # Extract documentation (if present)
    doc = elem.find("./xs:annotation/xs:documentation", namespace)
    documentation = doc.text.strip() if doc is not None else "No documentation"

    # Append the result
    element_info.append({
        "name": name,
        "type": elem_type,
        "documentation": documentation
    })


In [5]:
# Print results
for info in element_info:
    print(f"Element Name: {info['name']}")
    print(f"Type: {info['type']}")
    print(f"Documentation: {info['documentation']}")
    print("---")


Element Name: uniprot
Type: None
Documentation: Describes a collection of UniProtKB entries.
---
Element Name: None
Type: None
Documentation: No documentation
---
Element Name: None
Type: None
Documentation: No documentation
---
Element Name: entry
Type: None
Documentation: Describes a UniProtKB entry.
---
Element Name: accession
Type: xs:string
Documentation: No documentation
---
Element Name: name
Type: xs:string
Documentation: No documentation
---
Element Name: protein
Type: proteinType
Documentation: No documentation
---
Element Name: gene
Type: geneType
Documentation: No documentation
---
Element Name: organism
Type: organismType
Documentation: No documentation
---
Element Name: organismHost
Type: organismType
Documentation: No documentation
---
Element Name: geneLocation
Type: geneLocationType
Documentation: No documentation
---
Element Name: reference
Type: referenceType
Documentation: No documentation
---
Element Name: comment
Type: commentType
Documentation: No documentation
-

In [7]:
pip install lxml

Collecting lxml
  Downloading lxml-5.3.0-cp312-cp312-win_amd64.whl.metadata (3.9 kB)
Downloading lxml-5.3.0-cp312-cp312-win_amd64.whl (3.8 MB)
   ---------------------------------------- 0.0/3.8 MB ? eta -:--:--
   --------------------- ------------------ 2.1/3.8 MB 23.5 MB/s eta 0:00:01
   ---------------------------------------- 3.8/3.8 MB 10.8 MB/s eta 0:00:00
Installing collected packages: lxml
Successfully installed lxml-5.3.0
Note: you may need to restart the kernel to use updated packages.


In [11]:
from lxml import etree
import json

In [12]:

# Path to the XSD file
file_path = "uniprot.xml"

# Parse the XML file
tree = etree.parse(file_path)
root = tree.getroot()

# # Namespace for XSD
# namespace = {"xs": "http://www.w3.org/2001/XMLSchema"}

# # Extract all elements defined in the schema
# elements = root.xpath(".//xs:element", namespaces=namespace)

# # Store results
# element_info = []

# # Loop through the elements
# for elem in elements:
#     name = elem.get("name")  # Element name
#     elem_type = elem.get("type")  # Element type (if available)

#     # Extract documentation (if present)
#     doc = elem.xpath("./xs:annotation/xs:documentation", namespaces=namespace)
#     documentation = doc[0].text.strip() if doc else "No documentation"

#     # Append the result
#     element_info.append({
#         "name": name,
#         "type": elem_type,
#         "documentation": documentation
#     })

# # Print results
# for info in element_info:
#     print(f"Element Name: {info['name']}")
#     print(f"Type: {info['type']}")
#     print(f"Documentation: {info['documentation']}")
#     print("---")

# Namespace for XSD
namespace = {"xs": "http://www.w3.org/2001/XMLSchema"}

# Store results
schema_elements = []

# Recursive function to extract element details
def extract_element_details(element, parent_path=""):
    element_name = element.get("name")
    element_type = element.get("type", "None")
    
    # Build full path for better context
    full_path = f"{parent_path}/{element_name}" if element_name else parent_path

    # Extract documentation if available
    doc = element.xpath("./xs:annotation/xs:documentation", namespaces=namespace)
    documentation = doc[0].text.strip() if doc else "No documentation"

    # Extract attributes
    attributes = []
    for attr in element.xpath("./xs:complexType/xs:attribute", namespaces=namespace):
        attributes.append({
            "name": attr.get("name"),
            "type": attr.get("type", "None"),
            "use": attr.get("use", "optional")  # Check if it's required or optional
        })

    # Add element details to schema_elements
    schema_elements.append({
        "path": full_path,
        "name": element_name,
        "type": element_type,
        "documentation": documentation,
        "attributes": attributes
    })

    # Recursively process child elements
    for child in element.xpath(".//xs:element", namespaces=namespace):
        extract_element_details(child, parent_path=full_path)


# Start extracting from the root level
for top_level_element in root.xpath(".//xs:element", namespaces=namespace):
    extract_element_details(top_level_element)

# Print results
for element in schema_elements:
    print(f"Path: {element['path']}")
    print(f"Element Name: {element['name']}")
    print(f"Type: {element['type']}")
    print(f"Documentation: {element['documentation']}")
    if element["attributes"]:
        print("Attributes:")
        for attr in element["attributes"]:
            print(f" - Name: {attr['name']}, Type: {attr['type']}, Use: {attr['use']}")
    print("---")


Path: /uniprot
Element Name: uniprot
Type: None
Documentation: Describes a collection of UniProtKB entries.
---
Path: /uniprot
Element Name: None
Type: None
Documentation: No documentation
---
Path: /uniprot
Element Name: None
Type: None
Documentation: No documentation
---
Path: 
Element Name: None
Type: None
Documentation: No documentation
---
Path: 
Element Name: None
Type: None
Documentation: No documentation
---
Path: /entry
Element Name: entry
Type: None
Documentation: Describes a UniProtKB entry.
Attributes:
 - Name: dataset, Type: None, Use: required
 - Name: created, Type: xs:date, Use: required
 - Name: modified, Type: xs:date, Use: required
 - Name: version, Type: xs:int, Use: required
---
Path: /entry/accession
Element Name: accession
Type: xs:string
Documentation: No documentation
---
Path: /entry/name
Element Name: name
Type: xs:string
Documentation: No documentation
---
Path: /entry/protein
Element Name: protein
Type: proteinType
Documentation: No documentation
---
Path: 

In [13]:
# Save results to a JSON file
output_file = "schema_details.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(schema_elements, f, indent=4, ensure_ascii=False)
print(f"Schema details saved to {output_file}")

Schema details saved to schema_details.json
