In [1]:
from lxml import etree as ET
import zipfile
import os
import random
from collections import defaultdict
import pandas as pd

## Extract XML examples (with or without Microbes)

In [2]:
# Name of the XML member inside the archive, and where the downloaded
# metabolite archive lives relative to the current working directory.
file_name = "hmdb_metabolites.xml"
file_path = os.path.join(os.getcwd(), "downloads", "hmdb_metabolites.zip")

In [3]:
# Unpack the metabolite XML only when it is not already present in downloads/,
# so re-running the cell does not extract the archive again.
decomp_xml = os.path.join("downloads", "hmdb_metabolites.xml")
if not os.path.exists(decomp_xml):
    with zipfile.ZipFile(file_path, mode="r") as zip_f:
        zip_f.extract(member=file_name, path="downloads")

In [3]:
def extract_record_by_accession(xml_path, target_accession, output_path, tag="{http://www.hmdb.ca}metabolite"):
    """Stream through an HMDB XML file and save one record to its own file.

    The file is read incrementally with ``iterparse`` and every non-matching
    record is released as soon as it has been inspected. The first ``tag``
    element whose ``<accession>`` child equals ``target_accession`` is written
    to ``output_path`` and scanning stops.

    Args:
        xml_path: Path of the XML file to scan.
        target_accession: Accession text to look for.
        output_path: File the matching record is written to (pretty-printed,
            UTF-8, with an XML declaration).
        tag: Fully qualified tag of the record elements to iterate over.

    Returns:
        None. Prints ``Saved: <output_path>`` on success, or a "not found"
        message when no record matched (no file is written in that case).
    """
    ns = {'hmdb': 'http://www.hmdb.ca'}
    context = ET.iterparse(xml_path, events=("end",), tag=tag)

    for _, elem in context:
        if elem.findtext("hmdb:accession", namespaces=ns) == target_accession:
            with open(output_path, "wb") as out:
                out.write(
                    ET.tostring(elem,
                                pretty_print=True,
                                xml_declaration=True,
                                encoding="UTF-8")
                )
            print(f"Saved: {output_path}")
            break

        # Release the record just inspected: clear() empties it, and the loop
        # below detaches the already-visited siblings from the root so they do
        # not accumulate while the rest of the file is scanned.
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    else:
        # Loop ended without `break`: the whole file was scanned with no match.
        print(f"Accession {target_accession} not found in {xml_path}")

In [4]:
# Source XML to sample from, and the folder that receives the extracted
# example records (created on first use).
xml_path = os.path.join("downloads", "hmdb_metabolites.xml")

example_dir = os.path.join(os.getcwd(), "examples")
os.makedirs(example_dir, exist_ok=True)

In [None]:
# Save one metabolite record to examples/<accession>.xml. The file name is
# derived from the accession so the two cannot drift apart when the cell is copied.
target_accession = "HMDB0000011"
output_path = os.path.join(example_dir, f"{target_accession}.xml")
extract_record_by_accession(xml_path, target_accession, output_path)

In [7]:
# Save one metabolite record to examples/<accession>.xml. The file name is
# derived from the accession so the two cannot drift apart when the cell is copied.
target_accession = "HMDB0004327"
output_path = os.path.join(example_dir, f"{target_accession}.xml")
extract_record_by_accession(xml_path, target_accession, output_path)

Saved: /Users/bailinzhang/Documents/Wu_Lab/Projects/HMDB_v5/examples/HMDB0004327.xml


In [5]:
# Save one metabolite record to examples/<accession>.xml. The file name is
# derived from the accession so the two cannot drift apart when the cell is copied.
target_accession = "HMDB0000328"
output_path = os.path.join(example_dir, f"{target_accession}.xml")
extract_record_by_accession(xml_path, target_accession, output_path)

Saved: /Users/bailinzhang/Documents/Wu_Lab/Projects/HMDB_v5/examples/HMDB0000328.xml


In [6]:
# Save one metabolite record to examples/<accession>.xml. The file name is
# derived from the accession so the two cannot drift apart when the cell is copied.
target_accession = "HMDB0000330"
output_path = os.path.join(example_dir, f"{target_accession}.xml")
extract_record_by_accession(xml_path, target_accession, output_path)

Saved: /Users/bailinzhang/Documents/Wu_Lab/Projects/HMDB_v5/examples/HMDB0000330.xml


In [8]:
def _release_parsed_element(elem):
    """Drop a fully processed ``iterparse`` element and the siblings before it."""
    elem.clear()
    # clear() empties the element but leaves it attached to its parent, so the
    # already-visited siblings are detached as well to keep memory flat.
    while elem.getprevious() is not None:
        del elem.getparent()[0]


def extract_random_metabolites(xml_path, output_path="examples/HMDB_metabolites_examples_10.xml", n=10, seed=None):
    """Extracts n random <metabolite> records from a large HMDB XML file and writes them
    to an output file with proper XML structure and namespace.

    The file is streamed twice: once to count the records and once to collect
    the sampled ones. Parsed elements are released as soon as they have been
    handled, and the second pass stops once every sampled record is collected.

    Args:
        xml_path: Path of the HMDB metabolite XML file.
        output_path: File that receives the sampled records wrapped in an
            ``<hmdb>`` root element.
        n: Number of records to sample.
        seed: Optional seed for a reproducible sample. When ``None`` the
            module-level ``random`` generator is used, as before.

    Raises:
        ValueError: If ``n`` is negative or larger than the number of records.
    """
    metabolite_tag = "{http://www.hmdb.ca}metabolite"

    print("Counting total <metabolite> entries...")
    total = 0
    for _, elem in ET.iterparse(xml_path, events=("end",), tag=metabolite_tag):
        total += 1
        # Without this the counting pass would keep every parsed record alive.
        _release_parsed_element(elem)
    print(f"Total metabolites found: {total}")

    if not 0 <= n <= total:
        raise ValueError(f"n must be between 0 and {total} (records available), got {n}")

    rng = random if seed is None else random.Random(seed)
    selected_indices = set(rng.sample(range(total), n))
    print(f"Randomly selected indices: {sorted(selected_indices)}")

    output_elems = []
    context = ET.iterparse(xml_path, events=("end",), tag=metabolite_tag)
    for current_index, (_, elem) in enumerate(context):
        if current_index in selected_indices:
            # Serialize before releasing; the string is all that is kept.
            output_elems.append(ET.tostring(elem, pretty_print=True, encoding="unicode"))
        _release_parsed_element(elem)
        if len(output_elems) == len(selected_indices):
            # Every sampled record is in hand; no need to read the rest.
            break
    print(f"Writing to output file: {output_path}")

    with open(output_path, "w", encoding="utf-8") as f:
        f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        f.write('<hmdb xmlns="http://www.hmdb.ca">\n')
        for entry in output_elems:
            f.write(entry)
        f.write('</hmdb>\n')

    print(f"Done! Extracted {len(output_elems)} records to '{output_path}'")

In [9]:
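# Already run once to create examples/HMDB_metabolites_examples_10.xml (output kept below).
# Uncomment to draw a new random sample; this re-scans the entire XML file.
# extract_random_metabolites(xml_path, n=10)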

Counting total <metabolite> entries...
Total metabolites found: 217920
Randomly selected indices: [41240, 91090, 103714, 105509, 108429, 142392, 172312, 193310, 209608, 216669]
Writing to output file: examples/HMDB_metabolites_examples_10.xml
Done! Extracted 10 records to 'examples/HMDB_metabolites_examples_10.xml'


## Explore root tag and children

In [10]:
def get_metabolite_tags(xml_path):
    """Collect the namespace-free tag names of all direct children of metabolites.

    Works both for files whose root holds several ``<metabolite>`` records and
    for single-record files whose root element is itself a ``<metabolite>``.

    Args:
        xml_path: Path of the XML file to parse (loaded fully into memory).

    Returns:
        A set of child tag names with the ``{namespace}`` prefix removed.
    """
    metabolite_tag = "{http://www.hmdb.ca}metabolite"
    root = ET.parse(xml_path).getroot()

    if root.tag == metabolite_tag:
        # Single-record file: the root is the metabolite itself.
        metabolites = [root]
    else:
        metabolites = root.findall(metabolite_tag)

    all_tags = set()
    for metabolite in metabolites:
        for child in metabolite:
            # Comments and processing instructions have a non-string tag in lxml.
            if not isinstance(child.tag, str):
                continue
            all_tags.add(child.tag.split("}")[-1])
    return all_tags

In [11]:
# Distinct first-level child tags seen across the sampled metabolite records.
exp_path = os.path.join("examples", "HMDB_metabolites_examples_10.xml")
first_child_tags = get_metabolite_tags(xml_path=exp_path)
first_child_tags

{'abnormal_concentrations',
 'accession',
 'average_molecular_weight',
 'bigg_id',
 'biocyc_id',
 'biological_properties',
 'cas_registry_number',
 'chebi_id',
 'chemical_formula',
 'chemspider_id',
 'creation_date',
 'description',
 'diseases',
 'drugbank_id',
 'experimental_properties',
 'fbonto_id',
 'foodb_id',
 'general_references',
 'inchi',
 'inchikey',
 'iupac_name',
 'kegg_id',
 'knapsack_id',
 'metlin_id',
 'monisotopic_molecular_weight',
 'name',
 'normal_concentrations',
 'ontology',
 'pdb_id',
 'phenol_explorer_compound_id',
 'predicted_properties',
 'protein_associations',
 'pubchem_compound_id',
 'secondary_accessions',
 'smiles',
 'spectra',
 'state',
 'status',
 'synonyms',
 'synthesis_reference',
 'taxonomy',
 'traditional_iupac',
 'update_date',
 'version',
 'vmh_id',
 'wikipedia_id'}

### Ontology tag and tag text

In [12]:
def strip_tag_namespace(tag: str) -> str:
    """Return ``tag`` without a leading ``{namespace}`` prefix.

    Everything up to and including the last ``}`` is dropped; a tag that
    contains no ``}`` is returned unchanged.
    """
    _, _, local_name = tag.rpartition("}")
    return local_name

In [13]:
def recurse_print_tags(element, level=0):
    """Print ``element`` and all of its descendants as an indented outline.

    Each line is ``<tag>: <text>``, indented two spaces per nesting ``level``.
    The tag is shown without its namespace; missing or whitespace-only text
    prints as an empty string.
    """
    stripped_text = (element.text or "").strip()
    print(f"{'  ' * level}{strip_tag_namespace(element.tag)}: {stripped_text}")
    for child in element:
        recurse_print_tags(child, level + 1)

In [14]:
def extract_children(xml_path, accession_id=None, term_name="Disposition"):
    """Print the ontology sub-tree whose top-level term equals ``term_name``.

    The file may have an ``<hmdb>`` root holding several ``<metabolite>``
    records or a single ``<metabolite>`` root. For each candidate metabolite
    the ``<root>`` children of its first ``<ontology>`` element are searched;
    the first one whose ``<term>`` text equals ``term_name`` is printed with
    ``recurse_print_tags`` and the function returns.

    Args:
        xml_path: Path of the XML file to parse.
        accession_id: When given, only the metabolite whose ``<accession>``
            text equals this value is inspected.
        term_name: Text of the top-level ontology term to look for.

    Returns:
        None. Results and "not found" / "unknown root" messages are printed.
    """
    tree = ET.parse(xml_path)
    rec_root = tree.getroot()
    root_tag = strip_tag_namespace(rec_root.tag)

    if root_tag == "hmdb":
        # XPath `*` matches element nodes only, so comments among the children
        # are ignored instead of reaching strip_tag_namespace.
        metabolites = rec_root.xpath("*[local-name()='metabolite']")
    elif root_tag == "metabolite":
        metabolites = [rec_root]
    else:
        print(f"Unknown root tag: {root_tag}")
        return

    for metabolite in metabolites:
        if accession_id:
            acc = metabolite.xpath("*[local-name()='accession']")
            if not acc or acc[0].text != accession_id:
                continue

        ontology = metabolite.xpath("*[local-name()='ontology']")
        if not ontology:
            continue

        roots = ontology[0].xpath("*[local-name()='root']")
        print(f"Found {len(roots)} root nodes in ontology.")
        for ontology_root in roots:
            term = ontology_root.xpath("*[local-name()='term']")
            if term and term[0].text == term_name:
                print(f"=== {term_name} Block ===")
                recurse_print_tags(ontology_root)
                return

    print(f"{term_name} not found.")

In [15]:
# Print the Disposition ontology block of the extracted HMDB0004327 example.
hmdb0004327_exp = os.path.join("examples", "HMDB0004327.xml")
extract_children(xml_path=hmdb0004327_exp, accession_id="HMDB0004327")

Found 3 root nodes in ontology.
=== Disposition Block ===
root: 
  term: Disposition
  definition: A concept that describes the origin of a chemical, its location within an organism, or its route of exposure.
  parent_id: 
  level: 1
  type: parent
  descendants: 
    descendant: 
      term: Route of exposure
      definition: A mean by which a chemical agent comes in contact with an organism, either under intended or unintended circumstances.
      parent_id: 7724
      level: 2
      type: parent
      synonyms: 
      descendants: 
        descendant: 
          term: Enteral
          definition: Chemical exposure via the alimentary canal (mouth to anus).
          parent_id: 7743
          level: 3
          type: parent
          synonyms: 
          descendants: 
            descendant: 
              term: Ingestion
              definition: Chemical exposure facilitated by entry through the mouth.
              parent_id: 7744
              level: 4
              type: child


## Protein HMDB XML

In [7]:
# Name of the XML member inside the archive, and where the downloaded
# protein archive lives relative to the current working directory.
file_name = "hmdb_proteins.xml"
file_path = os.path.join(os.getcwd(), "downloads", "hmdb_proteins.zip")

In [8]:
# Unpack the protein XML only when it is not already present in downloads/,
# so re-running the cell does not extract the archive again.
decomp_xml = os.path.join("downloads", "hmdb_proteins.xml")
if not os.path.exists(decomp_xml):
    with zipfile.ZipFile(file_path, mode="r") as zip_f:
        zip_f.extract(member=file_name, path="downloads")

In [9]:
# Source XML to sample from, and the folder that receives the extracted
# example records (created on first use).
xml_path = os.path.join("downloads", "hmdb_proteins.xml")

example_dir = os.path.join(os.getcwd(), "examples")
os.makedirs(example_dir, exist_ok=True)

In [None]:
# Save one protein record to examples/<accession>.xml. The file name is
# derived from the accession so the two cannot drift apart when the cell is copied.
target_accession = "HMDBP00001"
output_path = os.path.join(example_dir, f"{target_accession}.xml")
extract_record_by_accession(xml_path, target_accession, output_path, tag="{http://www.hmdb.ca}protein")

In [11]:
# Save one protein record to examples/<accession>.xml. The file name is
# derived from the accession so the two cannot drift apart when the cell is copied.
target_accession = "HMDBP08448"
output_path = os.path.join(example_dir, f"{target_accession}.xml")
extract_record_by_accession(xml_path, target_accession, output_path, tag="{http://www.hmdb.ca}protein")

Saved: /Users/bailinzhang/Documents/Wu_Lab/Projects/HMDB_v5/examples/HMDBP08448.xml
