In [1]:
from lxml import etree as ET
import zipfile
import os
import random
from collections import defaultdict
import pandas as pd

## Extract XML examples (with or without Microbes)

In [2]:
# File name of the XML document (used as the archive member name when extracting).
file_name = "hmdb_metabolites.xml"
# Downloaded HMDB archive, located under ./downloads of the current working directory.
file_path = os.path.join(
    os.getcwd(),
    "downloads",
    "hmdb_metabolites.zip",
)

In [3]:
# Where the decompressed XML is expected to live after extraction.
decomp_xml = os.path.join("temp_dir", "hmdb_metabolites.xml")
# Check for the file on disk: a truthiness test on the path string itself is
# always true for a non-empty string and would skip the extraction every time.
if not os.path.exists(decomp_xml):
    with zipfile.ZipFile(file_path, "r") as zip_f:
        zip_f.extract(file_name, path="temp_dir")

In [4]:
def extract_record_by_accession(xml_path, target_accession, output_path):
    """Write the <metabolite> record with the given accession to its own XML file.

    The input is streamed with ``iterparse``; records that do not match are
    discarded as the scan proceeds, and scanning stops at the first match.

    Args:
        xml_path: Path of the HMDB XML file to scan.
        target_accession: Text of the ``<accession>`` child to look for.
        output_path: Destination file for the serialized record (overwritten
            if it exists).

    Returns:
        None. Prints ``Saved: <output_path>`` on success, or a not-found
        message when no record carries the requested accession.
    """
    ns = {'hmdb': 'http://www.hmdb.ca'}
    context = ET.iterparse(xml_path, events=("end",), tag="{http://www.hmdb.ca}metabolite")

    for _, elem in context:
        accession = elem.findtext("hmdb:accession", namespaces=ns)
        if accession == target_accession:
            with open(output_path, "wb") as out:
                out.write(ET.tostring(elem, pretty_print=True, xml_declaration=True, encoding="UTF-8"))
            print(f"Saved: {output_path}")
            break

        # Not the record we want: empty it and detach everything before it
        # from the parent so the partially built tree does not keep growing.
        elem.clear()
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]
    else:
        # The loop ran to the end of the file without hitting `break`.
        print(f"Accession {target_accession} not found in {xml_path}")

In [5]:
# Decompressed HMDB XML that the example records are pulled from.
xml_path = os.path.join("temp_dir", "hmdb_metabolites.xml")

# Folder under the current working directory that receives the examples.
example_dir = os.path.join(os.getcwd(), "examples")
os.makedirs(example_dir, exist_ok=True)

# First example record: the output file is named after the accession.
target_accession = "HMDB0000011"
output_path = os.path.join(example_dir, f"{target_accession}.xml")
extract_record_by_accession(
    xml_path=xml_path,
    target_accession=target_accession,
    output_path=output_path,
)

Saved: /Users/bailinzhang/Documents/Wu_Lab/Projects/HMDB_v5/examples/HMDB0000011.xml


In [6]:
# Second example record; xml_path and example_dir are reused from the earlier cell.
target_accession = "HMDB0004327"
output_path = os.path.join(example_dir, f"{target_accession}.xml")
extract_record_by_accession(
    xml_path=xml_path,
    target_accession=target_accession,
    output_path=output_path,
)

Saved: /Users/bailinzhang/Documents/Wu_Lab/Projects/HMDB_v5/examples/HMDB0004327.xml


In [7]:
def _iter_metabolite_elements(xml_path):
    """Yield each <metabolite> element of xml_path, releasing it once the caller is done with it."""
    for _, elem in ET.iterparse(xml_path, events=("end",), tag="{http://www.hmdb.ca}metabolite"):
        yield elem
        # Runs when the consumer asks for the next element: empty this one and
        # detach everything before it from the parent, so the partially built
        # tree does not accumulate one node per record.
        elem.clear()
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]


def extract_random_metabolites(xml_path, output_path="examples/HMDB_metabolites_examples_10.xml", n=10, seed=None):
    """Extract n random <metabolite> records from an HMDB XML file.

    The file is streamed twice: once to count the records, and once to
    serialize the randomly chosen ones. The selected records are written to
    ``output_path`` inside an ``<hmdb xmlns="http://www.hmdb.ca">`` root.

    Args:
        xml_path: Path of the HMDB XML file to sample from.
        output_path: File to write the sampled records to (overwritten if it
            exists; its directory must already exist).
        n: Number of records to sample. If the file holds fewer than n
            records, all of them are extracted.
        seed: Optional seed for a reproducible selection. When None, the
            module-level ``random`` generator is used.

    Returns:
        None. Progress and the selected indices are printed.

    Raises:
        ValueError: If n is negative (raised by ``random.sample``).
    """
    print("Counting total <metabolite> entries...")
    total = sum(1 for _ in _iter_metabolite_elements(xml_path))
    print(f"Total metabolites found: {total}")

    # random.sample raises when asked for more items than exist; clamp instead.
    if n > total:
        print(f"Requested {n} records but only {total} are available; extracting all of them.")
        n = total

    # A private generator keeps a seeded run from disturbing global random state.
    rng = random.Random(seed) if seed is not None else random
    selected_indices = set(rng.sample(range(total), n))
    print(f"Randomly selected indices: {sorted(selected_indices)}")

    output_elems = []
    if selected_indices:
        for current_index, elem in enumerate(_iter_metabolite_elements(xml_path)):
            if current_index in selected_indices:
                output_elems.append(ET.tostring(elem, pretty_print=True, encoding="unicode"))
                # Every requested record is in hand; no need to parse the rest.
                if len(output_elems) == len(selected_indices):
                    break
    print(f"Writing to output file: {output_path}")

    with open(output_path, "w", encoding="utf-8") as f:
        f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        f.write('<hmdb xmlns="http://www.hmdb.ca">\n')
        for entry in output_elems:
            f.write(entry)
        f.write('</hmdb>\n')

    print(f"Done! Extracted {len(output_elems)} records to '{output_path}'")

In [8]:
# Sample ten records from the full XML into the function's default output file.
extract_random_metabolites(xml_path=xml_path, n=10)

Counting total <metabolite> entries...
Total metabolites found: 217920
Randomly selected indices: [1919, 24414, 40054, 90417, 103310, 106104, 106794, 116454, 162113, 203456]
Writing to output file: examples/HMDB_metabolites_examples_10.xml
Done! Extracted 10 records to 'examples/HMDB_metabolites_examples_10.xml'


## Explore root tag and children

In [9]:
def get_metabolite_tags(xml_path):
    """Collect the local names of all direct children of <metabolite> elements.

    Args:
        xml_path: Path of an XML file whose root either contains
            ``{http://www.hmdb.ca}metabolite`` children or is itself such an
            element (a single-record file).

    Returns:
        set[str]: Child tag names with the ``{namespace}`` prefix removed.
    """
    metabolite_tag = "{http://www.hmdb.ca}metabolite"
    root = ET.parse(xml_path).getroot()

    # A single-record file has <metabolite> as the root itself, in which case
    # findall on the root would find nothing.
    if root.tag == metabolite_tag:
        metabolites = [root]
    else:
        metabolites = root.findall(metabolite_tag)

    all_tags = set()
    for metabolite in metabolites:
        for child in metabolite:
            # Comments and processing instructions are iterated as children
            # too, but their .tag is not a string; skip them.
            if not isinstance(child.tag, str):
                continue
            all_tags.add(child.tag.split("}")[-1])
    return all_tags

In [10]:
# List the child tag names found in the sampled example file.
exp_path = os.path.join("examples", "HMDB_metabolites_examples_10.xml")
first_child_tags = get_metabolite_tags(xml_path=exp_path)
first_child_tags

{'abnormal_concentrations',
 'accession',
 'average_molecular_weight',
 'bigg_id',
 'biocyc_id',
 'biological_properties',
 'cas_registry_number',
 'chebi_id',
 'chemical_formula',
 'chemspider_id',
 'creation_date',
 'description',
 'diseases',
 'drugbank_id',
 'experimental_properties',
 'fbonto_id',
 'foodb_id',
 'general_references',
 'inchi',
 'inchikey',
 'iupac_name',
 'kegg_id',
 'knapsack_id',
 'metlin_id',
 'monisotopic_molecular_weight',
 'name',
 'normal_concentrations',
 'ontology',
 'pdb_id',
 'phenol_explorer_compound_id',
 'predicted_properties',
 'protein_associations',
 'pubchem_compound_id',
 'secondary_accessions',
 'smiles',
 'spectra',
 'state',
 'status',
 'synonyms',
 'synthesis_reference',
 'taxonomy',
 'traditional_iupac',
 'update_date',
 'version',
 'vmh_id',
 'wikipedia_id'}