In [1]:

from Bio import Entrez
import time
import json
import xml.etree.ElementTree as ET


In [42]:
# Replace with your email (required by NCBI)
Entrez.email = "your_email@solarahealth.org"

def search_pubmed(term, max_results=3):
    """Searches PubMed and returns a list of PMIDs."""
    handle = Entrez.esearch(db="pmc", term=term, retmax=max_results)
    record = Entrez.read(handle)
    return record["IdList"]

def extract_title_from_xml(xml_text):
    """Extract article title from PMC XML."""
    try:
        # Parse the XML
        root = ET.fromstring(xml_text)
        title_temp = ""
        # Look for title in various possible locations
        # Method 1: article-title in article-meta
        title_elem = root.find('.//article-title')
        if title_elem is not None:
            title_temp =  ''.join(title_elem.itertext()).strip()
        
        # Method 2: title-group
        title_group = root.find('.//title-group/article-title')
        if title_group is not None:
            title_temp = ''.join(title_group.itertext()).strip()
        
        # Method 3: Look in front matter
        front_title = root.find('.//front//article-title')
        if front_title is not None:
            title_temp = ''.join(front_title.itertext()).strip()
        
        return title_temp.replace("/", "_").replace("\\", "_").replace(" ", "_")
        
    except ET.ParseError as e:
        return f"XML parsing error: {e}"
    except Exception as e:
        return f"Error extracting title: {e}"

def fetch_articles(pmid_list):
    """Fetches article summaries (title, abstract) from PubMed."""
    ids = ",".join(pmid_list)
    with Entrez.efetch(db="pmc", id=ids, rettype="full", retmode="full") as handle:
        return handle.read()

def fetch_single_article(pmid):
    """Fetches a single article from PubMed."""
    with Entrez.efetch(db="pmc", id=pmid, rettype="medline", retmode="text") as handle:
        return handle.read()

# def fetch_articles_detailed(pmid_list):
#     """Fetch article metadata using efetch in medline format."""
#     ids = ",".join(pmid_list)
#     with Entrez.efetch(db="pmc", id=ids, rettype="medline", retmode="text") as handle:
#         return handle.read()

def save_results(results_text, filename):
    with open(filename, "w", encoding="utf-8") as f:
        f.write(results_text)

In [43]:

# Example 1: Drug-focused search
query = "Metformin GLP-1 Type 2 Diabetes"

# You can also try:
# query = "HbA1c monitoring procedure"
# query = "cardiovascular outcomes AND obesity"
# query = "retinopathy AND screening AND diabetes"

print(f"🔍 Searching PMC for: {query}")
ids = search_pubmed(query, max_results=3)
print(f"🔗 Found {len(ids)} articles")

🔍 Searching PMC for: Metformin GLP-1 Type 2 Diabetes
🔗 Found 3 articles


In [44]:
article_id = ids[0]
print(article_id)

12198475


In [45]:
print(f"Fetching article {article_id}")
xml_text = fetch_single_article(article_id)
title = extract_title_from_xml(xml_text)
print(f"Title: {title}")
# save_results(xml_text, f"articles/{id}-{title}.xml")
# print(f"Saved article {article_id} to {title}.xml")

Fetching article 12198475
Title: XML parsing error: syntax error: line 1, column 0


In [47]:
print(xml_text)

IS  - 1552-5260 (Print)
IS  - 1552-5279 (Electronic)
VI  - 21
IP  - 6
DP  - 2025 Jun
TI  - Effects of the SGLT2 inhibitor dapagliflozin in early Alzheimer's disease: A 
      randomized controlled trial.
LID - e70416
AB  - INTRODUCTION: Due to its metabolic effects, dapagliflozin, a sodium‐glucose 
      transporter 2 (SGLT2) inhibitor, holds potential as an Alzheimer's disease (AD) 
      therapeutic. METHODS: We conducted a double‐blind, randomized, 
      placebo‐controlled, parallel‐group, 12‐week single‐site study to investigate the 
      effect of dapagliflozin in participants with probable AD (Mini‐Mental State 
      Examination [MMSE] score 15–26). We planned to enroll 48 participants with 2:1 
      randomization to 10 mg dapagliflozin once daily (n = 32) versus matching placebo 
      (n = 16). The primary objective was the effect of dapagliflozin on cerebral 
      N‐acetylaspartate (NAA). We also assessed safety, glycemic control, body 
      composition, brain metabolism

In [39]:
with open("articles/12186413-The_role_of_myocardial_fibrosis_in_the_diabetic_cardiomyopathy.xml", "rb") as f:
    text = Entrez.read(f)

In [40]:
text

[DictElement({'front': {'list': [], 'def-list': [], 'ack': [], 'fn-group': [], 'notes': [], 'bio': [], 'glossary': [], 'journal-meta': {'self-uri': [], 'journal-title-group': [{'journal-subtitle': [], 'trans-title-group': [], 'journal-title': ['Diabetology & Metabolic Syndrome'], 'abbrev-journal-title': []}], 'journal-id': [StringElement('Diabetol Metab Syndr', attributes={'journal-id-type': 'nlm-ta'}), StringElement('Diabetol Metab Syndr', attributes={'journal-id-type': 'iso-abbrev'}), StringElement('964', attributes={'journal-id-type': 'pmc-domain-id'}), StringElement('diametsyn', attributes={'journal-id-type': 'pmc-domain'})], 'contrib-group': [], 'issn': [StringElement('1758-5996', attributes={'pub-type': 'epub'})], 'aff-alternatives': [], 'isbn': [], 'aff': [], 'notes': [], 'publisher': [['BMC']]}, 'article-meta': {'related-object': [], 'issue-id': [StringElement('478402', attributes={'pub-id-type': 'pmc-issue-id'})], 'contrib-group': [[ListElement([DictElement({'surname': 'Sun', 