In [1]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
from datetime import datetime
from tqdm.auto import tqdm

# Get XML from Pubmed

## Functions

In [2]:
def search_pubmed(term, num_max_results):
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    params = {'db' : 'pubmed', 
              'term' : term, 
              'retmode' : 'xml', 
              'retmax' : num_max_results} # retmax is max results
    response = requests.get(url, params=params)
    return response.content

In [3]:
def get_pmids_query(pubmed_response_raw):
    root = ET.fromstring(pubmed_response_raw)
    pmids = [id_elem.text for id_elem in root.findall(".//Id")]
    return pmids

In [4]:
def fetch_abstracts(pmids):
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    string_of_ids = ','.join(pmids)
    params = {'db' : 'pubmed', 'id' : string_of_ids, 'retmode' : 'xml', 'rettype' : 'abstract'}
    response = requests.get(url, params=params)
    return response.content

## Work

In [5]:
# pubmed parameters
term = 'asperger'
print(term)
num_max_results = 3000
print(num_max_results)
batch_size = 300
print(batch_size)

asperger
3000
300


In [6]:
# get PMIDs
print(datetime.now())
pmids = get_pmids_query(search_pubmed(term, num_max_results))
print(datetime.now())

2025-08-25 10:33:56.513195
2025-08-25 10:33:57.129535


In [7]:
# in batches, get abstract XML
print(datetime.now())
pmids_batches = [pmids[i:i+batch_size] for i in range(0, len(pmids), batch_size)]
xml_raw_batches = [fetch_abstracts(batch) for batch in tqdm(pmids_batches)]
xml_parsed_batches = [ET.fromstring(batch) for batch in xml_raw_batches]
print(datetime.now())

2025-08-25 10:33:57.135732


  0%|          | 0/10 [00:00<?, ?it/s]

2025-08-25 10:34:15.445318


# Parse XML in batches

## Functions

In [8]:
def get_pmids(xml_parsed):
    pmids = []
    for item in xml_parsed:
        try:
            pmid = item.find(".//PMID").text
        except:
            pmid = ''
        pmids.append(pmid)
    return pmids

In [9]:
def get_elocationids(xml_parsed):
    elocationids = []
    for item in xml_parsed:
        try:
            elocationid_text = item.find(".//Article").find(".//ELocationID").text
            elocationid_type = item.find(".//Article").find(".//ELocationID").attrib.get('EIdType')
            elocationid_full = elocationid_type+': '+elocationid_text
        except:
            elocationid_full = ''
        elocationids.append(elocationid_full)
    return elocationids

In [10]:
def get_titles(xml_parsed):
    titles = []
    for item in xml_parsed:
        try:
            title = item.find(".//Article").find(".//ArticleTitle").text
        except:
            title = ''
        titles.append(title)
    return titles

In [11]:
def get_journals(xml_parsed):
    journals = []
    for item in xml_parsed:
        try:
            journal = item.find(".//Article").find(".//Journal").find(".//Title").text
        except:
            journal = ''
        journals.append(journal)
    return journals

In [12]:
def get_years(xml_parsed):
    years = []
    for item in xml_parsed:
        try:
            year = item.find(".//Article").find(".//Journal").find(".//JournalIssue").find(".//PubDate").find(".//Year").text
        except:
            year = ''
        years.append(year)
    return years

In [13]:
def get_authors(xml_parsed):
    authors = []
    for item in xml_parsed:
        try:
            auth_list = item.find(".//Article").find(".//AuthorList").findall(".//Author")
            name_list = [author.find(".//ForeName").text+' '+author.find(".//LastName").text for author in auth_list]
            authors.append(', '.join(name_list))
        except:
            authors.append('')
    return authors

In [14]:
def get_affiliations(xml_parsed):
    affiliations = []
    for item in xml_parsed:
        try:
            auth_list = item.find(".//Article").find(".//AuthorList").findall(".//Author")
            affiliation_list = [author.find(".//AffiliationInfo").find(".//Affiliation").text for author in auth_list]
            affiliations.append(' '.join(list(set(affiliation_list))))
        except:
            affiliations.append('')
    return affiliations

In [15]:
def get_abstracts(xml_parsed):
    abstracts = []
    for item in xml_parsed:
        try:
            abstract = item.find(".//Article").find(".//Abstract").find(".//AbstractText").text
        except:
            abstract = ''
        abstracts.append(abstract)
    return abstracts

## Work

In [16]:
# loop over batches, then loop over fields
column_name_to_getter = {'pmid' : get_pmids, 'elocationid' : get_elocationids, 
                        'title' : get_titles, 'journal' : get_journals, 'year' : get_years, 
                        'author' : get_authors, 'affiliation' : get_affiliations, 
                        'abstract' : get_abstracts}
df_batches = [pd.DataFrame({column_name : getter(batch) \
for (column_name, getter) in column_name_to_getter.items()}) \
for batch in xml_parsed_batches]

# CSV file

In [17]:
# final dataframe
df = pd.concat(df_batches)
df

Unnamed: 0,pmid,elocationid,title,journal,year,author,affiliation,abstract
0,40770710,pii: 769,The dynamic trajectory of autistic life and it...,BMC psychiatry,2025,"Leshata Winter Mokhwelepa, Gsakani Olivia Sumb...","School of Medicine, Faculty of Health Science,...",There is a noticeable knowledge vacuum on the ...
1,40690320,,[Quantitative analysis of autism online forums].,Psychiatria Hungarica : A Magyar Pszichiatriai...,2025,"Brigitta Kakuszi, Szilvia Hetesy, Pál Czobor",,Social media platforms are becoming increasing...
2,40637642,doi: 10.5152/TurkArchPediatr.2025.25127,The Work of Grunya Efimovna Sukhareva in the F...,Turkish archives of pediatrics,2025,"Annio Posar, Paola Visconti",IRCCS Istituto delle Scienze Neurologiche di B...,Despite several articles that in recent years ...
3,40597832,pii: 484,Autism spectrum disorders and childhood caries...,BMC pediatrics,2025,"Qiufang Jin, Zexiu He, Dongfang Xu, Ruihua Lin...","Department of Otolaryngology, The Second Hospi...",This study aimed to investigate the causal rel...
4,40527486,pii: S0021-7557(25)00099-3,Psychometric characteristics of the Mini-TEA s...,Jornal de pediatria,2025,"Cassiano Mateus Forcelini, Regina Ampese, Hele...",Associação de Pais e Amigos dos Excepcionais (...,Early diagnosis of autism spectrum disorder (A...
...,...,...,...,...,...,...,...,...
295,7866673,,"Linguistics, human communication and psychiatry.",The British journal of psychiatry : the journa...,1994,"P Thomas, W Fraser",,Psycholinguistics and sociolinguistics have ex...
296,7794327,,Prevalence of Asperger's syndrome in a secure ...,The British journal of psychiatry : the journa...,1994,"P Scragg, A Shah",,The hypothesis that Asperger's syndrome (AS) m...
297,29871460,doi: 10.1007/BF01978114,A preliminary study of right hemisphere cognit...,European child & adolescent psychiatry,1994,"Hadyn D Ellis, Diane M Ellis, William Fraser, ...","Department of Psychological Medicine, Universi...",Seven children and young adults with definite ...
298,7926319,,Developmental prosopagnosia in Asperger syndro...,Developmental medicine and child neurology,1994,I Kracke,"Department of Psychology, Hollymoor Hospital, ...",The case of a young man is presented who initi...


In [18]:
# write CSV file
df.to_csv('../data/data-kb.csv', sep='\t', index=False) # use tab as separator

In [19]:
print(datetime.now())

2025-08-25 10:34:15.674774
