# Pubmed Medline parsing xml files using E-utilities

In [None]:
import csv
import re
import urllib
from time import sleep

# computational[All Fields] AND ("neurosciences"[MeSH Terms] OR "neurosciences"[All Fields] OR "neuroscience"[All Fields])

# https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=computational+neuroscience&usehistory=y&rettype=json

query = "computational+neuroscience"

# common settings between esearch and efetch
base_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'
db = 'db=pubmed'

# esearch settings
search_eutil = 'esearch.fcgi?'
search_term = '&term=' + query
search_usehistory = '&usehistory=y'
search_rettype = '&rettype=json'

# call the esearch command for the query and read the web result
search_url = base_url+search_eutil+db+search_term+search_usehistory+search_rettype
print("this is the esearch command:\n" + search_url + "\n")
f = urllib.request.urlopen (search_url)
search_data = f.read().decode('utf-8')

# extract the total abstract count
total_abstract_count = int(re.findall("<Count>(\d+?)</Count>",search_data)[0])

# efetch settings
fetch_eutil = 'efetch.fcgi?'
retmax = 20
retstart = 0
fetch_retmode = "&retmode=xml"
fetch_rettype = "&rettype=abstract"

# obtain webenv and querykey settings from the esearch results
fetch_webenv = "&WebEnv=" + re.findall ("<WebEnv>(\S+)<\/WebEnv>", search_data)[0]
fetch_querykey = "&query_key=" + re.findall("<QueryKey>(\d+?)</QueryKey>",search_data)[0]

# call efetch commands using a loop until all abstracts are obtained
run = True
all_abstracts = list()
loop_counter = 1

while run:
    print("this is efetch run number " + str(loop_counter))
    loop_counter += 1
    fetch_retstart = "&retstart=" + str(retstart)
    fetch_retmax = "&retmax=" + str(retmax)
    # create the efetch url
    fetch_url = base_url+fetch_eutil+db+fetch_querykey+fetch_webenv+fetch_retstart+fetch_retmax+fetch_retmode+fetch_rettype
    print(fetch_url)
    # open the efetch url
    f = urllib.request.urlopen (fetch_url)
    fetch_data = f.read().decode('utf-8')
    # split the data into individual abstracts
    abstracts = fetch_data.split("<")
    # append to the list all_abstracts
    all_abstracts = all_abstracts+abstracts
    print("a total of " + str(len(all_abstracts)) + " abstracts have been downloaded.\n")
    # wait 2 seconds so we don't get blocked
    sleep(2)
    # update retstart to download the next chunk of abstracts
    retstart = retstart + retmax
    if retstart > total_abstract_count:
        run = False

In [None]:
len(all_abstracts)

In [None]:
print(all_abstracts[0])

# Parsing the XML 

In [2]:
import xml.etree.ElementTree as ET
from urllib.request import urlopen

def chunker(seq, size):
    return (seq[pos:pos + size] for pos in xrange(0, len(seq), size))

# https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&query_key=1&WebEnv=MCID_62bba0354f090a7ee705b3b0&retstart=0&retmax=20&retmode=xml&rettype=abstract

# xml id = <PMID Version="1">35760525</PMID>
# Gives same result

efetch = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?&db=pubmed&term=computational+neuroscience&usehistory=y&retmode=xml&id=%s" % ('35760525')
handle = urlopen(efetch)
data = handle.read()

root = ET.fromstring(data)
for article in root.findall("PubmedArticle"):
    pmid = article.find("MedlineCitation/PMID").text
    year = article.find("MedlineCitation/Article/Journal/JournalIssue/PubDate/Year")
    if year is None: year = 'NA'
    else: year = year.text
    aulist = article.findall("MedlineCitation/Article/AuthorList/Author")
    affiliation = article.find("MedlineCitation/Article/AuthorList/Author/AffiliationInfo")
    
    PublicationTypeList = article.findall("MedlineCitation/Article/PublicationTypeList")
    typepub = article.find("MedlineCitation/Article/PublicationTypeList/PublicationType")
    #print(pmid, year, len(aulist), affiliation, aulist, ET.dump(root))
    print(len(aulist))
    for author in aulist:    
        print(ET.dump(author))
        
    print("-----------------------------------------------------")    
    print(len(PublicationTypeList))
    for journal_type in PublicationTypeList:
        print(ET.dump(journal_type))
        
        
        
    
    ArticleTitle = article.findall("MedlineCitation/Article/ArticleTitle")
    #typepub = article.find("MedlineCitation/Article/PublicationTypeList/PublicationType")
    print("-----------------------------------------------------")    
    print(len(ArticleTitle))
    for title in ArticleTitle:
        print(ET.dump(title))
        
    
    ArticleAbstract = article.findall("MedlineCitation/Article/Abstract/AbstractText")
    print("-----------------------------------------------------")    
    print(len(ArticleAbstract))
    for Abstract in ArticleAbstract:
        print(ET.dump(Abstract))
        
    
    
    PMID = article.findall("MedlineCitation/PMID")
    print("-----------------------------------------------------")    
    print(len(PMID))
    for pmid in PMID:
        print(ET.dump(pmid))
    

4
<Author ValidYN="Y"><LastName>Barthélemy</LastName><ForeName>Frédéric V</ForeName><Initials>FV</Initials><AffiliationInfo><Affiliation>Institut de Neurosciences de la Timone, UMR7289, CNRS &amp; Aix-Marseille Université, 13385 Marseille, France.</Affiliation></AffiliationInfo></Author>
None
<Author ValidYN="Y"><LastName>Fleuriet</LastName><ForeName>Jérome</ForeName><Initials>J</Initials><AffiliationInfo><Affiliation>Institut de Neurosciences de la Timone, UMR7289, CNRS &amp; Aix-Marseille Université, 13385 Marseille, France.</Affiliation></AffiliationInfo><AffiliationInfo><Affiliation>Assistance Publique-Hôpitaux de Paris, Intensive Care Unit, Raymond Poincaré Hospital, Garches, France.</Affiliation></AffiliationInfo></Author>
None
<Author ValidYN="Y"><LastName>Perrinet</LastName><ForeName>Laurent U</ForeName><Initials>LU</Initials><Identifier Source="ORCID">https://orcid.org/0000-0002-9536-010X</Identifier><AffiliationInfo><Affiliation>Institut de Neurosciences de la Timone, UMR7289

# References

1. [NCBI E-utilities (Entrez Programming Utilities)](https://www.ncbi.nlm.nih.gov/books/NBK25499/)

2. [Pubmed e-utility from April 2022 new version (last update before this was in 2020)](https://www.youtube.com/watch?v=aETx4MyXukk)
