Get metadata on publications and associated data

In [1]:
import json
import pickle
from datetime import datetime
from pathlib import Path

from Bio import Entrez
from dotenv import dotenv_values

config = dotenv_values('.env')
Entrez.email = config['email']
today = datetime.now().strftime("%Y-%m-%d")

In [2]:
# PubMed search term used for the esearch call below: the grant identifier,
# followed by the bracketed field tag the query is scoped to.
grant = (
    "75N93019C00070/AI/NIAID NIH HHS/United States"
    "[Grants and Funding]"
)

In [3]:
# Search PubMed for every paper that matches the grant query.
#
# ESearch returns only the first 20 IDs unless `retmax` is raised, so ask for
# the largest page a single call allows; otherwise a grant with more than 20
# papers would be silently truncated.
MAX_PMIDS = 10_000

# usehistory="y" is kept from the original call; nothing in this cell reads
# the history keys it produces.
handle = Entrez.esearch(db="pubmed", term=grant, usehistory="y", retmax=MAX_PMIDS)
try:
    record = Entrez.read(handle)
finally:
    # Release the connection even when parsing the response fails.
    handle.close()

# List of PubMed IDs (PMIDs) returned for the query
id_list = record["IdList"]

# "Count" is the total number of hits on the server; it exceeds len(id_list)
# only when the result set is larger than one page.
total_hits = int(record["Count"])
if len(id_list) < total_hits:
    print(f"Warning: only {len(id_list)} of {total_hits} matching papers were retrieved.")
print(f"Found {len(id_list)} papers for the grant.")

Found 17 papers for the grant.


In [77]:
# Show the PMIDs returned by the search.
id_list

['38995971', '38912839', '38826239', '38659794', '38617280', '38236787', '38130719', '38011264', '36993415', '35428268', '34215836', '34077724', '33903765', '33711270', '35340357', '33142108', '32895573']

In [87]:
# Fetch details for the first few papers
handle = Entrez.esummary(db="pubmed", id=id_list, rettype="medline", retmode="json")
records = handle.read()
handle.close()

records = json.loads(records.decode('utf-8'))

# Print the metadata
# records

results = records['result']
results.pop('uids')

['38995971',
 '38912839',
 '38826239',
 '38659794',
 '38617280',
 '38236787',
 '38130719',
 '38011264',
 '36993415',
 '35428268',
 '34215836',
 '34077724',
 '33903765',
 '33711270',
 '35340357',
 '33142108',
 '32895573']

In [16]:
# # Fetch details for the first few papers
# handle = Entrez.efetch(db="pubmed", id=id_list[0], rettype="medline", retmode="text")
# records = handle.read()
# handle.close()

# # Print the metadata
# print(records)

In [88]:
import xmltodict

In [99]:
# Enrich each summary in `results` with the abstract and MeSH headings taken
# from the full PubMed record. Papers lacking either field are reported and
# left without the corresponding key.
for pmid in id_list:
    # Ask for XML explicitly: the parsing below depends on the XML layout.
    handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
    try:
        article_xml = handle.read()
    finally:
        # Release the connection even when reading the response fails.
        handle.close()

    # Parse the XML data
    dict_data = xmltodict.parse(article_xml.decode())
    citation = dict_data['PubmedArticleSet']['PubmedArticle']['MedlineCitation']
    print(pmid)

    # Only a missing or empty element is treated as "field absent"; any other
    # error propagates instead of being swallowed by a bare `except`.
    try:
        results[pmid]['abstract'] = citation['Article']['Abstract']['AbstractText']
    except (KeyError, TypeError):
        print('\tAbstract: ', False)

    try:
        mesh_headings = citation['MeshHeadingList']['MeshHeading']
        # xmltodict yields a dict (not a list) when the element occurs exactly
        # once; wrap it so a lone heading is stored whole instead of being
        # iterated into its keys.
        if not isinstance(mesh_headings, list):
            mesh_headings = [mesh_headings]
        results[pmid]['mesh_headings'] = list(mesh_headings)
    except (KeyError, TypeError):
        print('\tMesh Headings: ', False)

38995971
38912839
38826239
	Mesh Headings:  False
38659794
	Mesh Headings:  False
38617280
	Mesh Headings:  False
38236787
38130719
38011264
36993415
	Mesh Headings:  False
35428268
34215836
	Mesh Headings:  False
34077724
33903765
	Abstract:  False
33711270
35340357
	Mesh Headings:  False
33142108
32895573


In [100]:
# Persist the enriched metadata as a date-stamped pickle under ./data.
output_path = Path('./data') / f'publications_{today}.pkl'

# open() does not create missing directories, so make sure ./data exists
# before writing (e.g. on a fresh checkout).
output_path.parent.mkdir(parents=True, exist_ok=True)

with output_path.open('wb') as file:
    # Use pickle to serialize the object and save it to the file.
    # NOTE: only unpickle files from a trusted source; loading a pickle can
    # execute arbitrary code.
    pickle.dump(results, file)