In [7]:
from Bio import Entrez
from dotenv import dotenv_values
import json
import pandas as pd
import numpy as np

config = dotenv_values(".env")
Entrez.email = config['email']

In [8]:
def df_cleanup(x):
    if isinstance(x, list): 
        if len(x) == 0: 
            return np.nan
    else: 
        return x 
    

In [9]:
def esearch(db, query):
    handle = Entrez.esearch(db, term=query)
    record = Entrez.read(handle)
    handle.close()

    print(int(record["Count"]))

    return record

def esummary(db, id): 
    handle = Entrez.esummary(db=db, id=id)
    record = Entrez.read(handle)
    handle.close()
    return record

In [10]:
results = esearch('pubmed', '75N93019C00070/AI/NIAID NIH HHS/United States[Grants and Funding]')
uid_list = results['IdList']
summaries = esummary('pubmed', ','.join(uid_list))
len(summaries)

17


17

In [11]:
summaries = pd.DataFrame(summaries)

In [12]:
summaries = summaries.map(lambda x: df_cleanup(x)).replace('', np.nan).dropna(how='all', axis = 1)
print(summaries.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17 entries, 0 to 16
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Id               17 non-null     object
 1   PubDate          17 non-null     object
 2   EPubDate         14 non-null     object
 3   Source           17 non-null     object
 4   LastAuthor       17 non-null     object
 5   Title            17 non-null     object
 6   Volume           13 non-null     object
 7   Issue            12 non-null     object
 8   Pages            13 non-null     object
 9   NlmUniqueID      17 non-null     object
 10  ISSN             9 non-null      object
 11  ESSN             17 non-null     object
 12  RecordStatus     17 non-null     object
 13  PubStatus        13 non-null     object
 14  ArticleIds       17 non-null     object
 15  DOI              17 non-null     object
 16  History          17 non-null     object
 17  HasAbstract      17 non-null     int6

In [14]:
summaries.head()

Unnamed: 0,Id,PubDate,EPubDate,Source,LastAuthor,Title,Volume,Issue,Pages,NlmUniqueID,...,RecordStatus,PubStatus,ArticleIds,DOI,History,HasAbstract,PmcRefCount,FullJournalName,ELocationID,SO
0,38995971,2024 Jul,2024 Jul 12,PLoS Comput Biol,Pienaar E,Agent-based model predicts that layered struct...,20.0,7.0,e1012266,101238922,...,PubMed - indexed for MEDLINE,,"{'medline': [], 'pubmed': ['38995971'], 'pmc':...",10.1371/journal.pcbi.1012266,"{'medline': ['2024/07/30 18:41'], 'pubmed': ['...",1,68,PLoS computational biology,doi: 10.1371/journal.pcbi.1012266,2024 Jul;20(7):e1012266
1,38912839,2024 Aug 1,,J Immunol,Urdahl KB,Reappraising the Role of T Cell-Derived IFN-γ ...,213.0,3.0,339-346,2985117R,...,PubMed - indexed for MEDLINE,ppublish,"{'medline': [], 'pubmed': ['38912839'], 'pmc':...",10.4049/jimmunol.2400145,"{'medline': ['2024/07/15 18:43'], 'pubmed': ['...",1,60,"Journal of immunology (Baltimore, Md. : 1950)",doi: 10.4049/jimmunol.2400145,2024 Aug 1;213(3):339-346
2,38826239,2024 May 26,2024 May 26,bioRxiv,Rothchild AC,Absence of c-Maf and IL-10 enables Type I IFN ...,,,,101680187,...,PubMed,epublish,"{'medline': [], 'pubmed': ['38826239'], 'pmc':...",10.1101/2024.05.22.594428,"{'medline': ['2024/06/03 06:43'], 'pubmed': ['...",1,72,bioRxiv : the preprint server for biology,pii: 2024.05.22.594428. doi: 10.1101/2024.05.2...,2024 May 26;
3,38659794,2024 Apr 15,2024 Apr 15,bioRxiv,Urdahl KB,CD4-mediated immunity shapes neutrophil-driven...,,,,101680187,...,PubMed,epublish,"{'medline': [], 'pubmed': ['38659794'], 'pmc':...",10.1101/2024.04.12.589315,"{'medline': ['2024/04/25 06:50'], 'pubmed': ['...",1,67,bioRxiv : the preprint server for biology,pii: 2024.04.12.589315. doi: 10.1101/2024.04.1...,2024 Apr 15;
4,38617280,2024 Apr 5,2024 Apr 5,bioRxiv,Urdahl K,Re-appraising the role of T-cell derived inter...,,,,101680187,...,PubMed,epublish,"{'medline': [], 'pubmed': ['38617280'], 'pmc':...",10.1101/2024.04.04.588086,"{'medline': ['2024/04/15 06:44'], 'pubmed': ['...",1,57,bioRxiv : the preprint server for biology,pii: 2024.04.04.588086. doi: 10.1101/2024.04.0...,2024 Apr 5;


In [15]:
import requests
import re
# full text: 
pmid = "31350281"
url = f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/{pmid}/ascii"
results = requests.get(url)
results.status_code 
print(results.text)
results_json = json.loads(results.text)
# if title then start section
# if paragraph, add to title section
# add to sections list

sections = []

method = []
paragraphs = []

method_name = None
new_paragraph = None
temp = {
    'title': None, 
    'paragraphs': []
}


for i in results_json[0]['documents'][0]['passages']: 
    # print(i)
    section_type = i['infons']['section_type']
    # get title
    if section_type == "METHODS": 
        if bool(re.search('title', i['infons']['type'])): 
            if len(temp['paragraphs']) > 0 and temp['title'] is not None:
                temp['paragraphs'] = ''.join(temp['paragraphs'])
                sections.append(temp)
                temp = {
                    'title': None, 
                    'paragraphs': []
                }
                
            temp['title'] = i['text']
            temp['paragraphs'] = []
            
        if i['infons']['type'] == 'paragraph': 
            temp['paragraphs'].append(i['text'])

In [48]:
sections[0]['paragraphs'] 

"C57BL/6 and Nrf2-/- (B6.129X1-Nfe2l2tm1Ywk/J) mice were purchased from Jackson Laboratories (Bar Harbor, ME). Nrf2floxed (C57BL/6-Nfe2l2tm1.1Sred/SbisJ), CD11ccre (B6.Cg-Tg(Itgax-cre)1-1Reiz/J) and LysMcre (B6.129P2-Lyz2tm1(cre)Ifo/J) were purchased from Jackson Laboratories and bred to generate Nrf2CD11c and Nrf2LysM mice. Mice were housed and maintained in specific pathogen-free conditions at Seattle Children's Research Institute and experiments were performed in compliance with the Institutional Animal Care and Use Committee. 6-12 week old male and female mice were used for all experiments, except for RNA-sequencing, which used only female mice for uniformity. Mice infected with M.tb. were housed in a Biosafety Level 3 facility in an Animal Biohazard Containment Suite."

# Search by bioproject

In [122]:
from constants import databases

# Get Relevant Links

In [201]:
import requests
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
endpoint = "elink.fcgi"

def get_rev_links(db_from, db_to, db_id): 

    params = {
        'dbfrom': db_from, 
        'db': db_to, 
        'id': db_id, 
        'idtype': 'acc'
    }

    response = requests.get(base_url + endpoint, params=params)
    if response.status_code == 200: 
        print('Success!')

        root = ET.fromstring(response.text)

        for doc in root.findall(".//LinkSetDb"):
            data = {
                'dbFrom': db_from,
                'id': db_id,
                'dbTo': db_to,
                'links': [d.findtext('Id') for d in doc.findall(".//Link")]
            }
            return data
    else: 
        print(f'Status code: {response.status_code}')

In [202]:
metadata = []
for i in summaries['Id']:
    for db in ['bioproject', 'gds']:
        results = get_rev_links('pubmed', db, i)
        if results is not None: 
            metadata.append(results)

Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Status code: 429
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Status code: 429
Success!
Success!
Success!
Success!
Success!


In [205]:
rev_links = pd.DataFrame(metadata).explode('links')
rev_links = rev_links.sort_values(by = ['id', 'dbTo'])
rev_links

Unnamed: 0,dbFrom,id,dbTo,links
6,pubmed,33142108,bioproject,666628
7,pubmed,33142108,gds,200158807
4,pubmed,33711270,bioproject,554247
5,pubmed,33711270,gds,200134186
2,pubmed,34077724,bioproject,682412
3,pubmed,34077724,gds,200162620
0,pubmed,38236787,bioproject,874454
0,pubmed,38236787,bioproject,874453
0,pubmed,38236787,bioproject,874452
0,pubmed,38236787,bioproject,874451


In [213]:
rev_links.to_csv('./data/pubmed_links_20240828.csv')