## Step 1: Retrive the list of PubmedID and PMC IDs about a particular Disease 

### Use Entrez to Search PubMed and Fetch Details ( Using E-utilities API.)

In [35]:
from Bio import Entrez
import requests
import json
import requests
import json
import os
import time  
import pandas as pd
import yaml


# Always provide your email when using NCBI's API
Entrez.email = "enockniyonkuru250@gmail.com"
KEY = "a0de528ff3ee8d27c192c9106d0f75396008"

In [43]:
def fetch_mesh_ids(pmid, retries=3, delay=2):
    """
    Fetch MeSH IDs and their descriptor names for a given PubMed ID.

    Parameters:
    - pmid: PubMed ID.
    - retries: Number of retries in case of an HTTP error.
    - delay: Delay in seconds before retrying.

    Returns:
    - Dictionary with MeSH IDs as keys and descriptor names as values.
    """
    for attempt in range(retries):
        try:
            handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
            article = Entrez.read(handle)
            handle.close()

            # Extract MeSH IDs and descriptor names from the article
            mesh_info = {}
            if 'PubmedArticle' in article:
                mesh_heading_list = article['PubmedArticle'][0].get('MedlineCitation', {}).get('MeshHeadingList', [])
                for mesh_heading in mesh_heading_list:
                    descriptor_name = mesh_heading.get('DescriptorName')
                    if descriptor_name:
                        mesh_id = descriptor_name.attributes.get('UI')
                        if mesh_id:
                            mesh_info[mesh_id] = str(descriptor_name)

            return mesh_info
        except Exception as e:
            print(f"Error fetching MeSH IDs for PMID {pmid}: {e}. Attempt {attempt + 1} of {retries}. Retrying in {delay} seconds.")
            time.sleep(delay)
    raise Exception(f"Failed to fetch MeSH IDs for PMID {pmid} after {retries} attempts.")

def search_articles_with_mesh_info(disease_name: str, mesh_list_path: str, max_pmid_retrieve: int):
    """
    Search articles related to a disease and treatments using combined MeSH IDs from a .tsv file, 
    and return their PubMed IDs along with the MeSH IDs and their descriptor names that contributed to their selection.
    
    Parameters:
    - disease_name: Name of the disease.
    - mesh_list_path: Path to the .tsv file containing MeSH IDs.
    - max_pmid_retrieve: Maximum number of PubMed IDs to retrieve.
    
    Returns:
    - Dictionary with PubMed IDs as keys and a dictionary of contributing MeSH IDs and descriptor names as values.
    """
    # Read the .tsv file and create a set of combined MeSH IDs
    df = pd.read_csv(mesh_list_path, sep='\t', header=None)
    
    combined_mesh_ids = {mesh_id for mesh_ids in df.iloc[:, 2] for mesh_id in mesh_ids.split(';')}

    # Combine disease name and treatment search terms
    treatment_search = "(diagnosis[MeSH Terms] OR therapeutics[MeSH Terms])"
    search_term = f"({disease_name}[Title/Abstract]) AND ({treatment_search})"

    # Perform the search
    handle = Entrez.esearch(db="pubmed", term=search_term, retmax=max_pmid_retrieve)
    record = Entrez.read(handle)
    handle.close()

    # Retrieve MeSH IDs and descriptor names for each PubMed ID and filter based on the combined mesh set
    pmid_mesh_info = {}
    for pmid in record["IdList"]:
        mesh_info = fetch_mesh_ids(pmid)
        filtered_mesh_info = {mesh_id: descriptor_name for mesh_id, descriptor_name in mesh_info.items() if mesh_id in combined_mesh_ids}
        pmid_mesh_info[pmid] = filtered_mesh_info

    return pmid_mesh_info


In [46]:
disease_name = "sickle cell"
mesh_list_path = "/Users/niyone/Desktop/maxo/automaxo/data/mesh_sets.tsv"  
max_pmid_retrieve = 5
results = search_articles_with_mesh_info(disease_name, mesh_list_path, max_pmid_retrieve)



In [47]:
print(results)

{'38429041': {'D059408': 'Pain Management'}, '38428445': {'D010360': 'Patient Transfer'}, '38422317': {'D001803': 'Blood Transfusion'}, '38418238': {'D041623': 'Tomography, Optical Coherence'}, '38409146': {'D011296': 'Prenatal Diagnosis', 'D015193': 'Chorionic Villi Sampling', 'D003955': 'Diagnostic Tests, Routine'}}


In [50]:
pmids_list = list(results.keys())

In [51]:
pmids_list

['38429041', '38428445', '38422317', '38418238', '38409146']

In [10]:
pmid = "36776881"

In [11]:
# Retrieve MeSH terms for each PubMed ID
pmid_mesh_info = {}
handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
article = Entrez.read(handle)
handle.close()

In [9]:
type(article)

Bio.Entrez.Parser.DictionaryElement

In [68]:
article

{'PubmedBookArticle': [], 'PubmedArticle': [{'MedlineCitation': DictElement({'GeneralNote': [], 'SpaceFlightMission': [], 'OtherAbstract': [], 'InvestigatorList': [], 'OtherID': [], 'KeywordList': [ListElement([StringElement('COVID-19', attributes={'MajorTopicYN': 'N'}), StringElement('Janus kinase inhibitors', attributes={'MajorTopicYN': 'N'}), StringElement('antiviral agents', attributes={'MajorTopicYN': 'N'}), StringElement('neutralizing antibody therapy', attributes={'MajorTopicYN': 'N'}), StringElement('treatments', attributes={'MajorTopicYN': 'N'})], attributes={'Owner': 'NOTNLM'})], 'CitationSubset': ['IM'], 'PMID': StringElement('36776881', attributes={'Version': '1'}), 'DateCompleted': {'Year': '2023', 'Month': '02', 'Day': '14'}, 'DateRevised': {'Year': '2023', 'Month': '03', 'Day': '28'}, 'Article': DictElement({'Language': ['eng'], 'ArticleDate': [DictElement({'Year': '2023', 'Month': '01', 'Day': '26'}, attributes={'DateType': 'Electronic'})], 'ELocationID': [StringElement

In [69]:
article.keys()

dict_keys(['PubmedBookArticle', 'PubmedArticle'])

In [70]:
PubmedArticle = (article.get('PubmedArticle'))[0]
print(PubmedArticle.keys())

dict_keys(['MedlineCitation', 'PubmedData'])


In [None]:
mesh_heading_list = article.get('PubmedArticle')[0].get('MedlineCitation').get('MeshHeadingList')
# mesh_heading_list = article.get('PubmedArticle')[0].get('MedlineCitation', {}).get('MeshHeadingList', [])
for mesh_heading in mesh_heading_list:
    descriptor_name = mesh_heading.get('DescriptorName')
    if descriptor_name:
        mesh_id = descriptor_name.attributes.get('UI')
        print(mesh_id)
        mesh_ids.append(mesh_id)
        # if mesh_id in formatted_mesh_ids:
        #     mesh_ids.append(mesh_id)

In [71]:
print(article.get('PubmedArticle')[0].get('MedlineCitation').get('MeshHeadingList')[3].get('DescriptorName'))

COVID-19 Vaccines


In [72]:
MeshHeadingList = article.get('PubmedArticle')[0].get('MedlineCitation').get('MeshHeadingList')

In [73]:
MeshHeadingList = article.get('PubmedArticle')[0].get('MedlineCitation').get('MeshHeadingList')
mesh_term = (MeshHeadingList[1])
print(mesh_term)

{'QualifierName': [], 'DescriptorName': StringElement('COVID-19', attributes={'UI': 'D000086382', 'MajorTopicYN': 'Y'})}


In [33]:
import yaml

In [9]:

pmid_mesh_info = {}
pmid = "12345678"  # Replace with the actual PubMed ID you want to retrieve

handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
article = Entrez.read(handle)
handle.close()

# Extract MeSH IDs from the article
mesh_ids = []
mesh_heading_list = article.get('PubmedArticle')[0].get('MedlineCitation').get('MeshHeadingList', [])
for mesh_heading in mesh_heading_list:
    descriptor_name = mesh_heading.get('DescriptorName')
    if descriptor_name:
        mesh_id = descriptor_name.attributes.get('UI')
        if mesh_id:
            mesh_ids.append(mesh_id)

pmid_mesh_info[pmid] = mesh_ids

# Save the dictionary as a YAML file
with open('temp_pmid_mesh_info.yaml', 'w') as yaml_file:
    yaml.dump(pmid_mesh_info, yaml_file, default_flow_style=False)


In [35]:

pmid = "36776881"
# Retrieve MeSH terms for each PubMed ID
pmid_mesh_info = {}
handle = Entrez.efetch(db="pubmed", id=pmid, retmode="xml")
article = Entrez.read(handle)
handle.close()

# Save the dictionary as a YAML file
with open('test_pmid_mesh_info.yaml', 'w') as yaml_file:
    yaml.dump(article, yaml_file, default_flow_style=False)


In [30]:
article

{'PubmedBookArticle': [], 'PubmedArticle': [{'MedlineCitation': DictElement({'GeneralNote': [], 'SpaceFlightMission': [], 'CitationSubset': ['IM'], 'InvestigatorList': [], 'OtherAbstract': [], 'OtherID': [], 'KeywordList': [ListElement([StringElement('COVID-19', attributes={'MajorTopicYN': 'N'}), StringElement('Janus kinase inhibitors', attributes={'MajorTopicYN': 'N'}), StringElement('antiviral agents', attributes={'MajorTopicYN': 'N'}), StringElement('neutralizing antibody therapy', attributes={'MajorTopicYN': 'N'}), StringElement('treatments', attributes={'MajorTopicYN': 'N'})], attributes={'Owner': 'NOTNLM'})], 'PMID': StringElement('36776881', attributes={'Version': '1'}), 'DateCompleted': {'Year': '2023', 'Month': '02', 'Day': '14'}, 'DateRevised': {'Year': '2023', 'Month': '03', 'Day': '28'}, 'Article': DictElement({'Language': ['eng'], 'ELocationID': [StringElement('1125246', attributes={'EIdType': 'pii', 'ValidYN': 'Y'}), StringElement('10.3389/fimmu.2023.1125246', attributes=

In [31]:
# Extract MeSH terms from the article
mesh_terms = []
if isinstance(article, list) and len(article) > 0:
    if 'MedlineCitation' in article[0] and 'MeshHeadingList' in article[0]['MedlineCitation']:
        for mesh_heading in article[0]['MedlineCitation']['MeshHeadingList']:
            if 'DescriptorName' in mesh_heading:
                mesh_term = mesh_heading['DescriptorName']
                # if mesh_term.attributes.get('UI') in formatted_mesh_ids:
                #     mesh_terms.append((mesh_term.attributes['UI'], str(mesh_term)))

pmid_mesh_info[pmid] = mesh_terms

In [32]:
pmid_mesh_info

{'36776881': []}

In [17]:
article

{'PubmedBookArticle': [], 'PubmedArticle': [{'MedlineCitation': DictElement({'GeneralNote': [], 'SpaceFlightMission': [], 'CitationSubset': ['IM'], 'InvestigatorList': [], 'OtherAbstract': [], 'OtherID': [], 'KeywordList': [], 'PMID': StringElement('30325033', attributes={'Version': '1'}), 'DateCompleted': {'Year': '2019', 'Month': '04', 'Day': '09'}, 'DateRevised': {'Year': '2020', 'Month': '03', 'Day': '09'}, 'Article': DictElement({'Language': ['eng'], 'ELocationID': [StringElement('10.1111/trf.14901', attributes={'EIdType': 'doi', 'ValidYN': 'Y'})], 'ArticleDate': [DictElement({'Year': '2018', 'Month': '10', 'Day': '16'}, attributes={'DateType': 'Electronic'})], 'Journal': {'ISSN': StringElement('1537-2995', attributes={'IssnType': 'Electronic'}), 'JournalIssue': DictElement({'Volume': '58', 'Issue': '12', 'PubDate': {'Year': '2018', 'Month': 'Dec'}}, attributes={'CitedMedium': 'Internet'}), 'Title': 'Transfusion', 'ISOAbbreviation': 'Transfusion'}, 'ArticleTitle': 'Red blood cell 

In [10]:
results

{'38409769': [],
 '38361007': [],
 '38335157': [],
 '38306546': [],
 '38260087': []}

In [3]:

def search_articles(disease_name, max_results=200):
    """
    Search articles related to a disease and return their PubMed IDs.
    """
    search_term = f"{disease_name}[Title/Abstract]"
    handle = Entrez.esearch(db="pubmed", term=search_term, retmax=max_results)
    record = Entrez.read(handle)
    handle.close()
    return record["IdList"]

def fetch_pmc_ids(pmids_list):
    """
    Fetch PMC IDs corresponding to a list of PubMed IDs, and return three lists:
    - pmids_without_pmc: PubMed IDs without corresponding PMC IDs
    - pmc_ids_list: PMC IDs in the format 'PMC12345'
    - pmids_with_pmc: PubMed IDs with corresponding PMC IDs
    """
    links = Entrez.elink(dbfrom="pubmed", db="pmc", id=pmids_list, linkname="pubmed_pmc")
    records = Entrez.read(links)
    links.close()
    
    pmc_ids_list = []    
    for record in records:
        pmid = record['IdList'][0]
        try:
            pmc_id = 'PMC' + record['LinkSetDb'][0]['Link'][0]['Id']
            pmc_ids_list.append(pmc_id)
        except IndexError:  # This PMID does not have a corresponding PMC ID
            continue
    
    # Convert sets to lists for the return value
    return  pmc_ids_list


In [4]:
# Example usage
disease_name = "Sickle Cell Anemia"
pmids_list = search_articles(disease_name)
pmc_ids_list = fetch_pmc_ids(pmids_list)


In [5]:
print("Length of PM IDs:", len(pmids_list))
print("PM IDs:", pmids_list[:5])

print("Length of PMC IDs:", len(pmc_ids_list))
print("PMC IDs:", pmc_ids_list[:5])



Length of PM IDs: 200
PM IDs: ['38344818', '38336101', '38335412', '38332651', '38326178']
Length of PMC IDs: 113
PMC IDs: ['PMC10834466', 'PMC10838028', 'PMC10825386', 'PMC10807523', 'PMC10809526']


## Step 2: Use Pubtator API to get the Annotated Texts for the selected IDs 

#### Abstract + Title  Text 

In [6]:
def fetch_and_save_abst_json(pmids_list, base_path, max_articles_to_save):
    """
    Fetches full texts for given PMC IDs in biocjson format and saves them as JSON files in the specified folder.
    Tracks the count of articles not found and successfully retrieved. Stops saving articles once a specified limit is reached.
    
    Parameters:
    - pmids_list (list of str): List of PMC IDs.
    - base_path (str): The base directory path where the files will be saved.
    - max_articles_to_save (int): Maximum number of articles to save before ending the function.
    """
    not_found_count = 0
    found_count = 0

    for pmid in pmids_list:
        if found_count >= max_articles_to_save:
            break  # End the loop if the maximum number of articles to save has been reached
        
        api_url = f"https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/biocjson?pmids={pmid}"
        try_again = True  # Flag to control the retry mechanism
        while try_again:
            response = requests.get(api_url)
            if response.status_code == 200 and response.text.strip():
                # Check if the response is not empty
                try:
                    data = response.json()
                    if data:  # Additional check to ensure JSON is not empty
                        filename = f"{pmid}.json"
                        full_path = os.path.join(base_path, filename)
                        with open(full_path, 'w', encoding='utf-8') as file:
                            json.dump(data, file, indent=4)
                        print(f"Saved full text for PM ID {pmid} in '{full_path}'.")
                        found_count += 1
                    else:
                        print(f"No content for PM ID {pmid}.")
                        not_found_count += 1
                    try_again = False  # No need to retry
                except json.JSONDecodeError:
                    print(f"Failed to decode JSON for PM ID {pmid}.")
                    not_found_count += 1
                    try_again = False  # No need to retry

            elif response.status_code == 429:
                # If HTTP status code is 429, wait for 3 seconds and retry once
                print(f"Rate limit exceeded for PM ID {pmid}. Waiting 3 seconds before retrying...")
                time.sleep(3)  # Wait for 3 seconds
                # After waiting, the loop will try the request again
            else:
                print(f"Failed to fetch full text for PM ID {pmid}: HTTP {response.status_code}")
                not_found_count += 1
                try_again = False  # No need to retry

    print(f"Total articles found and saved: {found_count}")
    print(f"Total articles not found: {not_found_count}")

#### Full Text

In [7]:
def fetch_and_save_fulltext_json(pmc_ids_list, base_path, max_articles_to_save):
    """
    Fetches full texts for given PMC IDs in biocjson format and saves them as JSON files in the specified folder.
    Tracks the count of articles not found and successfully retrieved. Stops saving articles once a specified limit is reached.
    
    Parameters:
    - pmcids (list of str): List of PMC IDs.
    - base_path (str): The base directory path where the files will be saved.
    - max_articles_to_save (int): Maximum number of articles to save before ending the function.
    """
    not_found_count = 0
    found_count = 0

    for pmcid in pmc_ids_list:
        if found_count >= max_articles_to_save:
            print(f"Reached the maximum limit of {max_articles_to_save} articles to save.")
            break  # Stop the function if the maximum number of articles to save has been reached

        api_url = f"https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/pmc_export/biocjson?pmcids={pmcid}"
        try_again = True  # Flag to control the retry mechanism
        while try_again:
            response = requests.get(api_url)
            if response.status_code == 200 and response.text.strip():
                # Check if the response is not empty
                try:
                    data = response.json()
                    if data:  # Additional check to ensure JSON is not empty
                        filename = f"{pmcid}.json"
                        full_path = os.path.join(base_path, filename)
                        with open(full_path, 'w', encoding='utf-8') as file:
                            json.dump(data, file, indent=4)
                        print(f"Saved full text for PMC ID {pmcid} in '{full_path}'.")
                        found_count += 1
                    else:
                        print(f"No content for PMC ID {pmcid}.")
                        not_found_count += 1
                    try_again = False  # No need to retry
                except json.JSONDecodeError:
                    print(f"Failed to decode JSON for PMC ID {pmcid}.")
                    not_found_count += 1
                    try_again = False  # No need to retry

            elif response.status_code == 429:
                # If HTTP status code is 429, wait for 3 seconds and retry once
                print(f"Rate limit exceeded for PMC ID {pmcid}. Waiting 3 seconds before retrying...")
                time.sleep(3)  # Wait for 3 seconds
                # After waiting, the loop will try the request again
            else:
                print(f"Failed to fetch full text for PMC ID {pmcid}: HTTP {response.status_code}")
                not_found_count += 1
                try_again = False  # No need to retry

    print(f"Total articles found and saved: {found_count}")
    print(f"Total articles not found: {not_found_count}")


In [8]:
if __name__ == "__main__":
    base_path = "json_files"
    os.makedirs(base_path, exist_ok=True)
    pmids_list = pmids_list
    pmc_ids_list = pmc_ids_list
    max_articles_to_save = 50
    fetch_and_save_abst_json(pmids_list, base_path, max_articles_to_save)
    fetch_and_save_fulltext_json(pmc_ids_list, base_path, max_articles_to_save)


Failed to fetch full text for PM ID 38344818: HTTP 200
Failed to fetch full text for PM ID 38336101: HTTP 200
Failed to fetch full text for PM ID 38335412: HTTP 200
Failed to fetch full text for PM ID 38332651: HTTP 200
Failed to fetch full text for PM ID 38326178: HTTP 200
Failed to fetch full text for PM ID 38323371: HTTP 200
Rate limit exceeded for PM ID 38319780. Waiting 3 seconds before retrying...
Failed to fetch full text for PM ID 38319780: HTTP 200
Failed to fetch full text for PM ID 38312689: HTTP 200
Failed to fetch full text for PM ID 38310012: HTTP 200
Rate limit exceeded for PM ID 38308541. Waiting 3 seconds before retrying...
Failed to fetch full text for PM ID 38308541: HTTP 200
Failed to fetch full text for PM ID 38307830: HTTP 200
Failed to fetch full text for PM ID 38302294: HTTP 200
Rate limit exceeded for PM ID 38292987. Waiting 3 seconds before retrying...
Failed to fetch full text for PM ID 38292987: HTTP 200
Failed to fetch full text for PM ID 38289232: HTTP 200

In [7]:
import pandas as pd
from Bio import Entrez

In [17]:

def search_articles(disease_name: str, mesh_list_path: str, max_pmid_retrieve: int):
    """
    Search articles related to a disease and treatments using combined MeSH IDs from a .tsv file, and return their PubMed IDs.
    
    Parameters:
    - disease_name: Name of the disease.
    - mesh_list_path: Path to the .tsv file containing MeSH IDs.
    - max_pmid_retrieve: Maximum number of PubMed IDs to retrieve.
    
    Returns:
    - List of PubMed IDs.
    """
    # Read the .tsv file
    df = pd.read_csv(mesh_list_path, sep='\t', header=None)

    # Combine all List_MeSH_IDs from the third column into a single list
    combined_mesh_ids = []
    for mesh_ids in df.iloc[:, 2]:  # Using the third column (index 2)
        combined_mesh_ids.extend(mesh_ids.split(';'))

    # Remove duplicates and format MeSH IDs
    unique_mesh_ids = set(combined_mesh_ids)
    formatted_mesh_ids = [mesh_id.split('meshd')[-1] for mesh_id in unique_mesh_ids if 'meshd' in mesh_id]

    # Create a search term for treatments using combined MeSH IDs
    treatment_search = " OR ".join([f"MeSH:{mesh_id}" for mesh_id in formatted_mesh_ids])

    # Combine disease name and treatment search terms
    search_term = f"({disease_name}[Title/Abstract]) AND ({treatment_search})"

    # Perform the search
    handle = Entrez.esearch(db="pubmed", term=search_term, retmax=max_pmid_retrieve)
    record = Entrez.read(handle)
    handle.close()

    return record["IdList"]



In [18]:
disease_name = "sickle cell"
mesh_list_path = "../data/mesh_sets.tsv"
max_pmid_retrieve = 10

pmids = search_articles(disease_name, mesh_list_path, max_pmid_retrieve)
print(pmids)


['38326171', '38287736', '38013366', '37200963', '36927679', '36714120', '35283782', '35140852', '34966850', '33596221']
