## Step 1: Retrive the list of PubmedID and PMC IDs about a particular Disease 

### Use Entrez to Search PubMed and Fetch Details ( Using E-utilities API.)

In [2]:
from Bio import Entrez
import requests
import json
import requests
import json
import os
import time  

# Always provide your email when using NCBI's API
Entrez.email = "enock.niyonkuru@jax.org"



In [3]:

def search_articles(disease_name, max_results=200):
    """
    Search articles related to a disease and return their PubMed IDs.
    """
    search_term = f"{disease_name}[Title/Abstract]"
    handle = Entrez.esearch(db="pubmed", term=search_term, retmax=max_results)
    record = Entrez.read(handle)
    handle.close()
    return record["IdList"]

def fetch_pmc_ids(pmids_list):
    """
    Fetch PMC IDs corresponding to a list of PubMed IDs, and return three lists:
    - pmids_without_pmc: PubMed IDs without corresponding PMC IDs
    - pmc_ids_list: PMC IDs in the format 'PMC12345'
    - pmids_with_pmc: PubMed IDs with corresponding PMC IDs
    """
    links = Entrez.elink(dbfrom="pubmed", db="pmc", id=pmids_list, linkname="pubmed_pmc")
    records = Entrez.read(links)
    links.close()
    
    pmc_ids_list = []    
    for record in records:
        pmid = record['IdList'][0]
        try:
            pmc_id = 'PMC' + record['LinkSetDb'][0]['Link'][0]['Id']
            pmc_ids_list.append(pmc_id)
        except IndexError:  # This PMID does not have a corresponding PMC ID
            continue
    
    # Convert sets to lists for the return value
    return  pmc_ids_list


In [4]:
# Example usage
disease_name = "Sickle Cell Anemia"
pmids_list = search_articles(disease_name)
pmc_ids_list = fetch_pmc_ids(pmids_list)


In [5]:
print("Length of PM IDs:", len(pmids_list))
print("PM IDs:", pmids_list[:5])

print("Length of PMC IDs:", len(pmc_ids_list))
print("PMC IDs:", pmc_ids_list[:5])



Length of PM IDs: 200
PM IDs: ['38344818', '38336101', '38335412', '38332651', '38326178']
Length of PMC IDs: 113
PMC IDs: ['PMC10834466', 'PMC10838028', 'PMC10825386', 'PMC10807523', 'PMC10809526']


## Step 2: Use Pubtator API to get the Annotated Texts for the selected IDs 

#### Abstract + Title  Text 

In [6]:
def fetch_and_save_abst_json(pmids_list, base_path, max_articles_to_save):
    """
    Fetches full texts for given PMC IDs in biocjson format and saves them as JSON files in the specified folder.
    Tracks the count of articles not found and successfully retrieved. Stops saving articles once a specified limit is reached.
    
    Parameters:
    - pmids_list (list of str): List of PMC IDs.
    - base_path (str): The base directory path where the files will be saved.
    - max_articles_to_save (int): Maximum number of articles to save before ending the function.
    """
    not_found_count = 0
    found_count = 0

    for pmid in pmids_list:
        if found_count >= max_articles_to_save:
            break  # End the loop if the maximum number of articles to save has been reached
        
        api_url = f"https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/export/biocjson?pmids={pmid}"
        try_again = True  # Flag to control the retry mechanism
        while try_again:
            response = requests.get(api_url)
            if response.status_code == 200 and response.text.strip():
                # Check if the response is not empty
                try:
                    data = response.json()
                    if data:  # Additional check to ensure JSON is not empty
                        filename = f"{pmid}.json"
                        full_path = os.path.join(base_path, filename)
                        with open(full_path, 'w', encoding='utf-8') as file:
                            json.dump(data, file, indent=4)
                        print(f"Saved full text for PM ID {pmid} in '{full_path}'.")
                        found_count += 1
                    else:
                        print(f"No content for PM ID {pmid}.")
                        not_found_count += 1
                    try_again = False  # No need to retry
                except json.JSONDecodeError:
                    print(f"Failed to decode JSON for PM ID {pmid}.")
                    not_found_count += 1
                    try_again = False  # No need to retry

            elif response.status_code == 429:
                # If HTTP status code is 429, wait for 3 seconds and retry once
                print(f"Rate limit exceeded for PM ID {pmid}. Waiting 3 seconds before retrying...")
                time.sleep(3)  # Wait for 3 seconds
                # After waiting, the loop will try the request again
            else:
                print(f"Failed to fetch full text for PM ID {pmid}: HTTP {response.status_code}")
                not_found_count += 1
                try_again = False  # No need to retry

    print(f"Total articles found and saved: {found_count}")
    print(f"Total articles not found: {not_found_count}")

#### Full Text

In [7]:
def fetch_and_save_fulltext_json(pmc_ids_list, base_path, max_articles_to_save):
    """
    Fetches full texts for given PMC IDs in biocjson format and saves them as JSON files in the specified folder.
    Tracks the count of articles not found and successfully retrieved. Stops saving articles once a specified limit is reached.
    
    Parameters:
    - pmcids (list of str): List of PMC IDs.
    - base_path (str): The base directory path where the files will be saved.
    - max_articles_to_save (int): Maximum number of articles to save before ending the function.
    """
    not_found_count = 0
    found_count = 0

    for pmcid in pmc_ids_list:
        if found_count >= max_articles_to_save:
            print(f"Reached the maximum limit of {max_articles_to_save} articles to save.")
            break  # Stop the function if the maximum number of articles to save has been reached

        api_url = f"https://www.ncbi.nlm.nih.gov/research/pubtator3-api/publications/pmc_export/biocjson?pmcids={pmcid}"
        try_again = True  # Flag to control the retry mechanism
        while try_again:
            response = requests.get(api_url)
            if response.status_code == 200 and response.text.strip():
                # Check if the response is not empty
                try:
                    data = response.json()
                    if data:  # Additional check to ensure JSON is not empty
                        filename = f"{pmcid}.json"
                        full_path = os.path.join(base_path, filename)
                        with open(full_path, 'w', encoding='utf-8') as file:
                            json.dump(data, file, indent=4)
                        print(f"Saved full text for PMC ID {pmcid} in '{full_path}'.")
                        found_count += 1
                    else:
                        print(f"No content for PMC ID {pmcid}.")
                        not_found_count += 1
                    try_again = False  # No need to retry
                except json.JSONDecodeError:
                    print(f"Failed to decode JSON for PMC ID {pmcid}.")
                    not_found_count += 1
                    try_again = False  # No need to retry

            elif response.status_code == 429:
                # If HTTP status code is 429, wait for 3 seconds and retry once
                print(f"Rate limit exceeded for PMC ID {pmcid}. Waiting 3 seconds before retrying...")
                time.sleep(3)  # Wait for 3 seconds
                # After waiting, the loop will try the request again
            else:
                print(f"Failed to fetch full text for PMC ID {pmcid}: HTTP {response.status_code}")
                not_found_count += 1
                try_again = False  # No need to retry

    print(f"Total articles found and saved: {found_count}")
    print(f"Total articles not found: {not_found_count}")


In [8]:
if __name__ == "__main__":
    base_path = "json_files"
    os.makedirs(base_path, exist_ok=True)
    pmids_list = pmids_list
    pmc_ids_list = pmc_ids_list
    max_articles_to_save = 50
    fetch_and_save_abst_json(pmids_list, base_path, max_articles_to_save)
    fetch_and_save_fulltext_json(pmc_ids_list, base_path, max_articles_to_save)


Failed to fetch full text for PM ID 38344818: HTTP 200
Failed to fetch full text for PM ID 38336101: HTTP 200
Failed to fetch full text for PM ID 38335412: HTTP 200
Failed to fetch full text for PM ID 38332651: HTTP 200
Failed to fetch full text for PM ID 38326178: HTTP 200
Failed to fetch full text for PM ID 38323371: HTTP 200
Rate limit exceeded for PM ID 38319780. Waiting 3 seconds before retrying...
Failed to fetch full text for PM ID 38319780: HTTP 200
Failed to fetch full text for PM ID 38312689: HTTP 200
Failed to fetch full text for PM ID 38310012: HTTP 200
Rate limit exceeded for PM ID 38308541. Waiting 3 seconds before retrying...
Failed to fetch full text for PM ID 38308541: HTTP 200
Failed to fetch full text for PM ID 38307830: HTTP 200
Failed to fetch full text for PM ID 38302294: HTTP 200
Rate limit exceeded for PM ID 38292987. Waiting 3 seconds before retrying...
Failed to fetch full text for PM ID 38292987: HTTP 200
Failed to fetch full text for PM ID 38289232: HTTP 200