# FETCHING DATA SETUP - PUBMED PROJECT

## 0. LIBRARIES

In [2]:
########################################################################################################################################################################################################################################
# 0. LIBRARIES #########################################################################################################################################################################################################################
########################################################################################################################################################################################################################################

In [4]:
# 0. Libraries
import requests
import time
import json
import os
from xml.etree import ElementTree as ET
from typing import List, Optional, Dict

## 1. SEARCH FUNCTION - connection with PubMed

In [6]:
########################################################################################################################################################################################################################################
# 1. Search Function ###################################################################################################################################################################################################################
########################################################################################################################################################################################################################################

In [9]:
# Function to fetch article IDs for a specific query declared later
def search_pubmed(
    query: str,
    api_key: str,
    max_results: int = 100,
    retstart: int = 0,
    timeout: float = 30.0,
) -> dict:
    """
    Search PubMed (ESearch endpoint) using the specified query.

    Parameters:
    - query (str): The search query for PubMed (e.g., 'disease[TIAB] AND english[LA]').
    - api_key (str): Your PubMed API key for authenticated requests.
    - max_results (int): The maximum number of article IDs to retrieve in one request.
    - retstart (int): The starting point for the results to paginate through.
    - timeout (float): Seconds to wait for the server before giving up on the request.

    Returns:
    - dict: Parsed JSON response. Callers in this file read the matching IDs from
      ["esearchresult"]["idlist"] and the total hit count from ["esearchresult"]["count"].

    Raises:
    - requests.exceptions.RequestException: If the request times out, the connection
      fails, or the server answers with an HTTP error status.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",              # Database to search (PubMed)
        "term": query,               # Search term/query string
        "retmax": max_results,       # Maximum number of results to retrieve
        "retstart": retstart,        # Starting index for results
        "retmode": "json",           # Output format (JSON in this case)
        "api_key": api_key           # API key for authentication
    }
    # Without an explicit timeout `requests` waits forever on a stalled connection,
    # which would freeze a multi-hour harvesting run.
    response = requests.get(base_url, params=params, timeout=timeout)
    response.raise_for_status()     # Raise an exception if the request fails
    return response.json()          # Return the JSON response

## 2. Fetch Details with Abstracts 

In [12]:
########################################################################################################################################################################################################################################
# 2. Fetch Details with Abstracts ######################################################################################################################################################################################################
########################################################################################################################################################################################################################################

In [14]:
# Three-letter month abbreviations as they appear in PubMed <Month> elements
# (e.g. <Month>Sep</Month>), mapped to two-digit month numbers.
_MONTH_NUMBERS = {
    "jan": "01", "feb": "02", "mar": "03", "apr": "04",
    "may": "05", "jun": "06", "jul": "07", "aug": "08",
    "sep": "09", "oct": "10", "nov": "11", "dec": "12",
}


def _format_pubdate(article: ET.Element) -> str:
    """Return the article's publication date as 'YYYY-MM-DD', or 'N/A' if no year is available."""
    year = (article.findtext(".//PubDate/Year") or "").strip()
    if not year:
        # Records without <Year> may carry a free-text <MedlineDate> whose first
        # four characters are the year; use that rather than emitting "None-01-01".
        medline_date = (article.findtext(".//PubDate/MedlineDate") or "").strip()
        if medline_date[:4].isdigit():
            year = medline_date[:4]
    if not year:
        return "N/A"

    month_text = (article.findtext(".//PubDate/Month") or "").strip()
    if month_text.isdigit():
        month = month_text.zfill(2)
    else:
        # Textual months ("Sep") become numbers; missing/unknown values default to "01".
        month = _MONTH_NUMBERS.get(month_text[:3].lower(), "01")

    day_text = (article.findtext(".//PubDate/Day") or "").strip()
    day = day_text.zfill(2) if day_text.isdigit() else "01"

    return f"{year}-{month}-{day}"


def _parse_pubmed_article(article: ET.Element) -> dict:
    """Convert one <PubmedArticle> element into the metadata dictionary stored by this project."""
    # Abstract sections: itertext() keeps text that sits inside nested inline tags.
    abstract_sections = [
        {
            "label": section.attrib.get("Label", "General"),
            "nlm_category": section.attrib.get("NlmCategory"),
            "text": "".join(section.itertext()).strip(),
        }
        for section in article.findall(".//AbstractText")
    ]

    authors = [
        {
            "name": f"{author.findtext('LastName', '')} {author.findtext('ForeName', '')}".strip(),
            "initials": author.findtext("Initials"),
            "orcid": author.findtext(".//Identifier[@Source='ORCID']"),
            "affiliation": author.findtext(".//AffiliationInfo/Affiliation"),
        }
        for author in article.findall(".//Author")
    ]

    mesh_terms = []
    for heading in article.findall(".//MeshHeading"):
        descriptor = heading.find("DescriptorName")
        mesh_terms.append({
            "descriptor": heading.findtext("DescriptorName"),
            # Guard against a heading without <DescriptorName>: calling .attrib on
            # None would raise AttributeError and abort the whole fetch.
            "major_topic": descriptor is not None
            and descriptor.attrib.get("MajorTopicYN") == "Y",
            "qualifiers": [
                qualifier.text for qualifier in heading.findall("QualifierName")
            ],
        })

    keywords = [keyword.text for keyword in article.findall(".//KeywordList/Keyword")]

    return {
        "uid": article.findtext(".//PMID"),
        "title": article.findtext(".//ArticleTitle") or "N/A",
        "journal": article.findtext(".//Journal/Title") or "N/A",
        "pubdate": _format_pubdate(article),
        "abstract_sections": abstract_sections,
        "authors": authors,
        "mesh_terms": mesh_terms,
        "keywords": keywords,
        "coi_statement": article.findtext(".//CoiStatement") or "N/A",
    }


def fetch_article_details_with_abstracts(
    article_ids: List[str],
    api_key: str,
    batch_size: int = 10,
    timeout: float = 60.0,
) -> List[dict]:
    """
    Fetch detailed metadata, including abstracts, for a list of PubMed articles.

    Parameters:
    - article_ids (List[str]): List of PubMed IDs to retrieve details for.
    - api_key (str): Your PubMed API key for authenticated requests.
    - batch_size (int): Number of IDs to fetch in a single API request.
      Values outside 1..200 are clamped into that range.
    - timeout (float): Seconds to wait for each EFetch response.

    Returns:
    - List[dict]: One dictionary per <PubmedArticle> returned by the API, with keys
      uid, title, journal, pubdate ('YYYY-MM-DD' or 'N/A'), abstract_sections,
      authors, mesh_terms, keywords and coi_statement.

    Notes:
    - A request or XML-parsing failure is printed and that sub-batch is skipped,
      so the returned list can be shorter than `article_ids`. No exception is
      raised for those failures.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    all_articles = []
    # Clamp to 1..200: the upper bound respects the API limit, the lower bound
    # keeps range() from raising on a zero step.
    batch_size = max(1, min(batch_size, 200))

    for i in range(0, len(article_ids), batch_size):
        batch_ids = article_ids[i:i + batch_size]
        params = {
            "db": "pubmed",
            "id": ",".join(batch_ids),
            "retmode": "xml",
            "rettype": "abstract",
            "api_key": api_key,
        }
        try:
            response = requests.get(base_url, params=params, timeout=timeout)
            response.raise_for_status()
            root = ET.fromstring(response.content)
        except requests.exceptions.RequestException as e:
            print(f"HTTP error fetching batch: {e}")
        except ET.ParseError as e:
            print(f"XML parsing error: {e}")
        else:
            all_articles.extend(
                _parse_pubmed_article(article)
                for article in root.findall(".//PubmedArticle")
            )
            print(f"Fetched batch of {len(batch_ids)} articles.")

        # Pause after every request, including failed ones, so an error burst
        # does not turn into a burst of immediate follow-up requests.
        time.sleep(0.1)  # Respect rate limits

    return all_articles

## 3. Save Data

In [17]:
########################################################################################################################################################################################################################################
# 3. Save Data #########################################################################################################################################################################################################################
########################################################################################################################################################################################################################################

In [19]:
def save_data(filename: str, data: object) -> None:
    """
    Save data to a JSON file without risking the previous contents.

    The data is first written to "<filename>.tmp" and then moved over the target
    with os.replace, so a failure during serialisation leaves an existing file
    untouched instead of truncated.

    Parameters:
    - filename (str): The name of the file to save the data to.
    - data: Any JSON-serialisable value (this file passes lists of dictionaries
      and lists of PubMed IDs).

    Notes:
    - Errors are reported with print() and not re-raised (best-effort save).
    """
    temp_filename = f"{filename}.tmp"
    try:
        with open(temp_filename, "w", encoding="utf-8") as file:
            json.dump(data, file, ensure_ascii=False, indent=4)
        # Only swap the new file in once it has been written completely.
        os.replace(temp_filename, filename)
        print(f"Data successfully saved to {filename}.")
    except Exception as e:
        print(f"Error saving data to {filename}: {e}")
        # Remove the partial temp file; it may not exist if open() itself failed.
        try:
            os.remove(temp_filename)
        except OSError:
            pass

## 4. Load Progress

In [22]:
########################################################################################################################################################################################################################################
# 4. Load Progress #####################################################################################################################################################################################################################
########################################################################################################################################################################################################################################

In [24]:
def load_progress(progress_file: str) -> Dict:
    """
    Read the saved harvesting state, or start from a blank state.

    Parameters:
    - progress_file (str): Path of the JSON file holding the progress data.

    Returns:
    - Dict: The parsed file contents when the file exists; otherwise a fresh
      dictionary with all counters set to zero.
    """
    if not os.path.exists(progress_file):
        # Nothing saved yet: hand back zeroed counters.
        return dict(
            current_year=0,
            current_month=0,
            processed_batches=0,
            total_time_seconds=0.0,
            total_batches_processed=0,
        )

    with open(progress_file, "r", encoding="utf-8") as handle:
        saved_state = json.load(handle)
    return saved_state

## 5. Save Progress

In [27]:
########################################################################################################################################################################################################################################
# 5. Save Progress #####################################################################################################################################################################################################################
########################################################################################################################################################################################################################################

In [29]:
def save_progress(progress_file: str, progress: Dict) -> None:
    """
    Save progress to a JSON file.

    The parent directory is created if needed, and the file is written to
    "<progress_file>.tmp" first and then moved into place with os.replace, so an
    interruption mid-write cannot leave a half-written progress file behind.

    Parameters:
    - progress_file (str): The file to save progress to.
    - progress (Dict): A dictionary containing progress data.

    Raises:
    - OSError / TypeError: Propagated if the file cannot be written or the
      progress data is not JSON-serialisable.
    """
    parent_dir = os.path.dirname(progress_file)
    if parent_dir:
        # The progress file may be saved before any results directory exists.
        os.makedirs(parent_dir, exist_ok=True)

    temp_file = f"{progress_file}.tmp"
    try:
        with open(temp_file, "w", encoding="utf-8") as file:
            json.dump(progress, file, ensure_ascii=False, indent=4)
        os.replace(temp_file, progress_file)
    finally:
        # After a successful replace the temp file is gone; after a failure this
        # removes the partial file.
        if os.path.exists(temp_file):
            os.remove(temp_file)
    print(f"Progress successfully saved to {progress_file}.")

## 6. Main Processing Function

In [32]:
########################################################################################################################################################################################################################################
## 6. Main Processing Function #########################################################################################################################################################################################################
########################################################################################################################################################################################################################################

In [34]:
def _resume_index(progress: Dict, year: int, month: int) -> Optional[int]:
    """Return the article index at which (year, month) should start, or None to skip the month."""
    saved_year = progress.get("current_year", 0)
    saved_month = progress.get("current_month", 0)
    saved_index = progress.get("processed_batches", 0)

    if (year, month) < (saved_year, saved_month):
        return None  # Entirely before the last recorded month: already done.
    if (year, month) == (saved_year, saved_month):
        # "processed_batches" is reset to 0 once a month finishes, so a non-zero
        # value means the run stopped part-way through this month.
        return saved_index if saved_index > 0 else None
    return 0  # A month that has not been started yet.


def process_pubmed_by_month(
    api_key: str,
    base_query: str,
    start_year: int,
    end_year: int,
    progress_file: str,
    start_month: int = 1,
    end_month: Optional[int] = 12,
):
    """
    Main function to process PubMed articles by year and month.

    For every month in the requested range this searches PubMed for
    `base_query` restricted to that publication month, fetches the matching
    articles in batches of 200 IDs, appends them to
    data/results/results_<year>_<month>.json and records progress after every
    batch so an interrupted run can be resumed.

    Parameters:
    - api_key (str): PubMed API key passed to the search and fetch helpers.
    - base_query (str): PubMed query; " AND <year>/<month>[PDAT]" is appended.
    - start_year (int), end_year (int): Inclusive range of years to process.
    - progress_file (str): JSON file used to load and save progress.
    - start_month (int): First month processed in `start_year`.
    - end_month (Optional[int]): Last month processed in `end_year`; None means 12.

    Progress semantics:
    - "current_year"/"current_month": last month for which a batch was saved.
    - "processed_batches": number of article IDs already handled in that month
      (0 once the month has finished).
    """
    progress = load_progress(progress_file)
    batch_size = 200  # Number of articles to fetch per batch
    delay_between_requests = 1  # Delay to respect rate limits
    last_month = 12 if end_month is None else end_month

    for year in range(start_year, end_year + 1):
        # Set months range for the year
        month_start = start_month if year == start_year else 1
        month_end = last_month if year == end_year else 12

        for month in range(month_start, month_end + 1):
            # Skip finished months. A partially processed month is resumed at its
            # saved offset, and that offset is never applied to any other month.
            start_index = _resume_index(progress, year, month)
            if start_index is None:
                continue

            print(f"Processing {year}-{month:02d}...")
            query = f"{base_query} AND {year}/{month:02d}[PDAT]"

            # Step 1: Retrieve all article IDs with pagination
            all_article_ids = []
            retstart = 0
            while True:
                try:
                    search_results = search_pubmed(query, api_key, max_results=10000, retstart=retstart)
                    article_ids = search_results.get("esearchresult", {}).get("idlist", [])
                    all_article_ids.extend(article_ids)

                    total_results = int(search_results.get("esearchresult", {}).get("count", 0))
                    if len(all_article_ids) >= total_results or not article_ids:
                        break

                    retstart += 10000
                    time.sleep(delay_between_requests)
                except Exception as e:
                    print(f"Error retrieving articles: {e}")
                    save_progress(progress_file, progress)
                    return

            total_articles = len(all_article_ids)
            print(f"Total articles found for {year}-{month:02d}: {total_articles}")

            # Step 2: Fetch articles in batches
            for i in range(start_index, len(all_article_ids), batch_size):
                batch_ids = all_article_ids[i:i + batch_size]
                start_time = time.time()
                retry_count = 0
                success = False

                # NOTE(review): the fetch helper in this file prints and skips
                # request errors itself, so this retry branch only triggers if an
                # HTTPError actually propagates out of it.
                while retry_count < 5 and not success:
                    try:
                        articles = fetch_article_details_with_abstracts(batch_ids, api_key)

                        # Save batch results (appended to what is already on disk)
                        output_file = f"data/results/results_{year}_{month:02d}.json"
                        os.makedirs(os.path.dirname(output_file), exist_ok=True)
                        if os.path.exists(output_file):
                            with open(output_file, "r", encoding="utf-8") as file:
                                existing_data = json.load(file)
                            articles = existing_data + articles

                        save_data(output_file, articles)

                        # Adjust processed_so_far to avoid over-counting on the last batch
                        processed_so_far = min(i + len(batch_ids), total_articles)
                        percentage_done = (processed_so_far / total_articles) * 100

                        print(
                            f"Processed batch of {len(batch_ids)} articles "
                            f"({percentage_done:.2f}% of {total_articles} articles for {year}-{month:02d}) "
                            f"in {time.time() - start_time:.2f} seconds."
                        )

                        # Update progress
                        progress["current_year"] = year
                        progress["current_month"] = month
                        progress["processed_batches"] = i + len(batch_ids)
                        progress["total_time_seconds"] = progress.get("total_time_seconds", 0.0) + (time.time() - start_time)
                        progress["total_batches_processed"] = progress.get("total_batches_processed", 0) + 1
                        save_progress(progress_file, progress)
                        success = True

                    except requests.exceptions.HTTPError as e:
                        retry_count += 1
                        print(f"Retry {retry_count} for batch {i}: {e}")
                        time.sleep(10)  # Wait before retrying

                if not success:
                    # Keep the IDs of the failed batch so it can be re-fetched later,
                    # then give up on the rest of this month.
                    error_file = f"data/errors/error_batch_{year}_{month:02d}_{i}.json"
                    os.makedirs(os.path.dirname(error_file), exist_ok=True)
                    save_data(error_file, batch_ids)
                    print(f"Failed batch saved to {error_file}")
                    break

            # Reset batch progress after processing the month
            progress["processed_batches"] = 0
            save_progress(progress_file, progress)

    # Print average time per batch at the end
    if progress.get("total_batches_processed", 0) > 0:
        average_time = progress["total_time_seconds"] / progress["total_batches_processed"]
        print(f"Average time per batch: {average_time:.2f} seconds")
    print("Processing complete.")

## USAGE EXAMPLE

In [37]:
########################################################################################################################################################################################################################################
# Usage Example ########################################################################################################################################################################################################################
########################################################################################################################################################################################################################################

In [112]:
if __name__ == "__main__":
    # Read the NCBI API key from the environment so the secret is not stored in
    # the notebook. A key that was ever committed in source should be revoked.
    api_key = os.environ.get("NCBI_API_KEY")
    if not api_key:
        raise RuntimeError("Set the NCBI_API_KEY environment variable to your NCBI API key.")
    base_query = '''(
("disease"[All Fields])
OR ("diseases"[All Fields])
OR ("illness"[All Fields]) 
OR ("illnesses"[All Fields])
OR ("health problem"[All Fields])
OR ("health problems"[All Fields]) 
AND (english[Language]) 
AND (USA[Affiliation] OR US[Affiliation]) 
AND (hasabstract[text]) 
AND (humans[Filter])
)'''
    # Harvest publication years 1994-2000, month by month.
    process_pubmed_by_month(api_key, base_query, start_year=1994, end_year=2000, start_month=1, end_month=12, progress_file="data/progress.json")

Processing 1994-01...
Total articles found for 1994-01: 264
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Data successfully saved to data/results/results_1994_01.json.
Processed batch of 200 articles (75.76% of 264 articles for 1994-01) in 16.07 seconds.
Progress successfully saved to data/progress.json.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 

In [114]:
if __name__ == "__main__":
    # Read the NCBI API key from the environment so the secret is not stored in
    # the notebook. A key that was ever committed in source should be revoked.
    api_key = os.environ.get("NCBI_API_KEY")
    if not api_key:
        raise RuntimeError("Set the NCBI_API_KEY environment variable to your NCBI API key.")
    base_query = '''(
("disease"[All Fields])
OR ("diseases"[All Fields])
OR ("illness"[All Fields]) 
OR ("illnesses"[All Fields])
OR ("health problem"[All Fields])
OR ("health problems"[All Fields]) 
AND (english[Language]) 
AND (USA[Affiliation] OR US[Affiliation]) 
AND (hasabstract[text]) 
AND (humans[Filter])
)'''
    # Harvest publication years 2001-2012, month by month.
    process_pubmed_by_month(api_key, base_query, start_year=2001, end_year=2012, start_month=1, end_month=12, progress_file="data/progress.json")

Processing 2001-01...
Total articles found for 2001-01: 4127
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Data successfully saved to data/results/results_2001_01.json.
Processed batch of 200 articles (4.85% of 4127 articles for 2001-01) in 16.52 seconds.
Progress successfully saved to data/progress.json.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10

In [119]:
if __name__ == "__main__":
    # Read the NCBI API key from the environment so the secret is not stored in
    # the notebook. A key that was ever committed in source should be revoked.
    api_key = os.environ.get("NCBI_API_KEY")
    if not api_key:
        raise RuntimeError("Set the NCBI_API_KEY environment variable to your NCBI API key.")
    base_query = '''(
("disease"[All Fields])
OR ("diseases"[All Fields])
OR ("illness"[All Fields]) 
OR ("illnesses"[All Fields])
OR ("health problem"[All Fields])
OR ("health problems"[All Fields]) 
AND (english[Language]) 
AND (USA[Affiliation] OR US[Affiliation]) 
AND (hasabstract[text]) 
AND (humans[Filter])
)'''
    # Harvest publication years 2014-2018, month by month.
    process_pubmed_by_month(api_key, base_query, start_year=2014, end_year=2018, start_month=1, end_month=12, progress_file="data/progress.json")

Processing 2014-01...
Total articles found for 2014-01: 6299
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Data successfully saved to data/results/results_2014_01.json.
Processed batch of 200 articles (3.18% of 6299 articles for 2014-01) in 19.97 seconds.
Progress successfully saved to data/progress.json.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10

In [None]:
if __name__ == "__main__":
    # Read the NCBI API key from the environment so the secret is not stored in
    # the notebook. A key that was ever committed in source should be revoked.
    api_key = os.environ.get("NCBI_API_KEY")
    if not api_key:
        raise RuntimeError("Set the NCBI_API_KEY environment variable to your NCBI API key.")
    base_query = '''(
("disease"[All Fields])
OR ("diseases"[All Fields])
OR ("illness"[All Fields]) 
OR ("illnesses"[All Fields])
OR ("health problem"[All Fields])
OR ("health problems"[All Fields]) 
AND (english[Language]) 
AND (USA[Affiliation] OR US[Affiliation]) 
AND (hasabstract[text]) 
AND (humans[Filter])
)'''
    # Harvest publication years 2019-2024, month by month.
    process_pubmed_by_month(api_key, base_query, start_year=2019, end_year=2024, start_month=1, end_month=12, progress_file="data/progress.json")

Processing 2019-01...
Total articles found for 2019-01: 7634
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Data successfully saved to data/results/results_2019_01.json.
Processed batch of 200 articles (2.62% of 7634 articles for 2019-01) in 20.82 seconds.
Progress successfully saved to data/progress.json.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10 articles.
Fetched batch of 10

# HELP FUNCTIONS AND CHECKING

### XML EXAMPLES

In [73]:
def fetch_article_xml(api_key: str, article_id: str, timeout: float = 30.0):
    """
    Fetch and display the raw XML record for a single PubMed article.

    Intended as a debugging aid for inspecting which elements a record contains.

    Parameters:
    - api_key (str): Your PubMed API key.
    - article_id (str): The PubMed ID of the article to fetch.
    - timeout (float): Seconds to wait for the server before giving up.

    Raises:
    - requests.exceptions.RequestException: If the request times out, the connection
      fails, or the server answers with an HTTP error status.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "id": article_id,
        "retmode": "xml",
        "rettype": "abstract",
        "api_key": api_key,
    }
    # An explicit timeout keeps a stalled connection from hanging the notebook cell.
    response = requests.get(base_url, params=params, timeout=timeout)
    response.raise_for_status()
    xml_data = response.text
    print(xml_data)  # Display the XML content

# Example Usage
if __name__ == "__main__":
    # Read the NCBI API key from the environment so the secret is not stored in
    # the notebook. A key that was ever committed in source should be revoked.
    api_key = os.environ.get("NCBI_API_KEY")
    if not api_key:
        raise RuntimeError("Set the NCBI_API_KEY environment variable to your NCBI API key.")
    fetch_article_xml(api_key, "12921495")  # Replace with a real PubMed ID #38046870 39666377 does not work #39666263 did work 14564135


<?xml version="1.0" ?>
<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2024//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_240101.dtd">
<PubmedArticleSet>
<PubmedArticle><MedlineCitation Status="MEDLINE" Owner="NLM"><PMID Version="1">12921495</PMID><DateCompleted><Year>2003</Year><Month>11</Month><Day>06</Day></DateCompleted><DateRevised><Year>2015</Year><Month>11</Month><Day>19</Day></DateRevised><Article PubModel="Print"><Journal><ISSN IssnType="Print">1060-0280</ISSN><JournalIssue CitedMedium="Print"><Volume>37</Volume><Issue>9</Issue><PubDate><Year>2003</Year><Month>Sep</Month></PubDate></JournalIssue><Title>The Annals of pharmacotherapy</Title><ISOAbbreviation>Ann Pharmacother</ISOAbbreviation></Journal><ArticleTitle>Baclofen treatment for chronic posttraumatic stress disorder.</ArticleTitle><Pagination><StartPage>1177</StartPage><EndPage>1181</EndPage><MedlinePgn>1177-81</MedlinePgn></Pagination><Abstract><AbstractText Label="OBJECTIVE" NlmCategory