In [1]:
import pandas as pd
from Bio import Entrez
import json
from datetime import datetime, timedelta

In [None]:
# Set your email address for the Entrez API.
# NOTE(review): the value assigned below is a redacted placeholder, not a real
# address; replace it with your own registered address before running the
# cells that call Entrez.
Entrez.email = "<email_address_removed>" #use your registered email address

def fetch_details(id_list):
    """
    Fetch details for a list of article IDs from the 'pmc' database.

    Parameters:
        id_list (list): Article IDs to request. Each item is converted with
            str(), so integer IDs are accepted as well as strings.

    Returns:
        The structure produced by Entrez.read for the efetch XML response,
        or an empty list when id_list is empty (no request is sent then).
    """
    # An empty ID string is not a meaningful efetch request; skip the call.
    if not id_list:
        return []

    ids = ','.join(str(article_id) for article_id in id_list)
    handle = Entrez.efetch(db='pmc', id=ids, retmode='xml')
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying connection even if parsing fails.
        handle.close()

def generate_date_intervals(start_date, end_date, days):
    """
    Generate the boundary dates of consecutive intervals between two dates.

    Parameters:
        start_date (str): Start date in the format 'YYYY/MM/DD'.
        end_date (str): End date in the format 'YYYY/MM/DD'.
        days (int): Number of days for each interval; must be positive.

    Returns:
        list: Date strings ('YYYY/MM/DD'): start_date, then one date every
        `days` days while strictly before end_date, and finally end_date.
        Pairing each element with its successor yields the intervals. When
        start_date is not before end_date the list contains only end_date.

    Raises:
        ValueError: If `days` is not positive, or a date string does not
            match the expected format (raised by datetime.strptime).
    """
    # A zero or negative step would never reach end_date and loop forever.
    if days <= 0:
        raise ValueError("days must be a positive number of days")

    date_format = "%Y/%m/%d"
    start_datetime = datetime.strptime(start_date, date_format)
    end_datetime = datetime.strptime(end_date, date_format)
    step = timedelta(days=days)

    date_intervals = []
    current_datetime = start_datetime
    while current_datetime < end_datetime:
        date_intervals.append(current_datetime.strftime(date_format))
        current_datetime += step

    # Always close the last interval exactly on end_date.
    date_intervals.append(end_datetime.strftime(date_format))
    return date_intervals

In [3]:

def extract_body_text(body_section):
    """
    Collect the text found under a parsed 'body' / 'sec' node into one string.

    Handles three input shapes:
      * dict  - reads its 'p' entry (paragraph or list of paragraphs) and
                recurses into its 'sec' entry (sub-section or list of them);
      * list  - recurses into every item;
      * str   - used as-is.
    Any other type contributes nothing. Within a list of paragraphs, string
    items are kept verbatim, dict items are rendered with str(), and items of
    other types are skipped. A non-list 'p' value is rendered with str().

    Returns:
        str: The non-empty fragments, stripped and joined with single spaces.
    """
    fragments = []

    if isinstance(body_section, dict):
        if 'p' in body_section:
            para = body_section['p']
            if not isinstance(para, list):
                # A lone paragraph: take its string form whatever its type.
                fragments.append(str(para))
            else:
                fragments.extend(
                    item if isinstance(item, str) else str(item)
                    for item in para
                    if isinstance(item, (str, dict))
                )

        if 'sec' in body_section:
            # The recursive call copes with both a single section and a list
            # of sections, so no type check is needed at this level.
            fragments.append(extract_body_text(body_section['sec']))
    elif isinstance(body_section, list):
        fragments.extend(extract_body_text(child) for child in body_section)
    elif isinstance(body_section, str):
        fragments.append(body_section)

    # Drop fragments that are empty once surrounding whitespace is removed.
    cleaned = (fragment.strip() for fragment in fragments)
    return " ".join(piece for piece in cleaned if piece)

In [4]:
# Search configuration: query term and the date window to scan
search_query = "intelligence"
start_date, end_date = "2014/01/01", "2024/12/31"
days_interval = 200  # width of each date slice, in days
chunk_size = 500  # number of IDs passed to each fetch_details call

# Accumulator for the per-article dictionaries built by the download loop
data_list = list()

# Boundary dates of the consecutive slices covering start_date..end_date
intervals = generate_date_intervals(
    start_date=start_date, end_date=end_date, days=days_interval
)

In [None]:
# Loop through date intervals and build one metadata dictionary per article.
#
# NOTE(review): the 'MedlineCitation' / 'PubmedData' paths used below follow the
# PubMed record layout, while the records are fetched from db='pmc'. When a
# record does not contain them the corresponding fields are simply left empty.
# Likewise the 'OAI-PMH' envelope may not match what efetch returns. Confirm
# both against a real parsed record and adjust the paths if needed.


def _dig(node, *path, default=None):
    """Follow `path` (keys or indexes) through nested containers.

    Returns the value reached, or `default` as soon as one step is missing
    or the current node cannot be indexed.
    """
    for step in path:
        try:
            node = node[step]
        except (KeyError, IndexError, TypeError):
            return default
    return node


# Consecutive intervals share a boundary date (the end of one is the start of
# the next), so a single article can be returned by two searches. Track the
# IDs already requested so each article is stored once.
seen_ids = set()

for interval_start, interval_end in zip(intervals[:-1], intervals[1:]):
    # Use esearch to get the list of IDs matching the query for this interval.
    # NOTE(review): NCBI documents a cap on the number of IDs returned per
    # esearch response; if an interval can match more, page with retstart.
    search_handle = Entrez.esearch(db='pmc', term=search_query, mindate=interval_start, maxdate=interval_end, retmax=20000)
    try:
        search_results = Entrez.read(search_handle)
    finally:
        search_handle.close()

    # Keep only the IDs not seen in an earlier interval, preserving order.
    studies_id_list = []
    for study_id in search_results['IdList']:
        id_key = str(study_id)
        if id_key not in seen_ids:
            seen_ids.add(id_key)
            studies_id_list.append(study_id)

    # Fetch details in chunks
    for chunk_i in range(0, len(studies_id_list), chunk_size):
        chunk = studies_id_list[chunk_i:chunk_i + chunk_size]
        papers = fetch_details(chunk)
        for record in papers:
            try:
                metadata = record['OAI-PMH']['GetRecord']['record']['metadata']
                article = metadata['article']  # root of the article
                front = article['front']  # front matter
                body = article.get('body', {})  # main text sections
            except (KeyError, TypeError, AttributeError):
                # Record does not have the expected envelope; skip it.
                continue

            data = {
                "title": {
                    "full_text": "",
                    "tokens": []
                },
                "authors": [],
                "affiliations": [],
                "identifiers": {},
                "journal": "",
                "language": "",
                "abstract": {
                    "full_text": "",
                    "tokens": []
                },
                "year": "",
                "month": "",
                "keywords": [],
                "full_text": "",
                "references": []
            }

            # Title comes from the article front matter.
            title = _dig(front, 'article-meta', 'title-group', 'article-title', default="")
            data["title"]["full_text"] = title if isinstance(title, str) else str(title)
            data["title"]["tokens"] = data["title"]["full_text"].split()

            # The original code read these fields from an undefined name
            # (`paper`); they are now read from the current `record`.
            medline_article = _dig(record, 'MedlineCitation', 'Article', default={})
            pubmed_data = _dig(record, 'PubmedData', default={})
            author_list = _dig(medline_article, 'AuthorList', default=[])
            pub_date = _dig(medline_article, 'Journal', 'JournalIssue', 'PubDate', default={})

            for author in author_list:
                last_name = _dig(author, 'LastName', default='')
                fore_name = _dig(author, 'ForeName', default='')
                data["authors"].append(f"{last_name}, {fore_name}")

                for affiliation_info in _dig(author, 'AffiliationInfo', default=[]):
                    data["affiliations"].append(_dig(affiliation_info, 'Affiliation', default=''))

            abstract_text = _dig(medline_article, 'Abstract', 'AbstractText', 0, default="")
            data["abstract"]["full_text"] = str(abstract_text).lower()
            data["abstract"]["tokens"] = data["abstract"]["full_text"].split()

            data["journal"] = str(_dig(medline_article, 'Journal', 'Title', default="")).lower()
            data["language"] = str(_dig(medline_article, 'Language', 0, default="")).lower()
            data["year"] = str(_dig(pub_date, 'Year', default="")).lower()
            data["month"] = str(_dig(pub_date, 'Month', default="")).lower()

            data["keywords"] = list(_dig(record, 'MedlineCitation', 'KeywordList', default=[]))

            for reference in _dig(pubmed_data, 'ReferenceList', default=[]):
                for citation in _dig(reference, 'Reference', default=[]):
                    citation_text = _dig(citation, 'Citation')
                    if citation_text is not None:
                        data["references"].append(citation_text)

            for identifier in _dig(pubmed_data, 'ArticleIdList', default=[]):
                id_type = _dig(getattr(identifier, 'attributes', {}), 'IdType')
                if id_type is not None:
                    data["identifiers"][id_type] = str(identifier)

            # `body` was extracted above but never used; fill the full text
            # from it with the notebook's own extraction helper.
            data["full_text"] = extract_body_text(body)

            data_list.append(data)

In [8]:
# Persist the collected article records as pretty-printed JSON
with open('papersNew.json', 'w') as output_handle:
    output_handle.write(json.dumps(data_list, indent=2))