In [None]:
import time
import json
from datetime import datetime, timedelta
from Bio import Entrez

Entrez.email = "<email_adress_removed>"  # Required by NCBI

In [20]:
def month_year_iter(start_year, start_month, end_year, end_month):
    """
    Generate (year, month) pairs from (start_year, start_month) up to (end_year, end_month).
    """
    ym_start = 12 * start_year + start_month - 1
    ym_end = 12 * end_year + end_month - 1
    for ym in range(ym_start, ym_end + 1):
        y, m = divmod(ym, 12)
        yield y, m + 1

def esearch_month(year, month, extra_query):
    """
    Perform an ESearch for all articles published in a given month of a given year,
    also matching extra_query. Returns the list of PMIDs (as strings).
    """
    # Build the range for this specific month: [YYYY/MM/01 - YYYY/MM/LastDay]
    start_date = datetime(year, month, 1)
    if month == 12:
        end_date = datetime(year + 1, 1, 1) - timedelta(days=1)
    else:
        end_date = datetime(year, month + 1, 1) - timedelta(days=1)

    # Create the date filter part
    date_part = f'("{start_date.strftime("%Y/%m/%d")}"[PDAT] : "{end_date.strftime("%Y/%m/%d")}"[PDAT])'
    
    # Combine the date filter with the extra search query, e.g. intelligence[TIAB]
    # NOTE: you could also do intelligence[All Fields], or use parentheses for more complex Boolean logic.
    query = f"({date_part}) AND ({extra_query})"

    print(f"Searching PubMed for {query}...")

    # ESearch
    search_handle = Entrez.esearch(
        db="pubmed",
        term=query,
        retmax=9999,      # ESearch can only fetch up to 9999
        usehistory="y"    # Use history for chunked retrieval
    )
    search_results = Entrez.read(search_handle)
    search_handle.close()

    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]
    count = int(search_results["Count"])
    print(f" Found {count} articles for {year}-{month:02d} matching query: {extra_query}")

    # Retrieve PMIDs in increments (up to 9999)
    pmid_list = []
    batch_size = 1000
    for start in range(0, min(count, 9999), batch_size):
        fetch_handle = Entrez.esearch(
            db="pubmed",
            webenv=webenv,
            query_key=query_key,
            term=query,
            retstart=start,
            retmax=batch_size
        )
        data = Entrez.read(fetch_handle)
        fetch_handle.close()
        pmid_list.extend(data["IdList"])
        time.sleep(0.3)  # courtesy delay

    return pmid_list

def fetch_pubmed_metadata(pmid_list):
    """
    Given a list of PMIDs, use EFetch to retrieve PubMed metadata in smaller batches.
    Returns a list of dicts with {pmid, title, abstract, etc.}
    """
    results = []
    batch_size = 500
    for i in range(0, len(pmid_list), batch_size):
        batch_pmids = pmid_list[i : i+batch_size]
        id_string = ",".join(batch_pmids)

        handle = Entrez.efetch(db="pubmed", id=id_string, retmode="xml")
        records = Entrez.read(handle)
        handle.close()

        for record in records.get("PubmedArticle", []):
            pmid = record["MedlineCitation"]["PMID"]
            article = record["MedlineCitation"].get("Article", {})
            title = article.get("ArticleTitle", "")
            abstract_text = ""
            if "Abstract" in article and "AbstractText" in article["Abstract"]:
                abs_parts = article["Abstract"]["AbstractText"]
                if isinstance(abs_parts, list):
                    abstract_text = " ".join(str(part) for part in abs_parts)
                else:
                    abstract_text = str(abs_parts)
            
            results.append({
                "pmid": pmid,
                "title": title,
                "abstract": abstract_text
            })

        time.sleep(0.3)  # courtesy delay
    return results


def fetch_pmc_id(pmid):
    """
    Check if a PubMed article is in PMC (returns 'PMC########' if yes, else None).
    """
    try:
        link_handle = Entrez.elink(dbfrom="pubmed", id=pmid, linkname="pubmed_pmc")
        link_result = Entrez.read(link_handle)
        link_handle.close()
    except Exception as e:
        print(f"Error retrieving PMC link for PMID {pmid}: {e}")
        return None

    pmcid = None
    for linkset in link_result:
        if "LinkSetDb" in linkset:
            for linksetdb in linkset["LinkSetDb"]:
                if linksetdb["LinkName"] == "pubmed_pmc":
                    for link in linksetdb["Link"]:
                        pmcid_num = link["Id"]
                        pmcid = f"PMC{pmcid_num}"
                        break
    return pmcid

def fetch_pmc_full_text(pmcid):
    """
    Fetch full text from PMC (if open access).
    Returns a string with the text, or None if unavailable.
    """
    try:
        handle = Entrez.efetch(db="pmc", id=pmcid, retmode="xml", rettype="full")
        xml_data = handle.read().decode("utf-8")
        handle.close()
    except Exception as e:
        print(f"Error fetching from PMC ID {pmcid}: {e}")
        return None

    # A naive example that extracts lines between <body>...</body>
    in_body_section = False
    full_text_lines = []
    for line in xml_data.splitlines():
        if "<body" in line:
            in_body_section = True
        elif "</body>" in line:
            in_body_section = False
            full_text_lines.append(line)
        if in_body_section:
            full_text_lines.append(line)

    return "\n".join(full_text_lines) if full_text_lines else None


In [21]:
extra_query = "intelligence[TIAB]"  # or "intelligence[All Fields]", etc.

all_records = []

In [22]:
for (year, month) in month_year_iter(2010, 1, 2024, 12):
    pmids_for_month = esearch_month(year, month, extra_query=extra_query)
    print(f"Found {pmids_for_month.count} articles in PubMed for query {extra_query} in {year}-{month:02d}")
    if not pmids_for_month:
        continue
    all_records=[]

    for (year, month) in month_year_iter(2010, 1, 2024, 12):
        pmids_for_month = esearch_month(year, month, extra_query=extra_query)
        if not pmids_for_month:
            continue
        
        monthly_records = fetch_pubmed_metadata(pmids_for_month)

        # If you want, do the PMC / full-text retrieval here ...
        for rec in monthly_records:
            rec["pmc_id"] = fetch_pmc_id(rec["pmid"])
            if rec["pmc_id"]:
                rec["full_text"] = fetch_pmc_full_text(rec["pmc_id"])
            else:
                rec["full_text"] = None
            time.sleep(0.3)

        all_records.extend(monthly_records)
        print(f" > Accumulated total records so far: {len(all_records)}")

        # Save partial results every month
        with open("pubmed_2010_2024_intelligence.json", "a", encoding="utf-8") as f:
            for r in monthly_records:
                f.write(json.dumps(r, ensure_ascii=False) + "\n")

    print("All done!")
    print(f"Total articles in final data: {len(all_records)}")


Searching PubMed for (("2010/01/01"[PDAT] : "2010/01/31"[PDAT])) AND (intelligence[TIAB])...
 Found 222 articles for 2010-01 matching query: intelligence[TIAB]
Found <built-in method count of list object at 0x10dc171c0> articles in PubMed for query intelligence[TIAB] in 2010-01
Searching PubMed for (("2010/01/01"[PDAT] : "2010/01/31"[PDAT])) AND (intelligence[TIAB])...
 Found 222 articles for 2010-01 matching query: intelligence[TIAB]
 > Accumulated total records so far: 222
Searching PubMed for (("2010/02/01"[PDAT] : "2010/02/28"[PDAT])) AND (intelligence[TIAB])...
 Found 95 articles for 2010-02 matching query: intelligence[TIAB]
 > Accumulated total records so far: 317
Searching PubMed for (("2010/03/01"[PDAT] : "2010/03/31"[PDAT])) AND (intelligence[TIAB])...
 Found 141 articles for 2010-03 matching query: intelligence[TIAB]
 > Accumulated total records so far: 458
Searching PubMed for (("2010/04/01"[PDAT] : "2010/04/30"[PDAT])) AND (intelligence[TIAB])...
 Found 101 articles for 20

KeyboardInterrupt: 