In [15]:
import requests


def fetch_biorxiv_papers(
    start_date, end_date, server="biorxiv", cursor=0, category=None
):
    base_url = (
        f"https://api.biorxiv.org/details/{server}/{start_date}/{end_date}/{cursor}"
    )

    if category:
        base_url += f"?category={category.replace(' ', '_')}"

    print(f"Fetching: {base_url}")
    response = requests.get(base_url)
    response.raise_for_status()

    data = response.json()

    print(
        f"\nFound {len(data['collection'])} papers (total available: {data['messages'][0]['total']})\n"
    )

    for paper in data["collection"]:
        print("--------------")
        print(f"Title: {paper['title']}")
        print(f"DOI: {paper['doi']}")
        print(f"Authors: {paper['authors']}")
        print(f"Date: {paper['date']}")
        print(f"Version: {paper['version']}")
        print(f"Category: {paper['category']}")
        print(f"Abstract: {paper['abstract'][:200]}...\n")


In [None]:
fetch_biorxiv_papers("2024-01-01", "2024-01-07", category="cell biology")


Fetching: https://api.biorxiv.org/details/biorxiv/2024-01-01/2024-01-07/0?category=cell_biology

Found 59 papers (total available: 59)

--------------
Title: Small molecule modulation of a redox-sensitive stress granule protein dissolves stress granules with beneficial outcomes for familial amyotrophic lateral sclerosis models
DOI: 10.1101/721001
Authors: Uechi, H.; Sridharan, S.; Nijssen, J.; Bilstein, J.; Iglesias-Artola, J. M.; Kishigami, S.; Casablancas-Antras, V.; Poser, I.; Martinez, E. J.; Boczek, E.; Wagner, M.; Tomschke, N.; de Jesus Domingues, A. M.; Pal, A.; Doeleman, T.; Kour, S.; Anderson, E. N.; Stein, F.; Lee, H. O.; Zhang, X.; Fritsch, A. W.; Jahnel, M.; Fursch, J.; Murthy, A. C.; Alberti, S.; Bickle, M.; Fawzi, N. L.; Nadler, A.; David, D. C.; Pandey, U. B.; Hermann, A.; Stengel, F.; Davis, B. G.; Baldwin, A. J.; Savitski, M. M.; Hyman, A. A.; Wheeler, R. J.
Date: 2024-01-05
Version: 3
Category: cell biology
Abstract: Neurodegenerave diseases such as amyotrophic latera

##### An agent version of the tool can be as follow

Then with the html link the full paper can be scraped and have it as context for summarization and question/answering

In [15]:
from typing import Dict, List, Optional


def fetch_biorxiv_papers(
    start_date: str,
    end_date: str,
    category: Optional[str] = None,
    server: str = "biorxiv",
    cursor: int = 0,
    limit: Optional[int] = None,
) -> List[Dict]:
    """
    Fetch preprint metadata from bioRxiv or medRxiv within a date range and optional category.

    Args:
        start_date (str): Start date (YYYY-MM-DD).
        end_date (str): End date (YYYY-MM-DD).
        category (Optional[str]): Subject category like 'cell biology'.
        server (str): 'biorxiv' or 'medrxiv'.
        cursor (int): Start offset (in multiples of 100).
        limit (Optional[int]): Max number of papers to return.

    Returns:
        List[Dict]: List of papers (limited if requested).
    """
    base_url = (
        f"https://api.biorxiv.org/details/{server}/{start_date}/{end_date}/{cursor}"
    )

    if category:
        base_url += f"?category={category.replace(' ', '_')}"
    
    print(base_url)

    response = requests.get(base_url)
    response.raise_for_status()
    data = response.json()

    papers = [
        {
            "title": paper["title"],
            "doi": paper["doi"],
            "authors": paper["authors"],
            "date": paper["date"],
            "category": paper["category"],
            "abstract": paper["abstract"],
            "html_url": f"https://www.{server}.org/content/{paper['doi']}v{paper['version']}",
        }
        for paper in data["collection"]
    ]

    return papers[:limit] if limit else papers


In [16]:
papers_results = fetch_biorxiv_papers(
    "2025-05-01",
    "2025-05-07",
    "pharmacology%20and%20therapeutics",
    server="medrxiv"
    )


https://api.biorxiv.org/details/medrxiv/2025-05-01/2025-05-07/0?category=pharmacology%20and%20therapeutics


In [17]:
for p in papers_results:
    print(p["title"])
    print(p["authors"])
    print(p["date"])
    print(p["html_url"])
    print()


Generalizable AI predicts immunotherapy outcomes across cancers and treatments
SHEN, W.; Nguyen, T. H.; Li, M. M. R.; Huang, Y.; Moon, I.; Nair, N.; Marbach, D.; Zitnik, M.
2025-05-05
https://www.medrxiv.org/content/10.1101/2025.05.01.25326820v1

Effects of Cannabidiol on Social Relating, Anxiety, and Parental Stress in Autistic Children: A Randomised Controlled Crossover Trial
Parrella, N.-F.; Hill, A. T.; Enticott, P. G.; Botha, T.; Catchlove, S.; Downey, L.; Ford, T. C.
2025-05-07
https://www.medrxiv.org/content/10.1101/2024.06.19.24309024v3

Global Catastrophic Risks and Capacity to Manufacture Key Pharmaceuticals: Case Study for a Highly Trade-Dependent Nation
Wilson, N.; Wood, P.; Boyd, M.
2025-05-07
https://www.medrxiv.org/content/10.1101/2025.05.06.25327132v1

Targeted adaptive sampling enables clinical pharmacogenomics testing and genome-wide genotyping
Gan Hui Peng, P.; Han Lin, Y.; Irfan Bin Hajis, M.; Maulana, Y.; Ng Qi Hui, A.; Nathanael Ramanto, K.; Nisita Dewanggana, M.;

In [None]:
len(papers_results)


4

## Semantic Scholar

In [7]:
import requests
from typing import List, Dict, Optional
from datetime import datetime

def fetch_semanticscholar_papers(
    start_date: str,
    end_date: str,
    query: Optional[str] = None,
    cursor: int = 0,
    limit: Optional[int] = None,
) -> List[Dict]:
    """
    Fetch open-access paper metadata from Semantic Scholar within a date range and optional keyword query.

    Args:
        start_date (str): Start date (YYYY-MM-DD).
        end_date (str): End date (YYYY-MM-DD).
        query (Optional[str]): Keyword search query.
        cursor (int): Start offset.
        limit (Optional[int]): Max number of papers to return.

    Returns:
        List[Dict]: List of papers (limited if requested).
    """
    base_url = "https://api.semanticscholar.org/graph/v1/paper/search"

    start_year = datetime.strptime(start_date, "%Y-%m-%d").year
    end_year = datetime.strptime(end_date, "%Y-%m-%d").year

    params = {
        "query": query or "",
        "limit": 100,
        "offset": cursor,
        "fields": "title,authors,year,abstract,url,openAccessPdf,externalIds"
    }

    collected = []

    while True:
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        data = response.json()

        for paper in data.get("data", []):
            if (
                paper.get("year") is not None and
                start_year <= paper["year"] <= end_year and
                paper.get("openAccessPdf") and
                paper["openAccessPdf"].get("url")
            ):
                doi = paper.get("externalIds", {}).get("DOI")
                collected.append({
                    "title": paper["title"],
                    "authors": ", ".join([a["name"] for a in paper.get("authors", [])]),
                    "date": str(paper["year"]),
                    "abstract": paper.get("abstract", "No abstract available."),
                    "html_url": paper["url"],
                    "pdf_url": paper["openAccessPdf"]["url"],
                    "doi": doi,
                    "doi_url": f"https://doi.org/{doi}" if doi else None,
                })

                if limit and len(collected) >= limit:
                    return collected[:limit]

        if not data.get("data") or len(data["data"]) < params["limit"]:
            break

        params["offset"] += params["limit"]

    return collected[:limit] if limit else collected



In [8]:
papers = fetch_semanticscholar_papers(
    start_date="2022-01-01",
    end_date="2022-01-09",
    query="transformer language models",
    limit=5
)

for p in papers:
    print(p["title"])
    print(p["authors"])
    print(p["date"])
    print(p["html_url"])
    print(p["doi_url"])


Transformer Language Models without Positional Encodings Still Learn Positional Information
Adi Haviv, Ori Ram, Ofir Press, Peter Izsak, Omer Levy
2022
https://www.semanticscholar.org/paper/a2fc77f075f666b462d9350e7576f0ba9845c61b
https://doi.org/10.48550/arXiv.2203.16634
Outlier Suppression: Pushing the Limit of Low-bit Transformer Language Models
Xiuying Wei, Yunchen Zhang, Xiangguo Zhang, Ruihao Gong, Shanghang Zhang, Qi Zhang, F. Yu, Xianglong Liu
2022
https://www.semanticscholar.org/paper/3f6243097a58e386aea1215fed4f372dee07a100
https://doi.org/10.48550/arXiv.2209.13325
Transformer Grammars: Augmenting Transformer Language Models with Syntactic Inductive Biases at Scale
Laurent Sartran, Samuel Barrett, A. Kuncoro, Milovs Stanojevi'c, Phil Blunsom, Chris Dyer
2022
https://www.semanticscholar.org/paper/024dbd9cf7f9c605cc3a99b35b578ce24993d32c
https://doi.org/10.1162/tacl_a_00526
Analyzing Encoded Concepts in Transformer Language Models
Hassan Sajjad, Nadir Durrani, Fahim Dalvi, Firo