In [10]:
%load_ext dotenv
%dotenv ../.env


The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


## Question 1: How much data

Search 1: A simple, broad search of literature regarding hydrophobicity in plant leaves.

`Hydrophobicity AND plant leaves`



In [11]:
import os
import requests
import pandas as pd
import random
import pathlib
import re
import time
from PyPDF2 import PdfReader
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("API_KEY")


query_string = 'Hydrophobi*+"plant+leaves"'
encoded_query = query_string  # → Hydrophobicity+AND+%22plant+leaves%22

RESOURCE_URL = "https://api.elsevier.com/"
SEARCH_URL = f"{RESOURCE_URL}/content/search/scopus"
ARTICLE_URL = f"{RESOURCE_URL}/content/article"

headers = {
    "X-ELS-APIKey": api_key,
    "Accept": "application/json",
}
params_counting_only = {
    "query": encoded_query,
    "count": 0,
}

# Hit count
r_count = requests.get(SEARCH_URL, headers=headers, params=params_counting_only)
r_count.raise_for_status()

total_hits = int(r_count.json()["search-results"]["opensearch:totalResults"])

print(f"Total hits for {encoded_query}: {total_hits}")

LIMIT = 25

params = {
    "query": encoded_query,
    "count": LIMIT,
}

resp = requests.get(SEARCH_URL, headers=headers, params=params)

if resp.status_code != 200:
    raise RuntimeError(f"Scopus API error {resp.status_code}: {resp.text}")

entries = resp.json().get("search-results", {}).get("entry", [])
records = [
    {
        "Title": e.get("dc:title"),
        "Authors": e.get("dc:creator"),
        "Year": e.get("prism:coverDate", "").split("-")[0],
        "URL": e.get("prism:url"),
    }
    for e in entries
]
df = pd.DataFrame(records)
print(f"Returned {len(df)} documents")
display(df)  # in a notebook this renders a nice HTML table
# df.to_csv("scopus_results.csv", index=False)   # optional persist

Total hits for Hydrophobi*+"plant+leaves": 6219
Returned 25 documents


Unnamed: 0,Title,Authors,Year,URL
0,The suberin transporter StABCG1 is required fo...,Benatto Perino E.H.,2025,https://api.elsevier.com/content/abstract/scop...
1,Combining plant extracts and hot water treatme...,Worku S.,2025,https://api.elsevier.com/content/abstract/scop...
2,Sustainable pest management using plant second...,Zhang X.,2025,https://api.elsevier.com/content/abstract/scop...
3,Plant Cell Wall-Like Soft Materials: Micro- an...,Koshani R.,2025,https://api.elsevier.com/content/abstract/scop...
4,Anti-proliferative and photodynamic activities...,Olofinsan K.A.,2025,https://api.elsevier.com/content/abstract/scop...
5,Optimization of betulinic and ursolic acids an...,Demiray H.,2025,https://api.elsevier.com/content/abstract/scop...
6,De novo assembly of the mitochondrial genome o...,Zhou G.,2025,https://api.elsevier.com/content/abstract/scop...
7,Discovery of potent anti-toxoplasmosis drugs f...,Mohammed M.M.D.,2025,https://api.elsevier.com/content/abstract/scop...
8,Withania coagulans-mediated green synthesis of...,Khan A.,2025,https://api.elsevier.com/content/abstract/scop...
9,The nano-paradox: addressing nanotoxicity for ...,Rajpal V.R.,2025,https://api.elsevier.com/content/abstract/scop...


Here we have a good first look at some very coarse-grained results. Let's go in and use random offsets to retrieve 25 random documents.

In [12]:
from requests.adapters import HTTPAdapter, Retry


# ------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------
load_dotenv()
API_KEY = os.getenv("API_KEY")  # ← your Elsevier key
EMAIL = os.getenv("EMAIL")  # ← required by Unpaywall
DL_DIR = pathlib.Path("../data/downloads")
DL_DIR.mkdir(exist_ok=True)

TARGET = 25  # how many full-text docs we want
SEARCH_PAGE_SZ = 25  # pull 25 hits per random page
MAX_OFFSET = 5000 - SEARCH_PAGE_SZ  # Scopus hard cap with start=

hdrs_core = {"X-ELS-APIKey": API_KEY}
hdrs_xml = {**hdrs_core, "Accept": "text/xml"}
hdrs_pdf = {**hdrs_core, "Accept": "application/pdf"}


def log(msg):
    print(time.strftime("%H:%M:%S"), msg)


# ------------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------------
def extract_data(entry: dict) -> dict:
    """Return data (if present) from a search entry."""
    eid = entry.get("eid") or entry.get("dc:identifier", "").split(":")[-1]
    return {
        "eid": eid,
        "doi": entry.get("prism:doi"),
        "pii": entry.get("pii"),
        "title": entry.get("dc:title", "")[:80] + "…",
        "authors": entry.get("dc:creator"),
        "journal": entry.get("prism:publicationName"),
        "year": entry.get("prism:coverDate", "").split("-")[0],
    }


if PdfReader:
    def is_multi_page(path: pathlib.Path) -> bool:    # ← NEW
        try:
            return len(PdfReader(str(path)).pages) > 1
        except Exception:
            return False
else:                                                 # ← NEW fallback
    def is_multi_page(path: pathlib.Path) -> bool:    # ← NEW
        return path.stat().st_size > 400_000          # ~= 390 KiB


# ------------------------------------------------------------------
#  robust Session  (re-use everywhere)
# ------------------------------------------------------------------
S = requests.Session()
retries = Retry(
    total=4,  # max 4 attempts
    backoff_factor=0.4,  # 0.4, 0.8, 1.6, 3.2 s
    status_forcelist=[502, 503, 504, 429],
    allowed_methods=["GET"],
)
S.mount("https://", HTTPAdapter(max_retries=retries))
S.headers.update({"X-ELS-APIKey": API_KEY})  # applied to every request

# dedicated headers for full-text
HDR_XML = {"Accept": "text/xml"}
HDR_PDF = {"Accept": "application/pdf"}

DELAY = 0.3  # polite pause after every outgoing request


# ------------------------------------------------------------------
def save_article(url: str, path: pathlib.Path, hdr: dict) -> bool:
    """Try to download url → path (with retries); return True if saved."""
    try:
        r = S.get(url, headers=hdr, stream=True, timeout=30, allow_redirects=True)
        time.sleep(DELAY)
        if r.status_code == 200:
            with path.open("wb") as fh:
                for chunk in r.iter_content(8192):
                    fh.write(chunk)
            return True
    except requests.RequestException as exc:
        log(f"      …download error ({exc.__class__.__name__}) – skipped")
    return False


# ------------------------------------------------------------------
FULL_TEXT_MARKER = re.compile(rb"<ce:section\b", re.I)

# ----------------------------------------------------------------------
# Article-Retrieval first, generic fall-backs after that
# ----------------------------------------------------------------------
FULL_TEXT_MARKER = re.compile(rb"<ce:section\b", re.I)   # body present?

def try_sciencedirect(ids):
    """
    Return pathlib.Path to saved full-article payload or None.

    1. Article-Retrieval PDF  (view=ENTITLED)                 ← publisher PDF
    2. Article-Retrieval PDF  (view=ENTITLED&amsRedirect=true)← author manuscript
    3. Article-Retrieval XML (view=FULL)
    4. legacy publisher-PDF fall-back (unchanged)
    """
    for key in ("doi", "pii", "eid"):          # order of preference
        value = ids.get(key)
        if not value:
            continue

        # 1) publisher PDF
        log(f"API-ENTITLED → {key}:{value}")
        pdf_url  = f"{ARTICLE_URL}/{key}/{value}?view=ENTITLED&httpAccept=application/pdf"
        pdf_path = DL_DIR / f"{value.replace('/', '_')}.pdf"
        if save_article(pdf_url, pdf_path, HDR_PDF):
            if pdf_path.stat().st_size > 100_000:
                if is_multi_page(pdf_path):
                    return pdf_path
            pdf_path.unlink(missing_ok=True)

        # 2) author-manuscript PDF
        log(f"API-AMS       → {key}:{value}")
        ams_url  = f"{ARTICLE_URL}/{key}/{value}?view=ENTITLED&httpAccept=application/pdf&amsRedirect=true"
        ams_path = DL_DIR / f"{value.replace('/', '_')}_AM.pdf"
        if save_article(ams_url, ams_path, HDR_PDF):
            if ams_path.stat().st_size > 100_000:
                if is_multi_page(ams_path):
                    return ams_path
            ams_path.unlink(missing_ok=True)

        # 3) FULL-view XML
        log(f"API-FULL-XML  → {key}:{value}")
        xml_url  = f"{ARTICLE_URL}/{key}/{value}?view=FULL"
        xml_path = DL_DIR / f"{value.replace('/', '_')}.xml"
        if save_article(xml_url, xml_path, HDR_XML):
            with xml_path.open("rb") as fh:
                if FULL_TEXT_MARKER.search(fh.read(8192)):
                    return xml_path
            xml_path.unlink(missing_ok=True)

    # 4) legacy fall-back (publisher PDF via generic endpoint)
    doi = ids.get("doi")
    if not doi:
        return None
    legacy_path = DL_DIR / f"{doi.replace('/', '_')}.pdf"
    log(f"LEGACY        → doi:{doi}")
    if save_article(f"{ARTICLE_URL}/doi/{doi}", legacy_path, HDR_PDF):
        if legacy_path.stat().st_size > 100_000:
            if is_multi_page(legacy_path):
                return legacy_path
        legacy_path.unlink(missing_ok=True)
    return None
    

# ------------------------------------------------------------------
def try_unpaywall(doi):
    if not doi:
        return None
    meta = S.get(
        f"https://api.unpaywall.org/v2/{doi}", params={"email": EMAIL}, timeout=15
    )
    time.sleep(DELAY)
    pdf = (meta.json().get("best_oa_location") or {}).get("url_for_pdf")
    if pdf:
        fname = DL_DIR / f"{doi.replace('/', '_')}_OA.pdf"
        if fname.exists() or save_article(pdf, fname, hdr={}):
            return fname
    return None


# ------------------------------------------------------------------

# ------------------------------------------------------------------
# Main sampling loop
# ------------------------------------------------------------------
fulltext_records, seen_eids = [], set()
upper_bound = min(
    MAX_OFFSET,
    int(r_count.json()["search-results"]["opensearch:totalResults"]) - SEARCH_PAGE_SZ,
)

while len(fulltext_records) < TARGET:
    start = random.randint(0, upper_bound)
    params = {"query": encoded_query, "count": SEARCH_PAGE_SZ, "start": start}
    page = requests.get(SEARCH_URL, headers=hdrs_core, params=params)
    page.raise_for_status()
    entries = page.json()["search-results"].get("entry", [])
    log(f"Scanned page @offset {start:,}")

    for e in entries:
        ids = extract_data(e)
        if ids["eid"] in seen_eids:  # avoid duplicates
            continue

        # ---------- attempt full-text ----------
        path = try_sciencedirect(ids)
        if not path:
            path = try_unpaywall(ids["doi"])
        if path:
            ids["full_text"] = str(path)
            fulltext_records.append(ids)
        else:
            log("  ✖ no full text")

        if len(fulltext_records) == TARGET:
            break

df_full = pd.DataFrame(fulltext_records)
cols = ["title", "journal", "authors", "year", "doi", "full_text"]
display(df_full[cols])
df_full.to_csv("../data/random25_fulltext.csv", index=False)
log("Completed – 25 full-text documents obtained.")

12:14:08 Scanned page @offset 3,599
12:14:08 API-ENTITLED → doi:10.1093/jpe/rtx037
12:14:09 API-AMS       → doi:10.1093/jpe/rtx037
12:14:10 API-FULL-XML  → doi:10.1093/jpe/rtx037
12:14:11 API-ENTITLED → eid:2-s2.0-85038372433
12:14:12 API-AMS       → eid:2-s2.0-85038372433
12:14:12 API-FULL-XML  → eid:2-s2.0-85038372433
12:14:13 LEGACY        → doi:10.1093/jpe/rtx037
12:14:15   ✖ no full text
12:14:15 API-ENTITLED → doi:10.1016/j.surfcoat.2017.11.053
12:14:16 API-AMS       → doi:10.1016/j.surfcoat.2017.11.053
12:14:17 API-FULL-XML  → doi:10.1016/j.surfcoat.2017.11.053
12:14:17 API-ENTITLED → pii:S025789721731201X
12:14:18 API-AMS       → pii:S025789721731201X
12:14:19 API-FULL-XML  → pii:S025789721731201X
12:14:20 API-ENTITLED → eid:2-s2.0-85036527773
12:14:20 API-AMS       → eid:2-s2.0-85036527773
12:14:21 API-FULL-XML  → eid:2-s2.0-85036527773
12:14:21 LEGACY        → doi:10.1016/j.surfcoat.2017.11.053
12:14:25 API-ENTITLED → doi:10.1002/adma.201703653
12:14:25 API-AMS       → doi:10

Unnamed: 0,title,journal,authors,year,doi,full_text
0,Application of superhydrophobic coatings as a ...,Surface and Coatings Technology,Vazirinasab E.,2018,10.1016/j.surfcoat.2017.11.053,../data/downloads/10.1016_j.surfcoat.2017.11.0...
1,Epiisopilosine alkaloid has activity against s...,PLoS ONE,Guimarães M.A.,2018,10.1371/journal.pone.0196667,../data/downloads/10.1371_journal.pone.0196667...
2,Surface properties and permeability to calcium...,Frontiers in Plant Science,Bahamonde H.A.,2018,10.3389/fpls.2018.00494,../data/downloads/10.3389_fpls.2018.00494_OA.pdf
3,Nano-selenium and its nanomedicine application...,International Journal of Nanomedicine,Hosnedlova B.,2018,10.2147/IJN.S157541,../data/downloads/10.2147_IJN.S157541_OA.pdf
4,Toxicological and bioactivity evaluation of bl...,Food and Chemical Toxicology,Pap N.,2021,10.1016/j.fct.2021.112284,../data/downloads/10.1016_j.fct.2021.112284.pdf
5,Sporulation is dispensable for the vegetable-a...,Microbial Biotechnology,Antequera-Gómez M.L.,2021,10.1111/1751-7915.13816,../data/downloads/10.1111_1751-7915.13816_OA.pdf
6,Zoxamide accumulation and retention evaluation...,Pest Management Science,Corrias F.,2021,10.1002/ps.6404,../data/downloads/10.1002_ps.6404_OA.pdf
7,Quantitative trait locus mapping combined with...,Theoretical and Applied Genetics,Qi P.,2021,10.1007/s00122-021-03798-y,../data/downloads/10.1007_s00122-021-03798-y_O...
8,Peptide-mediated targeting of nanoparticles wi...,Bio-protocol,Santana I.,2021,10.21769/BioProtoc.4060,../data/downloads/10.21769_BioProtoc.4060_OA.pdf
9,Evaporating droplets on inclined plant leaves ...,Journal of Colloid and Interface Science,Tredenick E.C.,2021,10.1016/j.jcis.2021.01.070,../data/downloads/10.1016_j.jcis.2021.01.070_O...


12:26:29 Completed – 25 full-text documents obtained.


In [13]:
print(df_full)

                   eid                              doi                pii  \
0   2-s2.0-85036527773   10.1016/j.surfcoat.2017.11.053  S025789721731201X   
1   2-s2.0-85046939180     10.1371/journal.pone.0196667               None   
2   2-s2.0-85046888342          10.3389/fpls.2018.00494               None   
3   2-s2.0-85045315493              10.2147/IJN.S157541               None   
4   2-s2.0-85106525677        10.1016/j.fct.2021.112284  S0278691521003173   
5   2-s2.0-85105223127          10.1111/1751-7915.13816               None   
6   2-s2.0-85104813835                  10.1002/ps.6404               None   
7   2-s2.0-85103218996       10.1007/s00122-021-03798-y               None   
8   2-s2.0-85115997253          10.21769/BioProtoc.4060               None   
9   2-s2.0-85103683996       10.1016/j.jcis.2021.01.070  S0021979721000837   
10  2-s2.0-85099790524   10.1016/j.foodchem.2021.129003  S0308814621000042   
11  2-s2.0-85108796586        10.3390/pathogens10060746         