In [74]:
%load_ext dotenv
%dotenv ../.env


The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


## Question 1: How much data

Search 1: A simple, broad search of literature regarding hydrophobicity in plant leaves.

`Hydrophobicity AND plant leaves`



In [75]:
import os
import requests
import pandas as pd
import random
import pathlib
import re
import time
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("API_KEY")


query_string = 'Hydrophobi*+"plant+leaves"'
encoded_query = query_string  # → Hydrophobicity+AND+%22plant+leaves%22

RESOURCE_URL = "https://api.elsevier.com/"
SEARCH_URL = f"{RESOURCE_URL}/content/search/scopus"
ARTICLE_URL = f"{RESOURCE_URL}/content/article"

headers = {
    "X-ELS-APIKey": api_key,
    "Accept": "application/json",
}
params_counting_only = {
    "query": encoded_query,
    "count": 0,
}

# Hit count
r_count = requests.get(SEARCH_URL, headers=headers, params=params_counting_only)
r_count.raise_for_status()

total_hits = int(r_count.json()["search-results"]["opensearch:totalResults"])

print(f"Total hits for {encoded_query}: {total_hits}")

LIMIT = 25

params = {
    "query": encoded_query,
    "count": LIMIT,
}

resp = requests.get(SEARCH_URL, headers=headers, params=params)

if resp.status_code != 200:
    raise RuntimeError(f"Scopus API error {resp.status_code}: {resp.text}")

entries = resp.json().get("search-results", {}).get("entry", [])
records = [
    {
        "Title": e.get("dc:title"),
        "Authors": e.get("dc:creator"),
        "Year": e.get("prism:coverDate", "").split("-")[0],
        "URL": e.get("prism:url"),
    }
    for e in entries
]
df = pd.DataFrame(records)
print(f"Returned {len(df)} documents")
display(df)  # in a notebook this renders a nice HTML table
# df.to_csv("scopus_results.csv", index=False)   # optional persist

Total hits for Hydrophobi*+"plant+leaves": 6218
Returned 25 documents


Unnamed: 0,Title,Authors,Year,URL
0,The suberin transporter StABCG1 is required fo...,Benatto Perino E.H.,2025,https://api.elsevier.com/content/abstract/scop...
1,Combining plant extracts and hot water treatme...,Worku S.,2025,https://api.elsevier.com/content/abstract/scop...
2,Sustainable pest management using plant second...,Zhang X.,2025,https://api.elsevier.com/content/abstract/scop...
3,Plant Cell Wall-Like Soft Materials: Micro- an...,Koshani R.,2025,https://api.elsevier.com/content/abstract/scop...
4,Anti-proliferative and photodynamic activities...,Olofinsan K.A.,2025,https://api.elsevier.com/content/abstract/scop...
5,Optimization of betulinic and ursolic acids an...,Demiray H.,2025,https://api.elsevier.com/content/abstract/scop...
6,De novo assembly of the mitochondrial genome o...,Zhou G.,2025,https://api.elsevier.com/content/abstract/scop...
7,Discovery of potent anti-toxoplasmosis drugs f...,Mohammed M.M.D.,2025,https://api.elsevier.com/content/abstract/scop...
8,The nano-paradox: addressing nanotoxicity for ...,Rajpal V.R.,2025,https://api.elsevier.com/content/abstract/scop...
9,Understanding ultrafast free-rising bubble cap...,Hu Y.,2025,https://api.elsevier.com/content/abstract/scop...


Here we have a good first look at some very coarse-grained results. Let's go in and use random offsets to retrieve 25 random documents.

In [76]:
from requests.adapters import HTTPAdapter, Retry


# ------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------
load_dotenv()
API_KEY = os.getenv("API_KEY")  # ← your Elsevier key
EMAIL = os.getenv("EMAIL")  # ← required by Unpaywall
DL_DIR = pathlib.Path("downloads")
DL_DIR.mkdir(exist_ok=True)

TARGET = 25  # how many full-text docs we want
SEARCH_PAGE_SZ = 25  # pull 25 hits per random page
MAX_OFFSET = 5000 - SEARCH_PAGE_SZ  # Scopus hard cap with start=

hdrs_core = {"X-ELS-APIKey": API_KEY}
hdrs_xml = {**hdrs_core, "Accept": "text/xml"}
hdrs_pdf = {**hdrs_core, "Accept": "application/pdf"}


def log(msg):
    print(time.strftime("%H:%M:%S"), msg)


# ------------------------------------------------------------------
# Helper functions
# ------------------------------------------------------------------
def extract_data(entry: dict) -> dict:
    """Return data (if present) from a search entry."""
    eid = entry.get("eid") or entry.get("dc:identifier", "").split(":")[-1]
    return {
        "eid": eid,
        "doi": entry.get("prism:doi"),
        "pii": entry.get("pii"),
        "title": entry.get("dc:title", "")[:80] + "…",
        "authors": entry.get("dc:creator"),
        "journal": entry.get("prism:publicationName"),
        "year": entry.get("prism:coverDate", "").split("-")[0],
    }


# ... keep existing imports

# ------------------------------------------------------------------
#  robust Session  (re-use everywhere)
# ------------------------------------------------------------------
S = requests.Session()
retries = Retry(
    total=4,  # max 4 attempts
    backoff_factor=0.4,  # 0.4, 0.8, 1.6, 3.2 s
    status_forcelist=[502, 503, 504, 429],
    allowed_methods=["GET"],
)
S.mount("https://", HTTPAdapter(max_retries=retries))
S.headers.update({"X-ELS-APIKey": API_KEY})  # applied to every request

# dedicated headers for full-text
HDR_XML = {"Accept": "text/xml"}
HDR_PDF = {"Accept": "application/pdf"}

DELAY = 0.3  # polite pause after every outgoing request


# ------------------------------------------------------------------
def save(url: str, path: pathlib.Path, hdr: dict) -> bool:
    """Try to download url → path (with retries); return True if saved."""
    try:
        r = S.get(url, headers=hdr, stream=True, timeout=30, allow_redirects=True)
        time.sleep(DELAY)
        if r.status_code == 200:
            with path.open("wb") as fh:
                for chunk in r.iter_content(8192):
                    fh.write(chunk)
            return True
    except requests.RequestException as exc:
        log(f"      …download error ({exc.__class__.__name__}) – skipped")
    return False


# ------------------------------------------------------------------
FULL_TEXT_MARKER = re.compile(rb"<ce:section\b", re.I)

def try_sciencedirect(ids):
    """Return path of saved PDF or *real* full-text XML, else None."""
    for key in ("doi", "pii", "eid"):
        val = ids.get(key)
        if not val:
            continue

        # ---------- 1) PDF first ----------
        pdf_name = DL_DIR / f"{val.replace('/', '_')}.pdf"
        if pdf_name.exists() or save(f"{ARTICLE_URL}/{key}/{val}", pdf_name, HDR_PDF):
            return pdf_name

        # ---------- 2) XML – but keep only if it looks like FULL view ----------
        xml_name = DL_DIR / f"{val.replace('/', '_')}.xml"
        if xml_name.exists() or save(f"{ARTICLE_URL}/{key}/{val}", xml_name, HDR_XML):
            # peek at first 8 kB to decide
            with xml_name.open("rb") as fh:
                head = fh.read(8192)
            if FULL_TEXT_MARKER.search(head):
                return xml_name
            else:
                xml_name.unlink(missing_ok=True)   # drop stub
    return None


# ------------------------------------------------------------------
def try_unpaywall(doi):
    if not doi:
        return None
    meta = S.get(
        f"https://api.unpaywall.org/v2/{doi}", params={"email": EMAIL}, timeout=15
    )
    time.sleep(DELAY)
    pdf = (meta.json().get("best_oa_location") or {}).get("url_for_pdf")
    if pdf:
        fname = DL_DIR / f"{doi.replace('/', '_')}_OA.pdf"
        if fname.exists() or save(pdf, fname, hdr={}):
            return fname
    return None


# ------------------------------------------------------------------

# ------------------------------------------------------------------
# Main sampling loop
# ------------------------------------------------------------------
fulltext_records, seen_eids = [], set()
upper_bound = min(
    MAX_OFFSET,
    int(r_count.json()["search-results"]["opensearch:totalResults"]) - SEARCH_PAGE_SZ,
)

while len(fulltext_records) < TARGET:
    start = random.randint(0, upper_bound)
    params = {"query": encoded_query, "count": SEARCH_PAGE_SZ, "start": start}
    page = requests.get(SEARCH_URL, headers=hdrs_core, params=params)
    page.raise_for_status()
    entries = page.json()["search-results"].get("entry", [])
    log(f"Scanned page @offset {start:,}")

    for e in entries:
        ids = extract_data(e)
        if ids["eid"] in seen_eids:  # avoid duplicates
            continue
        seen_eids.add(ids["eid"])

        # ---------- attempt full-text ----------
        path = try_sciencedirect(ids)
        if not path:
            path = try_unpaywall(ids["doi"])
        if path:
            ids["full_text"] = str(path)
            fulltext_records.append(ids)
            log(f"  ✔ saved {path.name}   ({len(fulltext_records)}/{TARGET})")
            break  # loop will pick a new offset
        else:
            log("  ✖ no full text")

        if len(fulltext_records) == TARGET:
            break

df_full = pd.DataFrame(fulltext_records)
cols = ["title", "journal", "authors", "year", "doi", "full_text"]
display(df_full[cols])
df_full.to_csv("random25_fulltext.csv", index=False)
log("Completed – 25 full-text documents obtained.")

15:28:03 Scanned page @offset 533
15:28:04   ✔ saved 10.1016_j.jobe.2024.110635.pdf   (1/25)
15:28:05 Scanned page @offset 1,255
15:28:07   ✔ saved 10.1016_j.jsps.2023.101762.pdf   (2/25)
15:28:08 Scanned page @offset 4,219
15:28:11   ✖ no full text
15:28:12   ✔ saved 10.1016_j.plaphy.2015.09.007.pdf   (3/25)
15:28:13 Scanned page @offset 4,274
15:28:14   ✔ saved 10.4315_0362-028X.JFP-15-056.pdf   (4/25)
15:28:15 Scanned page @offset 625
15:28:16   ✔ saved 10.1016_j.sajb.2024.07.048.pdf   (5/25)
15:28:17 Scanned page @offset 2,374
15:28:24   ✔ saved 10.3389_fmicb.2021.738058_OA.pdf   (6/25)
15:28:25 Scanned page @offset 4,459
15:28:26   ✔ saved 10.1016_j.msec.2014.07.022.pdf   (7/25)
15:28:26 Scanned page @offset 3,639
15:28:29   ✖ no full text
15:28:30   ✔ saved 10.1016_j.phymed.2018.03.051.pdf   (8/25)
15:28:31 Scanned page @offset 1,075
15:28:34   ✖ no full text
15:28:37   ✖ no full text
15:28:44   ✔ saved 10.3389_fsufs.2024.1343615_OA.pdf   (9/25)
15:28:45 Scanned page @offset 4,05

Unnamed: 0,title,journal,authors,year,doi,full_text
0,Preparation of composite superhydrophobic coat...,Journal of Building Engineering,Lu L.,2024,10.1016/j.jobe.2024.110635,downloads/10.1016_j.jobe.2024.110635.pdf
1,Multi-target action of Garcinia livingstonei e...,Saudi Pharmaceutical Journal,Abdul-Rahman A.M.,2023,10.1016/j.jsps.2023.101762,downloads/10.1016_j.jsps.2023.101762.pdf
2,Identification and functional analysis of the ...,Plant Physiology and Biochemistry,Deng Y.,2015,10.1016/j.plaphy.2015.09.007,downloads/10.1016_j.plaphy.2015.09.007.pdf
3,Role of cellulose and colanic acid in attachme...,Journal of Food Protection,Lee C.C.,2015,10.4315/0362-028X.JFP-15-056,downloads/10.4315_0362-028X.JFP-15-056.pdf
4,Exploring the cytoprotective potential of flav...,South African Journal of Botany,Kumar S.S.,2024,10.1016/j.sajb.2024.07.048,downloads/10.1016_j.sajb.2024.07.048.pdf
5,Prospecting Endophytic Bacteria Endowed With P...,Frontiers in Microbiology,Hazarika S.N.,2021,10.3389/fmicb.2021.738058,downloads/10.3389_fmicb.2021.738058_OA.pdf
6,Preparation and hydrophobicity of biomorphic Z...,Materials Science and Engineering C,Wang T.,2014,10.1016/j.msec.2014.07.022,downloads/10.1016_j.msec.2014.07.022.pdf
7,Anti-inflammatory and antioxidant activities o...,Phytomedicine,El-desoky A.H.,2018,10.1016/j.phymed.2018.03.051,downloads/10.1016_j.phymed.2018.03.051.pdf
8,Effects of alkaline and ultrasonication on duc...,Frontiers in Sustainable Food Systems,Nitiwuttithorn C.,2024,10.3389/fsufs.2024.1343615,downloads/10.3389_fsufs.2024.1343615_OA.pdf
9,Fabrication of biomimetic superhydrophobic ste...,Applied Surface Science,Yin L.,2016,10.1016/j.apsusc.2016.02.090,downloads/10.1016_j.apsusc.2016.02.090.pdf


15:30:26 Completed – 25 full-text documents obtained.


In [77]:
print(df_full)

                   eid                               doi                pii  \
0   2-s2.0-85202719697        10.1016/j.jobe.2024.110635  S2352710224022034   
1   2-s2.0-85170032592        10.1016/j.jsps.2023.101762  S1319016423002578   
2   2-s2.0-84942523475      10.1016/j.plaphy.2015.09.007  S0981942815301091   
3   2-s2.0-84938309340      10.4315/0362-028X.JFP-15-056               None   
4   2-s2.0-85199867771        10.1016/j.sajb.2024.07.048  S0254629924004575   
5   2-s2.0-85117092722         10.3389/fmicb.2021.738058               None   
6   2-s2.0-84905157351        10.1016/j.msec.2014.07.022  S0928493114004299   
7   2-s2.0-85044764801      10.1016/j.phymed.2018.03.051  S0944711318300862   
8   2-s2.0-85186573407        10.3389/fsufs.2024.1343615               None   
9   2-s2.0-84958211906      10.1016/j.apsusc.2016.02.090  S0169433216302598   
10  2-s2.0-85067962395     10.1016/j.jallcom.2019.04.169  S0925838819314501   
11  2-s2.0-85136691248  10.1016/j.orggeochem.2022.10