In [49]:
%load_ext dotenv
%dotenv ../.env


The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


## Question 1: How much data

Search 1: A simple, broad search of literature regarding hydrophobicity in plant leaves.

`Hydrophobicity AND plant leaves`



In [50]:
import os
import requests
import pandas as pd
from urllib.parse import quote_plus      # quote_plus turns blanks → “+” and quotes → “%22”
from dotenv import load_dotenv

load_dotenv()
api_key = os.getenv("API_KEY")


query_string = 'Hydrophobi*+"plant+leaves"'    
encoded_query = query_string               # → Hydrophobicity+AND+%22plant+leaves%22

RESOURCE_URL = "https://api.elsevier.com/content/search/scopus"
headers = {
    "X-ELS-APIKey": api_key,
    "Accept": "application/json",
}
params_counting_only = {
    "query": encoded_query,
    "count": 0,
}

# Hit count
r_count = requests.get(RESOURCE_URL, headers=headers, params=params_counting_only)
r_count.raise_for_status()

total_hits = int(r_count.json()["search-results"]["opensearch:totalResults"])

print(f"Total hits for {encoded_query}: {total_hits}")

LIMIT = 25

params = {
    "query": encoded_query,
    "count": LIMIT,
}

resp = requests.get(RESOURCE_URL, headers=headers, params=params)

if resp.status_code != 200:
    raise RuntimeError(f"Scopus API error {resp.status_code}: {resp.text}")

entries = resp.json().get("search-results", {}).get("entry", [])
records = [
    {
        "Title":   e.get("dc:title"),
        "Authors": e.get("dc:creator"),
        "Year":    e.get("prism:coverDate", "").split("-")[0],
        "URL":     e.get("prism:url"),
    }
    for e in entries
]
df = pd.DataFrame(records)
print(f"Returned {len(df)} documents")
display(df)          # in a notebook this renders a nice HTML table
# df.to_csv("scopus_results.csv", index=False)   # optional persist

Total hits for Hydrophobi*+"plant+leaves": 6215
Returned 25 documents


Unnamed: 0,Title,Authors,Year,URL
0,The suberin transporter StABCG1 is required fo...,Benatto Perino E.H.,2025,https://api.elsevier.com/content/abstract/scop...
1,Combining plant extracts and hot water treatme...,Worku S.,2025,https://api.elsevier.com/content/abstract/scop...
2,Sustainable pest management using plant second...,Zhang X.,2025,https://api.elsevier.com/content/abstract/scop...
3,Plant Cell Wall-Like Soft Materials: Micro- an...,Koshani R.,2025,https://api.elsevier.com/content/abstract/scop...
4,Anti-proliferative and photodynamic activities...,Olofinsan K.A.,2025,https://api.elsevier.com/content/abstract/scop...
5,Optimization of betulinic and ursolic acids an...,Demiray H.,2025,https://api.elsevier.com/content/abstract/scop...
6,De novo assembly of the mitochondrial genome o...,Zhou G.,2025,https://api.elsevier.com/content/abstract/scop...
7,Discovery of potent anti-toxoplasmosis drugs f...,Mohammed M.M.D.,2025,https://api.elsevier.com/content/abstract/scop...
8,The nano-paradox: addressing nanotoxicity for ...,Rajpal V.R.,2025,https://api.elsevier.com/content/abstract/scop...
9,Understanding ultrafast free-rising bubble cap...,Hu Y.,2025,https://api.elsevier.com/content/abstract/scop...


Here we have a good first look at some very coarse-grained results. Let's go in and use random offsets to retrieve 25 random documents.

In [54]:
import random

# ------------------------------------------------------------
# 4. grab 25 random records in 5 small pages (5 records each)
# ------------------------------------------------------------
SAMPLE_PAGES = 5
PAGE_SIZE    = 5
MAX_OFFSET   = 5000 - PAGE_SIZE          # Scopus hard limit with `start=` paging

if total_hits <= SAMPLE_PAGES * PAGE_SIZE:
    print("\nNo random sampling necessary – the result-set is small.")
else:
    upper_bound = min(total_hits - PAGE_SIZE, MAX_OFFSET)
    random_starts = random.sample(range(0, upper_bound + 1), SAMPLE_PAGES)

    print("Sampling from the following offsets:")
    print(random_starts)

    sample_records = []
    for s in sorted(random_starts):
        params_rand = {
            "query": encoded_query,
            "count": PAGE_SIZE,
            "start": s                 # 0-based offset into the results
        }
        r_rand = requests.get(RESOURCE_URL, headers=headers, params=params_rand)
        r_rand.raise_for_status()

        for e in r_rand.json()["search-results"].get("entry", []):
            sample_records.append({
                "Title":   e.get("dc:title"),
                "Authors": e.get("dc:creator"),
                "Year":    e.get("prism:coverDate", "").split("-")[0],
                "URL":     e.get("prism:url"),
            })

    df_rand = pd.DataFrame(sample_records)
    print(f"\nRandom sample of {len(df_rand)} documents "
          f"pulled in {SAMPLE_PAGES} queries "
          f"(offsets ≤ {MAX_OFFSET}).")
    display(df_rand)

Sampling from the following offsets:
[3265, 4235, 4803, 1212, 3604]

Random sample of 25 documents pulled in 5 queries (offsets ≤ 4995).


Unnamed: 0,Title,Authors,Year,URL
0,Unraveling the interplay of leaf structure and...,Ganar S.S.,2023,https://api.elsevier.com/content/abstract/scop...
1,Dehydrin CaDHN2 Enhances Drought Tolerance by ...,Li X.,2023,https://api.elsevier.com/content/abstract/scop...
2,Foliar Application of Sulfur-Containing Compou...,Bouranis D.L.,2023,https://api.elsevier.com/content/abstract/scop...
3,Neither lysigenous nor just oil: Demystifying ...,Richit J.F.,2023,https://api.elsevier.com/content/abstract/scop...
4,Particle Size Effect of Cyetpyrafen Formulatio...,Yu L.,2023,https://api.elsevier.com/content/abstract/scop...
5,Surface topographies of biomimetic superamphip...,Gou X.,2019,https://api.elsevier.com/content/abstract/scop...
6,"Rotator phases in alkane systems: In bulk, sur...",Cholakova D.,2019,https://api.elsevier.com/content/abstract/scop...
7,Hydrophobin HFBII-4 from Trichoderma asperellu...,Zhang H.,2019,https://api.elsevier.com/content/abstract/scop...
8,Metabolic markers for the yield of lipophilic ...,Nguyen T.K.O.,2019,https://api.elsevier.com/content/abstract/scop...
9,An investigation on the leaf accumulation-remo...,Zhang L.,2019,https://api.elsevier.com/content/abstract/scop...
