# Step 1 Get New Data

## Libraries

In [12]:
#!/usr/bin/env python3
import math, requests, pandas as pd, re
import textwrap
import numpy as np
from pathlib import Path
import os

## OpenAlex

We will use OpenAlex to mimic the searches from the original LAB within EBSCO given the overlap in content and moving to a reproducible pipeline. This open resource includes the journals from previously investigated searches including Behavior Research Methods, Language Resources and Evaluation, and PLoS One. 

In [5]:
base_url = (
    "https://api.openalex.org/works?"
    "filter=title_and_abstract.search:lexical+database+OR+lexical+norms+OR+linguistic+database+OR+linguistic+norms,"
    "publication_year:2018-2025,"
    "type:types/article|types/dataset|types/preprint|types/supplementary-materials|types/report|types/book-chapter"
    "&sort=relevance_score:desc"
    "&per_page=200"     # bump page size to reduce calls
)

def decode_abstract(inv):
    if not isinstance(inv, dict) or not inv: return None
    pos2tok = {p:t for t,ps in inv.items() for p in ps}
    txt = " ".join(pos2tok.get(i,"") for i in range(max(pos2tok)+1))
    txt = re.sub(r"\s+([,.!?;:])", r"\1", txt)
    return re.sub(r"\s{2,}", " ", txt).strip() or None

# probe for total
probe = requests.get(base_url + "&page=1", timeout=30)
probe.raise_for_status()
meta = probe.json()["meta"]
total, per_page = meta["count"], meta["per_page"]
pages = math.ceil(total / per_page)
print(f"total={total}, per_page={per_page}, pages={pages}")

rows = []
for p in range(1, pages+1):
    r = requests.get(base_url + f"&page={p}", timeout=60)
    r.raise_for_status()
    for w in r.json().get("results", []):
        rows.append({
            "title": w.get("title"),
            "year": w.get("publication_year"),
            "doi": (w.get("doi") or "").replace("https://doi.org/", ""),
            "venue": ((w.get("primary_location") or {}).get("source") or {}).get("display_name"),
            "authors": "; ".join(
                name
                for name in (
                    a.get("author", {}).get("display_name")
                    for a in w.get("authorships", [])
                )
                if name
            ),
            "abstract": decode_abstract(w.get("abstract_inverted_index")),
            # OpenAlex doesn’t store author-entered keywords; concepts are the closest proxy
            "keywords": [c["display_name"] for c in w.get("concepts", [])],
            "openalex_id": w.get("id"),
            "is_oa": (w.get("open_access") or {}).get("is_oa"),
            "cited_by": w.get("cited_by_count", 0),
        })
    print(f"page {p}/{pages}… collected {len(rows)}", end="\r")

df = pd.DataFrame(rows)
print("\nDone. Rows fetched:", len(df))
df.head()

total=1893, per_page=200, pages=10
page 10/10… collected 1893
Done. Rows fetched: 1893


Unnamed: 0,title,year,doi,venue,authors,abstract,keywords,openalex_id,is_oa,cited_by
0,On the predictive validity of various corpus-b...,2018,10.3758/s13428-017-1001-8,Behavior Research Methods,Xiaocong Chen; Yanping Dong; Xiufen Yu,,"[Lexical diversity, Computer science, Lemma (b...",https://openalex.org/W2784175655,True,51
1,Predicting Lexical Norms: A Comparison between...,2018,10.5334/joc.50,Journal of Cognition,Hendrik Vankrunkelsven; Steven Verheyen; Gert ...,In two studies we compare a distributional sem...,"[Concreteness, Word Association, Word (group t...",https://openalex.org/W2902591385,True,61
2,"Norms of conceptual familiarity for 3,596 Fren...",2018,10.3758/s13428-018-1106-8,Behavior Research Methods,Georges Chedid; Maximiliano A. Wilson; Christo...,,"[Noun, Linguistics, Lexical decision task, Psy...",https://openalex.org/W2888587255,True,20
3,Psycholinguistic norms for more than 300 lexic...,2021,10.3758/s13428-020-01524-y,Behavior Research Methods,Patrick C. Trettenbrein; Nina-Kristin Pendzich...,,"[Iconicity, Age of Acquisition, German, Comput...",https://openalex.org/W2980432777,True,20
4,Multilingual lexical transfer challenges monol...,2021,10.1515/multi-2021-0014,Multilingua,Eliane Lorenz; Yevheniia Hasai; Peter Siemund,Abstract Foreign language learners frequently ...,"[Linguistics, German, Turkish, Lexical item, L...",https://openalex.org/W3211361532,True,8


## Examine Abstracts

We need to examine if all articles have abstracts for being able to predict.

In [6]:
def summarize_abstracts(df: pd.DataFrame, n_show: int = 5):
    # treat empty strings/whitespace as missing
    has_abs = df["abstract"].astype("string").str.strip().ne("").fillna(False)

    total = len(df)
    with_abs = int(has_abs.sum())
    without_abs = total - with_abs
    pct = (with_abs / total * 100) if total else 0.0

    print(f"Total rows: {total}")
    print(f"With abstract: {with_abs} ({pct:.1f}%)")
    print(f"Missing abstract: {without_abs}")

    if without_abs:
        # show a few examples that are missing
        missing = df.loc[~has_abs, ["title", "year", "doi", "venue", "openalex_id"]].head(n_show)
        print("\nExamples missing abstracts:")
        for _, r in missing.iterrows():
            print("•", r["year"], "|", (r["title"] or "")[:120].rstrip(), "|", r["venue"] or "", "| DOI:", r["doi"] or "—")

    return has_abs

# Run the summary on your df
has_abs_mask = summarize_abstracts(df, n_show=30)

Total rows: 1893
With abstract: 1789 (94.5%)
Missing abstract: 104

Examples missing abstracts:
• 2018 | On the predictive validity of various corpus-based frequency norms in L2 English lexical processing | Behavior Research Methods | DOI: 10.3758/s13428-017-1001-8
• 2018 | Norms of conceptual familiarity for 3,596 French nouns and their contribution in lexical decision | Behavior Research Methods | DOI: 10.3758/s13428-018-1106-8
• 2021 | Psycholinguistic norms for more than 300 lexical signs in German Sign Language (DGS) | Behavior Research Methods | DOI: 10.3758/s13428-020-01524-y
• 2020 | Norm It! : Lexical Normalization for Italian and Its Downstream Effects for Dependency Parsing | University of Groningen research database (University of Groningen / Centre for Information Technology) | DOI: —
• 2022 | The Flickr frequency norms: What 17 years of images tagged online tell us about lexical processing | Behavior Research Methods | DOI: 10.3758/s13428-022-02031-y
• 2022 | Translation 

In [7]:
# ---- polite headers (some APIs appreciate a contact) ----
CONTACT_EMAIL = "ebuchanan@harrisburgu.edu"  # set yours
HEADERS = {"Accept": "application/json", "User-Agent": f"LAB-abstract-enricher ({CONTACT_EMAIL})"}

def _clean_doi(doi: str) -> str:
    if not doi: return ""
    doi = doi.strip()
    return re.sub(r"^https?://(dx\.)?doi\.org/", "", doi, flags=re.I)

def safe_request(url, params=None, headers=None):
    try:
        r = requests.get(url, params=params, headers=headers, timeout=30)
        r.raise_for_status()
        return r.json(), None
    except requests.exceptions.HTTPError as e:
        return None, f"HTTP {r.status_code}: {r.text[:200]}"
    except Exception as e:
        return None, f"Other error: {str(e)}"

def fetch_crossref_abstract(doi):
    doi = _clean_doi(doi)
    url = f"https://api.crossref.org/works/{doi}"
    js, err = safe_request(url, headers=HEADERS)
    if err: return None, "ERROR", err
    abs_ = (js.get("message") or {}).get("abstract")
    if abs_:
        # strip tags
        abs_ = re.sub(r"<[^>]+>", "", abs_)
        return abs_.strip(), "Crossref", None
    return None, "MISSING", None

def fetch_europepmc_abstract(doi):
    url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    params = {"query": f"DOI:{doi}", "format": "json", "pageSize": 1}
    js, err = safe_request(url, params=params, headers=HEADERS)
    if err: return None, "ERROR", err
    res = js.get("resultList", {}).get("result", [])
    if res and res[0].get("abstractText"):
        return res[0]["abstractText"], "EuropePMC", None
    return None, "MISSING", None

def get_abstract_by_doi(doi):
    doi = _clean_doi(doi)
    if not doi: return None, "MISSING", None
    # try Crossref then Europe PMC
    for fetcher in (fetch_crossref_abstract, fetch_europepmc_abstract):
        abs_, status, err = fetcher(doi)
        if status == "ERROR":  # API error
            return None, status, err
        if status != "MISSING":  # success
            return abs_, status, None
    return None, "MISSING", None

def enrich_missing_abstracts(df, doi_col="doi", abs_col="abstract", sleep=0.3):
    """
    For rows where df[abs_col] is empty, try to fetch an abstract by DOI.
    Prints status for each attempt; only writes into df[abs_col] on success.
    Expects get_abstract_by_doi() -> (abstract, status, err).
    """
    import time
    import pandas as pd

    if abs_col not in df.columns:
        df[abs_col] = None

    mask_missing = df[abs_col].isna() | (df[abs_col].astype(str).str.strip() == "")
    idxs = df.index[mask_missing].tolist()

    for i in idxs:
        doi = str(df.at[i, doi_col] or "").strip()
        if not doi:
            print(f"[Row {i}] No DOI, skipping.")
            continue

        abs_, status, err = get_abstract_by_doi(doi)

        if status == "ERROR":
            print(f"[Row {i}] DOI {doi}: API error -> {err}")
        elif status == "MISSING":
            print(f"[Row {i}] DOI {doi}: No abstract found.")
        else:
            print(f"[Row {i}] DOI {doi}: Abstract found via {status}.")
            df.at[i, abs_col] = abs_

        time.sleep(sleep)  # be polite to APIs

    return df

In [8]:
# df = <your dataframe from OpenAlex>
# Summarize before:
missing_before = df["abstract"].isna() | (df["abstract"].astype(str).str.strip() == "")
print("Missing before:", int(missing_before.sum()), "of", len(df))

Missing before: 104 of 1893


In [9]:
df = enrich_missing_abstracts(df)

[Row 0] DOI 10.3758/s13428-017-1001-8: No abstract found.
[Row 2] DOI 10.3758/s13428-018-1106-8: No abstract found.
[Row 3] DOI 10.3758/s13428-020-01524-y: Abstract found via Crossref.
[Row 5] No DOI, skipping.
[Row 7] DOI 10.3758/s13428-022-02031-y: No abstract found.
[Row 8] DOI 10.3758/s13428-022-01977-3: No abstract found.
[Row 14] DOI 10.36059/978-966-397-124-7/39-56: No abstract found.
[Row 16] DOI 10.1007/978-3-031-27506-7_4: No abstract found.
[Row 20] DOI 10.26226/morressier.606f15dd30a2e980041f238c: No abstract found.
[Row 23] DOI 10.3758/s13428-019-01316-z: Abstract found via Crossref.
[Row 26] DOI 10.3758/s13428-018-1171-z: No abstract found.
[Row 29] DOI 10.1016/j.asw.2018.06.004: No abstract found.
[Row 30] DOI 10.3758/s13428-019-01337-8: No abstract found.
[Row 31] DOI 10.1016/j.neuroimage.2018.01.042: No abstract found.
[Row 32] DOI 10.1016/j.bandl.2019.05.003: No abstract found.
[Row 33] DOI 10.3758/s13428-018-1014-y: No abstract found.
[Row 35] DOI 10.3758/s13428-020-

In [None]:
missing_after = df["abstract"].isna() | (df["abstract"].astype(str).str.strip() == "")
print("Missing after:", int(missing_after.sum()), "of", len(df))

# Peek at newly-filled examples
# df.loc[df["abstract_source"].notna(), ["title", "doi", "abstract_source"]].head()

Missing after: 18 of 1641


## Write to CSV

Write the output for the next step. 

In [15]:
# write out final data to a consistent location in the repo
try:
    project_root = Path(__file__).resolve().parents[2]
except NameError:
    # running in a notebook → go up one folder
    project_root = Path(os.getcwd()).resolve().parents[0]

out_dir = project_root / "data" / "raw"
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "new_data_to_classify.csv"

df.to_csv(out_path, index=False)
print(f"Wrote {len(df)} rows to {out_path}")

Wrote 1893 rows to /Users/erinbuchanan/GitHub/Research/7_websites/manynorms/data/raw/new_data_to_classify.csv
