In [1]:
import requests, json, time
import pandas as pd
from collections import Counter, defaultdict
import fitz
import re

In [2]:
do_read_from_web = True
do_read_from_cvs = not do_read_from_web
include_dois = True
include_arxiv = True

In [3]:
pdf_file_name = "/home/yoren/yumvd.Yandex.Disk/Yura/Personal/grants/DOE_progress_report_publications_2023_2026.pdf"

base_url = "https://inspirehep.net/api/literature"

exclude_collabs = []


In [4]:


pdf_path = pdf_file_name


# read all text
doc = fitz.open(pdf_path)
text = " ".join(page.get_text("text") for page in doc)

# --- normalize all hyphen / dash variants ---
dash_chars = [
    "\u00ad",  # soft hyphen
    "\u2010",  # hyphen
    "\u2011",  # non-breaking hyphen
    "\u2012",  # figure dash
    "\u2013",  # en dash
    "\u2014",  # em dash
    "\u2015",  # horizontal bar
    "\u2212",  # minus sign
]
for ch in dash_chars:
    text = text.replace(ch, "-")

# remove newlines inside DOIs
text = re.sub(r'\s*-\s*(?=\d)', '-', text)  # fix broken numeric segments
text = re.sub(r'\s*\n\s*', ' ', text)        # flatten newlines

# --- extract DOIs ---
doi_pattern = r'10\.\d{4,9}/[A-Za-z0-9.\-()/]+'
dois = sorted(set(re.findall(doi_pattern, text)))

# --- extract arXiv IDs ---
# captures both old and new formats: e.g. arXiv:2509.13497 or arXiv:hep-ph/0501234
arxiv_pattern = r'arXiv:\s*[A-Za-z\-\.]*\d{4,5}\.\d{4,5}'
arxivs = sorted(set(re.findall(arxiv_pattern, text)))

# --- clean whitespace ---
arxivs = [a.replace(" ", "") for a in arxivs]

print(f"Found {len(dois)} DOIs and {len(arxivs)} arXiv IDs\n")
print("DOIs:")
for d in dois:
    print(" ", d)

print("\nArXiv preprints:")
for a in arxivs:
    print(" ", a)


Found 26 DOIs and 9 arXiv IDs

DOIs:
  10.1007/JHEP08(2025)075
  10.1016/j.nima.2024.170127
  10.1016/j.nima.2024.170171
  10.1016/j.physletb.2023.138101
  10.1088/1748-0221/18/06/C06024
  10.1103/6pmd-6dwr
  10.1103/PhysRevC.107.014907
  10.1103/PhysRevC.107.024907
  10.1103/PhysRevC.107.024914
  10.1103/PhysRevC.109.044912
  10.1103/PhysRevC.109.054910
  10.1103/PhysRevC.110.044901
  10.1103/PhysRevC.110.064905
  10.1103/PhysRevC.110.064909
  10.1103/PhysRevD.107.052012
  10.1103/PhysRevD.107.112004
  10.1103/PhysRevD.108.072016
  10.1103/PhysRevLett.130.251901
  10.1103/PhysRevLett.134.022302
  10.1103/h8d5-swg6
  10.1103/hpm9-
  10.1103/ptpm-jtt8
  10.1140/epjc/s10052-024-12987-0
  10.1140/epjc/s10052-024-13003-1
  10.1140/epjc/s10052-024-13115-8
  10.1140/epjc/s10052-024-13416-y

ArXiv preprints:
  arXiv:2303.17254
  arXiv:2409.03728
  arXiv:2411.11942
  arXiv:2412.08682
  arXiv:2412.14092
  arXiv:2504.02955
  arXiv:2507.04463
  arXiv:2507.04896
  arXiv:2509.13497


In [5]:
#if doi finishes with dashes, fix them
for i, d in enumerate(dois):
    if d.endswith('-'):
        print(i, d)
        d += 'qfp6'
    dois[i] = d

dois.append('10.1103/PhysRevC.109.044907')  # manual add

20 10.1103/hpm9-


In [6]:
records = []

# combine all identifiers
ids = []
if include_dois:
    ids.extend(dois)
if include_arxiv:
    ids.extend(arxivs)

for identifier in ids:
    # Detect whether it's a DOI or arXiv
    if identifier.lower().startswith("10."):
        query = f'doi:"{identifier}"'
    elif "arxiv:" in identifier.lower():
        query = f'arxiv_eprints.value:"{identifier.split(":")[-1]}"'
    else:
        continue

    params = {"q": query, "size": 1}

    try:
        r = requests.get(base_url, params=params, timeout=60)
        r.raise_for_status()
        data = r.json()
    except json.JSONDecodeError:
        print(f"⚠️ JSONDecodeError for {identifier}, skipping...")
        continue
    except requests.RequestException as e:
        print(f"❌ Request failed for {identifier}: {e}")
        continue

    hits = data.get("hits", {}).get("hits", [])
    if not hits:
        print(f"❌ No record found for {identifier}")
        continue

    for hit in hits:
        m = hit.get("metadata", {})

        # --- collaborations ---
        collabs = [c.get("value", "") for c in m.get("collaborations", [])]
        if any(c in exclude_collabs for c in collabs):
            continue

        # --- publication year ---
        pub_year = None
        for pub in m.get("publication_info", []):
            y = pub.get("year")
            if y and str(y).isdigit():
                pub_year = int(y)
                break
        if not pub_year:
            y = str(m.get("preprint_date", ""))[:4]
            pub_year = int(y) if y.isdigit() else None

        rec = {
            "Title": m.get("titles", [{}])[0].get("title", ""),
            "Date": pub_year,
            "DOI": m.get("dois", [{}])[0].get("value", ""),
            "ArXivID": m.get("arxiv_eprints", [{}])[0].get("value", ""),
            "Collaboration": ", ".join(c for c in collabs if c),
            "Citations": m.get("citation_count", 0),
            "DocumentType": ", ".join(m.get("document_type", [])),
            "Refereed": m.get("refereed", False),
            "Categories": ", ".join(cat.get("term", "") for cat in m.get("inspire_categories", [])),
            "ControlNumber": m.get("control_number"),
        }

        author_list = []
        for a in m.get("authors", []):
            affs = [
                aff.get("value", "")
                for aff in (a.get("affiliations", []) + a.get("raw_affiliations", []))
                if "value" in aff
            ]
            author_list.append({
                "name": a.get("full_name", ""),
                "affiliations": affs
            })
        rec["Authors"] = author_list

        records.append(rec)

    print(f"✅ Fetched metadata for: {identifier}")
    time.sleep(0.3)

print(f"\n✅ Total records collected: {len(records)}")


✅ Fetched metadata for: 10.1007/JHEP08(2025)075
✅ Fetched metadata for: 10.1016/j.nima.2024.170127
✅ Fetched metadata for: 10.1016/j.nima.2024.170171
✅ Fetched metadata for: 10.1016/j.physletb.2023.138101
✅ Fetched metadata for: 10.1088/1748-0221/18/06/C06024
✅ Fetched metadata for: 10.1103/6pmd-6dwr
✅ Fetched metadata for: 10.1103/PhysRevC.107.014907
✅ Fetched metadata for: 10.1103/PhysRevC.107.024907
✅ Fetched metadata for: 10.1103/PhysRevC.107.024914
✅ Fetched metadata for: 10.1103/PhysRevC.109.044912
✅ Fetched metadata for: 10.1103/PhysRevC.109.054910
✅ Fetched metadata for: 10.1103/PhysRevC.110.044901
✅ Fetched metadata for: 10.1103/PhysRevC.110.064905
✅ Fetched metadata for: 10.1103/PhysRevC.110.064909
✅ Fetched metadata for: 10.1103/PhysRevD.107.052012
✅ Fetched metadata for: 10.1103/PhysRevD.107.112004
✅ Fetched metadata for: 10.1103/PhysRevD.108.072016
✅ Fetched metadata for: 10.1103/PhysRevLett.130.251901
✅ Fetched metadata for: 10.1103/PhysRevLett.134.022302
✅ Fetched metada

In [7]:
if do_read_from_web:   
    # Save as CSV
    df = pd.DataFrame(records)
    df["Authors"] = df["Authors"].apply(json.dumps)  # store list of dicts safely
    df.to_csv("inspire_doi_metadata_full.csv", index=False)
    print("💾 Saved to inspire_doi_metadata_full.csv")

💾 Saved to inspire_doi_metadata_full.csv


In [8]:
if do_read_from_cvs:
    import pandas as pd, ast
    df = pd.read_csv("inspire_doi_metadata_full.csv")

    # Convert back to list of dicts
    df["Authors"] = df["Authors"].apply(ast.literal_eval)
    records = df.to_dict(orient="records")

    # Now this works again:
    #for a in records[0]["Authors"]:
    #    print(a["name"], a["affiliations"])

In [9]:
for rec in records:
    year_str = str(rec.get("Date", ""))[:4]
    if year_str.isdigit() and int(year_str) < 2023:
        print(f"{year_str}: {rec['Title']}")


In [10]:
def classify_doc_type(row):
    doi = str(row.get("DOI", "")).strip()
    doc_type = str(row.get("DocumentType", "")).lower()

    # --- pattern-based inference ---
    if any(tag in doi.lower() for tag in ["physrevlett", "physletb", "plb"]):
        inferred = "letter"
    elif re.match(r"^10\.1103/[A-Za-z0-9\-]{6,10}$", doi) and not re.search(r"physrev", doi, re.IGNORECASE):
        # short hashed DOI-like placeholder → preprint
        inferred = "preprint"
    elif "conference" in doc_type or "proceeding" in doc_type or "jphysconf" in doi.lower():
        inferred = "conference paper"
    else:
        inferred = "article"

    if not doc_type or doc_type in ["", "article", "paper"]:
        return inferred
    return doc_type

df["DocumentType_inferred"] = df.apply(classify_doc_type, axis=1)

print("✅ DOI classification preview:\n")
for doi, typ in zip(df["DOI"], df["DocumentType_inferred"]):
    print(f"{doi:50} → {typ}")

✅ DOI classification preview:

10.1007/JHEP08(2025)075                            → article
10.1016/j.nima.2024.170127                         → article
10.1016/j.nima.2024.170171                         → article
10.1016/j.physletb.2023.138101                     → letter
10.1088/1748-0221/18/06/C06024                     → conference paper
10.1103/6pmd-6dwr                                  → preprint
10.1103/PhysRevC.107.014907                        → article
10.1103/PhysRevC.107.024907                        → article
10.1103/PhysRevC.107.024914                        → article
10.1103/PhysRevC.109.044912                        → article
10.1103/PhysRevC.109.054910                        → article
10.1103/PhysRevC.110.044901                        → article
10.1103/PhysRevC.110.064905                        → article
10.1103/PhysRevC.110.064909                        → article
10.1103/PhysRevD.107.052012                        → article
10.1103/PhysRevD.107.112004                  

In [11]:
#printing titles of articel whih DocumentType_inferred is conference paper
for index, row in df.iterrows():
    if row["DocumentType_inferred"] == "conference paper":
        print(row["Title"])

The effects of a passive Bi-Polar Grid (BPG) on Ion Back-Flow (IBF) and resolution


In [12]:
df["Collaboration"].value_counts()

Collaboration
PHENIX        21
               7
H1             5
sPHENIX        2
CERES NA45     1
Name: count, dtype: int64

In [13]:
# Keywords that identify SBU affiliations
sbu_keys = ["stony brook", "suny stony brook", "state univ. of new york","suny","yitp"]

In [14]:
author_counter = Counter()
author_papers = {}  # optional: to track which papers belong to each author

for rec in records:
    title = rec["Title"]
    for author in rec["Authors"]:
        affs = author.get("affiliations", [])
        # Check if any affiliation matches SBU
        if any(any(key in aff.lower() for key in sbu_keys) for aff in affs):
            name = author["name"]
            author_counter[name] += 1
            author_papers.setdefault(name, []).append(title)

sbu_authors = author_counter
print(f"Found {len(author_counter)} authors with SBU affiliation")

Found 117 authors with SBU affiliation


In [15]:
authors_df = pd.DataFrame(
    author_counter.items(),
    columns=["Author", "Num_SBU_Papers"]
).sort_values("Num_SBU_Papers", ascending=False)

authors_df.head(20)

Unnamed: 0,Author,Num_SBU_Papers
6,"Drees, A.",30
27,"Park, S.",26
24,"Deshpande, A.",26
25,"Gal, C.",26
4,"David, G.",24
3,"Corliss, R.",23
7,"Esha, R.",23
9,"Firak, D.",23
5,"Dehmelt, K.",22
10,"Garg, P.",22


In [16]:
import re
from collections import defaultdict

def get_last_name(full_name):
    """Extract surname from 'Last, First' or 'First Last' formats."""
    if not full_name:
        return ""
    name = re.sub(r"[,]", " ", full_name).strip()
    parts = [p for p in name.split() if p]
    if not parts:
        return ""
    last = parts[0] if "," in full_name else parts[-1]
    last = re.sub(r"[^A-Za-z\-]", "", last).lower().strip()
    return last

def get_first_initial(full_name):
    """Return the first letter of the given name part (after comma or first token)."""
    if not full_name:
        return ""
    name = re.sub(r"[,]", " ", full_name).strip()
    parts = [p for p in name.split() if p]
    if not parts:
        return ""
    # if format 'Last, First'
    if "," in full_name:
        if len(parts) > 1:
            token = re.sub(r"[^A-Za-z]", "", parts[1])
            return token[0].lower() if token else ""
    # if 'First Last'
    elif len(parts) > 1:
        token = re.sub(r"[^A-Za-z]", "", parts[0])
        return token[0].lower() if token else ""
    return ""

# ------------------------------------------------------
# Combine authors: same last name + same first initial
# ------------------------------------------------------
author_groups = defaultdict(list)

for name in sorted(sbu_authors):
    last = get_last_name(name)
    first_init = get_first_initial(name)
    if last:
        key = f"{last}_{first_init}"  # e.g., "esha_r", "chen_c"
        author_groups[key].append(name)

# Show groups that contain multiple variants
for key, names in sorted(author_groups.items()):
    if len(names) > 1:
        print(f"{key}: {names}")

print(f"\nTotal unique (last+first-initial) combinations: {len(author_groups)}")


corliss_r: ['Corliss, R.', 'Corliss, Ross']
david_g: ['David, G.', 'David, Gabor']
esha_r: ['Esha, R.', 'Esha, Roli']
hemmick_t: ['Hemmick, T.', 'Hemmick, T.K.', 'Hemmick, Thomas']
jacak_b: ['Jacak, B.', 'Jacak, B.V.']
ramasubramanian_n: ['Ramasubramanian, N.', 'Ramasubramanian, N.V.']
shulga_e: ['Shulga, E.', 'Shulga, Evgeny']

Total unique (last+first-initial) combinations: 109


In [17]:
sbu_keys = [
    "stony brook",
    "suny stony brook",
    "state univ. of new york",
    "yitp",
    "yitp, stony brook",
    "stony brook u.",
    "suny, stony brook",
    "suny"
]

In [18]:
# ----------------------------------------------------
# Build unified author keys (last + first initial)
# ----------------------------------------------------
author_variants = defaultdict(list)
for name in sorted(sbu_authors):
    last = get_last_name(name)
    first_init = get_first_initial(name)
    if not last:
        continue
    key = f"{last}_{first_init}"
    author_variants[key].append(name)

sbu_authors_last = set(author_variants.keys())
print(f"Unified to {len(sbu_authors_last)} unique (last+first-initial) authors for DOE-matched list.")

# ----------------------------------------------------
# Count 2023+ SBU-affiliated papers for those authors
# ----------------------------------------------------
def year_from_record(rec):
    y = str(rec.get("Date", ""))[:4]
    return int(y) if y.isdigit() else None

records_recent = [r for r in records if (year_from_record(r) and year_from_record(r) >= 2023)]
print(f"Total records from 2023+: {len(records_recent)}")

author_recent_sbu = defaultdict(list)

for rec in records_recent:
    doi = rec.get("DOI", "").strip() or f"no_doi_{rec.get('Title','')[:120]}"
    for a in rec.get("Authors", []):
        last = get_last_name(a["name"])
        first_init = get_first_initial(a["name"])
        key = f"{last}_{first_init}"
        if key not in sbu_authors_last:
            continue

        # Collect all affiliation strings safely
        affs = []
        for aff_obj in (a.get("affiliations", []) + a.get("raw_affiliations", [])):
            if isinstance(aff_obj, dict) and "value" in aff_obj:
                affs.append(aff_obj["value"])
            elif isinstance(aff_obj, str):
                affs.append(aff_obj)

        # Match against known SBU patterns
        aff_joined = " ".join(affs).lower()
        if any(k.lower() in aff_joined for k in sbu_keys):
            author_recent_sbu[key].append(rec)

# ----------------------------------------------------
# Build DataFrame with classification by DOI pattern
# ----------------------------------------------------
def infer_doc_type(doi, doc_type):
    doi = (doi or "").lower()
    doc_type = (doc_type or "").lower()

    if any(tag in doi for tag in ["physrevlett", "physletb", "plb"]):
        return "letter"
    elif re.match(r"^10\.1103/[A-Za-z0-9\-]{6,10}$", doi) and not re.search(r"physrev", doi, re.IGNORECASE):
        return "preprint"
    elif "conference" in doc_type or "proceeding" in doc_type or "jphysconf" in doi:
        return "conference"
    return "article"

rows = []
for key, recs in author_recent_sbu.items():
    # Build a dictionary keyed by DOI, so duplicates collapse automatically
    unique_by_doi = {}

    for r in recs:
        doi = r.get("DOI", f"no_doi_{r.get('Title','')[:120]}")
        unique_by_doi[doi] = r   # overwrite duplicates safely

    # classify unique DOIs only
    num_letters = 0
    num_peer_reviewed = 0
    num_preprints = 0

    for doi, r in unique_by_doi.items():
        inferred = infer_doc_type(r.get("DOI"), r.get("DocumentType"))
        if inferred == "letter":
            num_letters += 1
        elif inferred in ["preprint", "preprint (possibly)"]:
            num_preprints += 1
        elif inferred == "article":
            num_peer_reviewed += 1

    unique_dois = set(unique_by_doi.keys())

    rows.append({
        "AuthorKey": key,
        "FullNames": ", ".join(sorted(set(author_variants.get(key, [])))),
        "Num_SBU_Papers_since2023": len(unique_dois),
        "Num_Letters": num_letters,
        "Num_PeerReviewed": num_peer_reviewed,
        "Num_Preprints": num_preprints,
        "DOIs": "; ".join(sorted(unique_dois))
    })

authors_recent_df = pd.DataFrame(rows).sort_values("Num_SBU_Papers_since2023", ascending=False)
authors_recent_df.to_csv("selected_authors_sbu_since2023.csv", index=False)

print(f"✅ Saved {len(authors_recent_df)} authors with SBU affiliation since 2023.")
authors_recent_df.head(40)


Unified to 109 unique (last+first-initial) authors for DOE-matched list.
Total records from 2023+: 36
✅ Saved 109 authors with SBU affiliation since 2023.


Unnamed: 0,AuthorKey,FullNames,Num_SBU_Papers_since2023,Num_Letters,Num_PeerReviewed,Num_Preprints,DOIs
6,drees_a,"Drees, A.",25,3,17,5,; 10.1007/JHEP08(2025)075; 10.1016/j.physletb....
21,gal_c,"Gal, C.",22,3,15,4,; 10.1016/j.physletb.2023.138101; 10.1103/6pmd...
23,park_s,"Park, S.",22,3,15,4,; 10.1016/j.physletb.2023.138101; 10.1103/6pmd...
11,hemmick_t,"Hemmick, T., Hemmick, T.K., Hemmick, Thomas",22,2,14,5,; 10.1007/JHEP08(2025)075; 10.1016/j.nima.2024...
20,deshpande_a,"Deshpande, A.",22,3,15,4,; 10.1016/j.physletb.2023.138101; 10.1103/6pmd...
3,corliss_r,"Corliss, R., Corliss, Ross",21,2,14,5,; 10.1007/JHEP08(2025)075; 10.1016/j.nima.2024...
4,david_g,"David, G., David, Gabor",21,2,14,5,; 10.1007/JHEP08(2025)075; 10.1016/j.nima.2024...
7,esha_r,"Esha, R., Esha, Roli",21,2,14,5,; 10.1007/JHEP08(2025)075; 10.1103/6pmd-6dwr; ...
9,firak_d,"Firak, D.",20,2,13,5,; 10.1007/JHEP08(2025)075; 10.1103/6pmd-6dwr; ...
10,garg_p,"Garg, P.",19,2,11,5,; 10.1007/JHEP08(2025)075; 10.1088/1748-0221/1...


In [19]:
target = "hemmick_t"  # author key to focus on (lastname_firstinitial)

for rec in records_recent:
    for a in rec["Authors"]:
        key = f"{get_last_name(a['name'])}_{get_first_initial(a['name'])}"
        if key == target:
            affs = []
            for aff in a.get("affiliations", []) + a.get("raw_affiliations", []):
                if isinstance(aff, dict) and "value" in aff:
                    affs.append(aff["value"])
                elif isinstance(aff, str):
                    affs.append(aff)
            aff_joined = " ".join(affs).lower()
            match = any(k.lower() in aff_joined for k in sbu_keys)
            print(
                f"{a['name']:25} | "
                f"{'✅ match' if match else '❌ missing'} | "
                f"{rec.get('Title', '')[:80]}"
            )
            if not match:
                print("   affiliations:", affs)


Hemmick, T.               | ✅ match | Measurement of charged hadron multiplicity in Au+Au collisions at $ \sqrt{{\text
Hemmick, Thomas           | ✅ match | Fast spark-detection system for GEM detectors
Hemmick, T.               | ✅ match | The effects of a passive Bi-Polar Grid (BPG) on Ion Back-Flow (IBF) and resoluti
Hemmick, T.K.             | ✅ match | Measurement of elliptic flow of <math><mrow><mi>J</mi><mo>/</mo><mi>ψ</mi></mrow
Hemmick, T.K.             | ✅ match | Measurement of <math><mi>ϕ</mi></math>-meson production in <math><mrow><mi>Cu</m
Hemmick, T.K.             | ✅ match | Measurements of second-harmonic Fourier coefficients from azimuthal anisotropies
Hemmick, T.K.             | ✅ match | Low-<math><msub><mi>p</mi><mi>T</mi></msub></math> direct-photon production in <
Hemmick, T.K.             | ✅ match | Nonprompt direct-photon production in <math><mrow><mtext>Au</mtext><mo>+</mo><mt
Hemmick, T.K.             | ✅ match | Identified charged-hadron production in <math

In [20]:
import requests, json

r = requests.get("https://inspirehep.net/api/literature/2645172")
m = r.json()["metadata"]

# print the entire metadata nicely formatted
print(json.dumps(m, indent=2))


{
  "citation_count": 23,
  "citation_count_without_self_citations": 9,
  "publication_info": [
    {
      "year": 2025,
      "artid": "022302",
      "material": "publication",
      "journal_issue": "2",
      "journal_title": "Phys.Rev.Lett.",
      "journal_record": {
        "$ref": "https://inspirehep.net/api/journals/1214495"
      },
      "journal_volume": "134",
      "pubinfo_freetext": "Phys. Rev. Lett. 134, 022302 (2025)"
    }
  ],
  "core": true,
  "dois": [
    {
      "value": "10.1103/PhysRevLett.134.022302",
      "source": "APS",
      "material": "publication"
    },
    {
      "value": "10.1103/PhysRevLett.134.022302",
      "source": "arXiv",
      "material": "publication"
    }
  ],
  "urls": [
    {
      "value": "https://www.bnl.gov/newsroom/news.php?a=122267",
      "description": "Brookhaven News article"
    }
  ],
  "titles": [
    {
      "title": "Disentangling Centrality Bias and Final-State Effects in the Production of High-<math display=\"inline\

In [21]:
# Load the two CSV databases
all_df = pd.read_csv("authors_sbu_since2023.csv")
sel_df = pd.read_csv("selected_authors_sbu_since2023.csv")

# Convert DOI strings to sets for easy comparison
def parse_dois(doi_str):
    if pd.isna(doi_str):
        return set()
    return set(d.strip() for d in str(doi_str).split(";") if d.strip())

all_df["DOI_set"] = all_df["DOIs"].apply(parse_dois)
sel_df["DOI_set"] = sel_df["DOIs"].apply(parse_dois)

# Convert to dicts keyed by AuthorKey for fast lookup
all_dict = all_df.set_index("AuthorKey")["DOI_set"].to_dict()
sel_dict = sel_df.set_index("AuthorKey")["DOI_set"].to_dict()

# Compare for each selected author
missing_records = []
for author_key, sel_dois in sel_dict.items():
    all_dois = all_dict.get(author_key, set())
    missing = all_dois - sel_dois
    if missing:
        missing_records.append({
            "AuthorKey": author_key,
            "Missing_Count": len(missing),
            "Missing_DOIs": "; ".join(sorted(missing))
        })

missing_df = pd.DataFrame(missing_records).sort_values("Missing_Count", ascending=False)
missing_df.to_csv("missing_articles_by_author.csv", index=False)

print(f"✅ Saved comparison: {len(missing_df)} authors have missing articles.")
print(missing_df.head(20))


✅ Saved comparison: 73 authors have missing articles.
        AuthorKey  Missing_Count  \
54          sun_c             36   
4     deshpande_a             23   
68     bernauer_j             21   
71        cline_e             18   
2          park_s             12   
72        datta_j              8   
37       mondal_m              4   
7          esha_r              4   
11      dehmelt_k              4   
13       doomra_v              3   
1           gal_c              3   
9          garg_p              3   
23  khachatryan_v              3   
60         chen_c              2   
8         firak_d              2   
6         david_g              2   
66          wei_r              1   
44       cronin_n              1   
45    cervantes_r              1   
46        dixit_d              1   

                                         Missing_DOIs  
54  10.1007/JHEP06(2023)176; 10.1007/JHEP10(2024)1...  
4   10.1007/s41781-024-00113-4; 10.1016/j.nuclphys...  
68  10.1007/s41781-02

In [22]:
author = "drees_a"
all_dois = all_dict.get(author, set())
sel_dois = sel_dict.get(author, set())
missing = all_dois - sel_dois

print(f"Missing DOIs for {author}:")
for d in sorted(missing):
    print("  ", d)


Missing DOIs for drees_a:
   10.5506/APhysPolBSupp.16.1-A7
