In [1]:
import requests, json, time
import pandas as pd
from collections import Counter, defaultdict

In [2]:
do_read_from_web = True
do_read_from_cvs = not do_read_from_web

In [3]:
base_url = "https://inspirehep.net/api/literature"
url = base_url
query = (
    '(aff:"Stony Brook U." OR aff:"SUNY, Stony Brook" OR aff:"YITP, Stony Brook" '
    'OR fulltext:"Stony Brook University" OR fulltext:"stony brook") '
    'and date 2023->2025'
)
exclude_collabs = {"ATLAS", "CMS"}

params = {
    "q": query,
    "size": 100,          # bigger pages = fewer requests
    "sort": "mostrecent",
    "cursor": "*",        # enable cursor mode
}

records = []
page = 1

In [4]:
if do_read_from_web:
    while True:
        try:
            r = requests.get(url, params=params if page == 1 else None, timeout=60)
            r.raise_for_status()
            data = r.json()
        except json.JSONDecodeError:
            print(f"⚠️  JSONDecodeError on page {page}, retrying smaller page size...")
            params["size"] = 50
            time.sleep(1)
            continue
        except requests.RequestException as e:
            print(f"❌ Request failed on page {page}: {e}")
            break

        hits = data.get("hits", {}).get("hits", [])
        if not hits:
            break

        for hit in hits:
            m = hit.get("metadata", {})
            collabs = [c.get("value", "") for c in m.get("collaborations", [])]
            if any(c in exclude_collabs for c in collabs):
                continue

            rec = {
                "Title": m.get("titles", [{}])[0].get("title", ""),
                "Date": m.get("preprint_date") or m.get("publication_info", [{}])[0].get("year"),
                "DOI": m.get("dois", [{}])[0].get("value", ""),
                "Collaboration": ", ".join(c for c in collabs if c),
                "Citations": m.get("citation_count", 0),
                "DocumentType": ", ".join(m.get("document_type", [])),
                "Refereed": m.get("refereed", False),
                "ArXivID": m.get("arxiv_eprints", [{}])[0].get("value", ""),
                "Categories": ", ".join(cat.get("term", "") for cat in m.get("inspire_categories", [])),
                "ControlNumber": m.get("control_number"),
            }

            author_list = []
            for a in m.get("authors", []):
                affs = [
                    aff.get("value", "")
                    for aff in (a.get("affiliations", []) + a.get("raw_affiliations", []))
                    if "value" in aff
                ]
                author_list.append({
                    "name": a.get("full_name", ""),
                    "affiliations": affs
                })
            rec["Authors"] = author_list

            records.append(rec)

        print(f"Fetched batch {page} with {len(hits)} records...")
        page += 1
        time.sleep(0.3)

        next_url = data.get("links", {}).get("next")
        if not next_url:
            break
        url = next_url

    print(f"✅ Total records collected: {len(records)}")

Fetched batch 1 with 100 records...
Fetched batch 2 with 100 records...
Fetched batch 3 with 100 records...
Fetched batch 4 with 100 records...
Fetched batch 5 with 100 records...
Fetched batch 6 with 100 records...
Fetched batch 7 with 100 records...
Fetched batch 8 with 100 records...
Fetched batch 9 with 100 records...
Fetched batch 10 with 100 records...
Fetched batch 11 with 100 records...
Fetched batch 12 with 100 records...
Fetched batch 13 with 100 records...
Fetched batch 14 with 100 records...
Fetched batch 15 with 100 records...
Fetched batch 16 with 100 records...
Fetched batch 17 with 100 records...
Fetched batch 18 with 100 records...
Fetched batch 19 with 100 records...
Fetched batch 20 with 100 records...
Fetched batch 21 with 100 records...
Fetched batch 22 with 100 records...
Fetched batch 23 with 100 records...
Fetched batch 24 with 100 records...
Fetched batch 25 with 89 records...
✅ Total records collected: 2109


In [5]:
import requests, json

r = requests.get("https://inspirehep.net/api/literature/2865177")
m = r.json()["metadata"]

# print the entire metadata nicely formatted
print(json.dumps(m, indent=2))


{
  "citation_count": 0,
  "publication_info": [
    {
      "year": 2025,
      "artid": "170171",
      "material": "publication",
      "journal_title": "Nucl.Instrum.Meth.A",
      "journal_record": {
        "$ref": "https://inspirehep.net/api/journals/1613981"
      },
      "journal_volume": "1072"
    }
  ],
  "citation_count_without_self_citations": 0,
  "core": true,
  "dois": [
    {
      "value": "10.1016/j.nima.2024.170171",
      "source": "Elsevier B.V.",
      "material": "publication"
    }
  ],
  "titles": [
    {
      "title": "Fast spark-detection system for GEM detectors",
      "source": "Elsevier B.V."
    }
  ],
  "$schema": "https://inspirehep.net/schemas/records/hep.json",
  "authors": [
    {
      "uuid": "9abfbf8d-5c91-41d5-afa6-38890ac7ebbd",
      "record": {
        "$ref": "https://inspirehep.net/api/authors/2865178"
      },
      "full_name": "Baranyai, David",
      "affiliations": [
        {
          "value": "Debrecen U.",
          "record": {

In [6]:
if do_read_from_web: 
    
    with open("inspire_full_sbu_2022_2025.json", "w") as f:
        json.dump(records, f, indent=2)
    
    # Optionally also make a simplified DataFrame for quick viewing
    df = pd.DataFrame([{
        "Title": r["Title"],
        "Date": r["Date"],
        "Collaboration": r["Collaboration"],
        "Citations": r["Citations"],
        "NumAuthors": len(r["Authors"])
    } for r in records])
    
    df.to_csv("inspire_full_sbu_2022_2025.csv", index=False)
    df.head()

In [7]:
if do_read_from_cvs:
    df = pd.read_csv("inspire_sbu_2022_2025.csv")
    print(f"Total records from CSV: {len(df)}")
    df.head(10)
    with open("inspire_full_sbu_2022_2025.json") as f:
        records = json.load(f)

In [8]:
df["Collaboration"].value_counts()

Collaboration
                                 1615
IceCube                           160
STAR                               55
PHENIX                             27
DES                                26
                                 ... 
DES, FERMI-LAT                      1
VERITAS, IceCube                    1
Simons Observatory                  1
Double Chooz                        1
LIGO Scientific, Virgo, VIRGO       1
Name: count, Length: 104, dtype: int64

In [9]:
# Keep only records with collaboration containing 'PHENIX' or 'sPHENIX'
df_filtered = df[df['Collaboration'].str.contains(r'\b(s?PHENIX)\b', case=False, na=False)]

# Optional: reset index for cleanliness
df_filtered = df_filtered.reset_index(drop=True)

# Display
print(f"Filtered to {len(df_filtered)} PHENIX/sPHENIX records")
df_filtered.head(10)

Filtered to 29 PHENIX/sPHENIX records


  df_filtered = df[df['Collaboration'].str.contains(r'\b(s?PHENIX)\b', case=False, na=False)]


Unnamed: 0,Title,Date,Collaboration,Citations,NumAuthors
0,Transverse single-spin asymmetry of forward $η...,2025-09-16,PHENIX,0,383
1,Cross sections of $\eta$ mesons in $p$ $+$ $p$...,2025-07-07,PHENIX,1,500
2,Low-mass vector-meson production at forward ra...,2025-07-06,PHENIX,0,355
3,Measurement of inclusive jet cross section and...,2025-06-15,PHENIX,1,447
4,Azimuthal anisotropy of direct photons in Au$+...,2025-04-03,PHENIX,2,325
5,Measurement of charged hadron multiplicity in ...,2025-04-02,sPHENIX,1,305
6,Measurement of the transverse energy density i...,2025-04-02,sPHENIX,0,305
7,Measurements at forward rapidity of elliptic f...,2024-09-19,PHENIX,0,370
8,Measurement of elliptic flow of <math><mrow><m...,2024-09-19,PHENIX,2,370
9,"Multiplicity dependent <math display=""inline"">...",2024-09-05,PHENIX,6,303


In [10]:
# Keywords that identify SBU affiliations
sbu_keys = ["stony brook", "suny stony brook", "state univ. of new york"]

In [11]:
author_counter = Counter()
author_papers = {}  # optional: to track which papers belong to each author

for rec in records:
    title = rec["Title"]
    for author in rec["Authors"]:
        affs = author.get("affiliations", [])
        # Check if any affiliation matches SBU
        if any(any(key in aff.lower() for key in sbu_keys) for aff in affs):
            name = author["name"]
            author_counter[name] += 1
            author_papers.setdefault(name, []).append(title)

print(f"Found {len(author_counter)} authors with SBU affiliation")

Found 828 authors with SBU affiliation


In [25]:
authors_df = pd.DataFrame(
    author_counter.items(),
    columns=["Author", "Num_SBU_Papers"]
).sort_values("Num_SBU_Papers", ascending=False)

authors_df.head(20)

Unnamed: 0,Author,Num_SBU_Papers
74,"Chen, Z.",64
75,"Kiryluk, J.",64
76,"Zhang, Z.",64
401,"Hamdaoui, H.",52
175,"Jia, Jiangyong",47
4,"Zahed, Ismail",47
142,"Deshpande, A.",46
214,"Wei, Tzu-Chieh",38
70,"Park, S.",37
7,"Wang, Jin",35


In [32]:
authors_df[authors_df["Author"].isin(['Hemmick, T.', 'Hemmick, T.K.', 'Hemmick, Thomas'])]

Unnamed: 0,Author,Num_SBU_Papers
153,"Hemmick, T.K.",23
351,"Hemmick, T.",3
426,"Hemmick, Thomas",1


In [13]:
authors_full = pd.DataFrame([
    {"Author": a, 
     "Num_SBU_Papers": len(set(author_papers[a])),
     "Titles": "; ".join(sorted(set(author_papers[a])))}
    for a in author_papers
]).sort_values("Num_SBU_Papers", ascending=False)

authors_full.to_csv("inspire_authors_sbu_summary.csv", index=False)

In [14]:
import fitz  # PyMuPDF
import re
pdf_file_name = "/home/yoren/yumvd.Yandex.Disk/Yura/Personal/grants/DOE_progress_report_publications_2023_2026.pdf"

In [15]:
import fitz
import re

pdf_path = pdf_file_name


# read all text
doc = fitz.open(pdf_path)
text = " ".join(page.get_text("text") for page in doc)

# --- normalize all hyphen / dash variants ---
dash_chars = [
    "\u00ad",  # soft hyphen
    "\u2010",  # hyphen
    "\u2011",  # non-breaking hyphen
    "\u2012",  # figure dash
    "\u2013",  # en dash
    "\u2014",  # em dash
    "\u2015",  # horizontal bar
    "\u2212",  # minus sign
]
for ch in dash_chars:
    text = text.replace(ch, "-")

# remove newlines inside DOIs
text = re.sub(r'\s*-\s*(?=\d)', '-', text)  # fix broken numeric segments
text = re.sub(r'\s*\n\s*', ' ', text)        # flatten newlines

# --- extract DOIs ---
pattern = r'10\.\d{4,9}/[A-Za-z0-9.\-()/]+'
dois = re.findall(pattern, text)

dois = sorted(set(dois))
print(f"Found {len(dois)} DOIs:")
for d in dois:
    print(d)


Found 26 DOIs:
10.1007/JHEP08(2025)075
10.1016/j.nima.2024.170127
10.1016/j.nima.2024.170171
10.1016/j.physletb.2023.138101
10.1088/1748-0221/18/06/C06024
10.1103/6pmd-6dwr
10.1103/PhysRevC.107.014907
10.1103/PhysRevC.107.024907
10.1103/PhysRevC.107.024914
10.1103/PhysRevC.109.044912
10.1103/PhysRevC.109.054910
10.1103/PhysRevC.110.044901
10.1103/PhysRevC.110.064905
10.1103/PhysRevC.110.064909
10.1103/PhysRevD.107.052012
10.1103/PhysRevD.107.112004
10.1103/PhysRevD.108.072016
10.1103/PhysRevLett.130.251901
10.1103/PhysRevLett.134.022302
10.1103/h8d5-swg6
10.1103/hpm9-
10.1103/ptpm-jtt8
10.1140/epjc/s10052-024-12987-0
10.1140/epjc/s10052-024-13003-1
10.1140/epjc/s10052-024-13115-8
10.1140/epjc/s10052-024-13416-y


In [16]:
import re

# --- Helper: normalize DOIs to a consistent comparable form ---
def normalize_doi(d):
    """Lowercase, strip, and remove common prefixes."""
    if not d:
        return ""
    d = d.lower().strip()
    d = d.replace("https://doi.org/", "")
    d = d.replace("http://doi.org/", "")
    d = d.replace("doi:", "")
    d = re.sub(r"\s+", "", d)   # remove spaces and line breaks
    return d

# --- Normalize DOE list of DOIs ---
doe_dois = [normalize_doi(d) for d in dois]

# --- Define Stony Brook affiliation keywords ---
sbu_keys = [
    "stony brook",
    "suny stony brook",
    "state univ. of new york",
    "yitp",
    "yitp, stony brook",
    "stony brook u.",
    "suny, stony brook",
    "suny"
]

# --- Step 1: find all matched records by normalized DOI ---
matched_records = []
for r in records:
    rec_doi = normalize_doi(r.get("DOI", ""))
    if rec_doi and rec_doi in doe_dois:
        matched_records.append(r)

print(f"Matched {len(matched_records)} publications between DOE list and database.")

# --- Diagnostic: find unmatched DOIs (to catch hidden mismatches) ---
unmatched_dois = [
    d for d in doe_dois
    if not any(d == normalize_doi(r.get("DOI", "")) for r in records)
]
if unmatched_dois:
    print(f"\n⚠️  {len(unmatched_dois)} DOIs from DOE list not found exactly in records. Close matches:")
    for d in unmatched_dois:
        close = [
            r.get("DOI", "") for r in records
            if normalize_doi(d) in normalize_doi(r.get("DOI", "")) or normalize_doi(r.get("DOI", "")) in normalize_doi(d)
        ]
        print(" ", d, "→", close)
else:
    print("✅ All DOIs matched successfully.")

# --- Step 2: extract all authors with at least one SBU affiliation ---
sbu_authors = set()

for rec in matched_records:
    for a in rec.get("Authors", []):
        affs = a.get("affiliations", [])
        # If any affiliation string matches SBU keywords
        if any(any(k in aff.lower() for k in sbu_keys) for aff in affs):
            sbu_authors.add(a["name"])

print(f"\nFound {len(sbu_authors)} unique authors with SBU affiliation in matched publications.")
for name in sorted(sbu_authors):
    print("-", name)


Matched 25 publications between DOE list and database.

⚠️  1 DOIs from DOE list not found exactly in records. Close matches:
  10.1103/hpm9- → ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '10.110

In [17]:
import re
from collections import defaultdict

def get_last_name(full_name):
    """Extract surname from 'Last, First' or 'First Last' formats."""
    if not full_name:
        return ""
    name = re.sub(r"[,]", " ", full_name).strip()
    parts = [p for p in name.split() if p]
    if not parts:
        return ""
    last = parts[0] if "," in full_name else parts[-1]
    last = re.sub(r"[^A-Za-z\-]", "", last).lower().strip()
    return last

def get_first_initial(full_name):
    """Return the first letter of the given name part (after comma or first token)."""
    if not full_name:
        return ""
    name = re.sub(r"[,]", " ", full_name).strip()
    parts = [p for p in name.split() if p]
    if not parts:
        return ""
    # if format 'Last, First'
    if "," in full_name:
        if len(parts) > 1:
            token = re.sub(r"[^A-Za-z]", "", parts[1])
            return token[0].lower() if token else ""
    # if 'First Last'
    elif len(parts) > 1:
        token = re.sub(r"[^A-Za-z]", "", parts[0])
        return token[0].lower() if token else ""
    return ""

# ------------------------------------------------------
# Combine authors: same last name + same first initial
# ------------------------------------------------------
author_groups = defaultdict(list)

for name in sorted(sbu_authors):
    last = get_last_name(name)
    first_init = get_first_initial(name)
    if last:
        key = f"{last}_{first_init}"  # e.g., "esha_r", "chen_c"
        author_groups[key].append(name)

# Show groups that contain multiple variants
for key, names in sorted(author_groups.items()):
    if len(names) > 1:
        print(f"{key}: {names}")

print(f"\nTotal unique (last+first-initial) combinations: {len(author_groups)}")


corliss_r: ['Corliss, R.', 'Corliss, Ross']
david_g: ['David, G.', 'David, Gabor']
esha_r: ['Esha, R.', 'Esha, Roli']
hemmick_t: ['Hemmick, T.', 'Hemmick, T.K.', 'Hemmick, Thomas']
jacak_b: ['Jacak, B.', 'Jacak, B.V.']
shulga_e: ['Shulga, E.', 'Shulga, Evgeny']

Total unique (last+first-initial) combinations: 102


In [42]:
# ----------------------------------------------------
# Build unified author keys (last + first initial)
# ----------------------------------------------------
author_variants = defaultdict(list)
for name in sorted(sbu_authors):
    last = get_last_name(name)
    first_init = get_first_initial(name)
    if not last:
        continue
    key = f"{last}_{first_init}"
    author_variants[key].append(name)

sbu_authors_last = set(author_variants.keys())
print(f"Unified to {len(sbu_authors_last)} unique (last+first-initial) authors for DOE-matched list.")

# ----------------------------------------------------
# Count 2023+ SBU-affiliated papers for those authors
# ----------------------------------------------------
def year_from_record(rec):
    y = str(rec.get("Date", ""))[:4]
    return int(y) if y.isdigit() else None

records_recent = [r for r in records if (year_from_record(r) and year_from_record(r) >= 2023)]
print(f"Total records from 2023+: {len(records_recent)}")

author_recent_sbu = defaultdict(list)

for rec in records_recent:
    doi = rec.get("DOI", "").strip() or f"no_doi_{rec.get('Title','')[:30]}"
    for a in rec.get("Authors", []):
        last = get_last_name(a["name"])
        first_init = get_first_initial(a["name"])
        key = f"{last}_{first_init}"
        if key not in sbu_authors_last:
            continue

        # 🔹 Collect all affiliation strings safely
        affs = []
        for aff_obj in (a.get("affiliations", []) + a.get("raw_affiliations", [])):
            if isinstance(aff_obj, dict) and "value" in aff_obj:
                affs.append(aff_obj["value"])
            elif isinstance(aff_obj, str):
                affs.append(aff_obj)

        # 🔹 Match against lowercase variants of known SBU keywords
        aff_joined = " ".join(affs).lower()
        if any(k.lower() in aff_joined for k in sbu_keys):
            author_recent_sbu[key].append(doi)



# ----------------------------------------------------
# Build DataFrame
# ----------------------------------------------------
authors_recent_df = pd.DataFrame([
    {
        "AuthorKey": key,
        "FullNames": ", ".join(sorted(set(author_variants.get(key, [])))),
        "Num_SBU_Papers_since2023": len(set(dois)),
        "DOIs": "; ".join(sorted(set(dois)))
    }
    for key, dois in author_recent_sbu.items()
]).sort_values("Num_SBU_Papers_since2023", ascending=False)

authors_recent_df.to_csv("authors_sbu_since2023.csv", index=False)

print(f"Saved {len(authors_recent_df)} authors with SBU affiliation since 2023.")
authors_recent_df.head(40)

Unified to 102 unique (last+first-initial) authors for DOE-matched list.
Total records from 2023+: 1746
Saved 101 authors with SBU affiliation since 2023.


Unnamed: 0,AuthorKey,FullNames,Num_SBU_Papers_since2023,DOIs
12,deshpande_a,"Deshpande, A.",42,10.1007/s41781-024-00113-4; 10.1016/j.nuclphys...
82,sun_c,"Sun, C.",29,10.1007/JHEP06(2023)176; 10.1007/JHEP10(2024)1...
1,park_s,"Park, S.",27,10.1016/j.nuclphysa.2024.122874; 10.1016/j.phy...
14,drees_a,"Drees, A.",24,10.1007/JHEP08(2025)075; 10.1016/j.physletb.20...
10,david_g,"David, G., David, Gabor",21,10.1007/JHEP08(2025)075; 10.1016/j.nima.2024.1...
17,gal_c,"Gal, C.",21,10.1016/j.physletb.2023.138101; 10.1103/6pmd-6...
15,esha_r,"Esha, R., Esha, Roli",20,10.1007/JHEP06(2023)176; 10.1007/JHEP08(2025)0...
73,bernauer_j,"Bernauer, J.C.",19,10.1007/JHEP08(2025)075; 10.1007/s41781-024-00...
8,corliss_r,"Corliss, R., Corliss, Ross",18,10.1007/JHEP08(2025)075; 10.1016/j.nima.2024.1...
11,dehmelt_k,"Dehmelt, K.",18,10.1007/JHEP08(2025)075; 10.1088/1748-0221/18/...


In [41]:
target = "hemmick_t"  # author key to focus on (lastname_firstinitial)

for rec in records_recent:
    for a in rec["Authors"]:
        key = f"{get_last_name(a['name'])}_{get_first_initial(a['name'])}"
        if key == target:
            affs = []
            for aff in a.get("affiliations", []) + a.get("raw_affiliations", []):
                if isinstance(aff, dict) and "value" in aff:
                    affs.append(aff["value"])
                elif isinstance(aff, str):
                    affs.append(aff)
            aff_joined = " ".join(affs).lower()
            match = any(k.lower() in aff_joined for k in sbu_keys)
            print(
                f"{a['name']:25} | "
                f"{'✅ match' if match else '❌ missing'} | "
                f"{rec.get('Title', '')[:80]}"
            )
            if not match:
                print("   affiliations:", affs)




Hemmick, T.K.             | ✅ match | Transverse single-spin asymmetry of forward $η$ mesons in $p^{\uparrow}+ p$ coll
Hemmick, T.K.             | ✅ match | Cross sections of $\eta$ mesons in $p$ $+$ $p$ collisions at forward rapidity at
Hemmick, T.K.             | ✅ match | Low-mass vector-meson production at forward rapidity in $p$$+$$p$ and Au$+$Au co
Hemmick, T.K.             | ✅ match | Measurement of inclusive jet cross section and substructure in <math display="in
Hemmick, T.K.             | ✅ match | Azimuthal anisotropy of direct photons in Au$+$Au collisions at $\sqrt{s_{_{NN}}
Hemmick, T.               | ✅ match | Measurement of charged hadron multiplicity in Au+Au collisions at $ \sqrt{{\text
Hemmick, T.               | ✅ match | Measurement of the transverse energy density in <math><mrow><mi>Au</mi><mo>+</mo
Hemmick, Thomas           | ✅ match | Fast spark-detection system for GEM detectors
Hemmick, T.K.             | ✅ match | Measurements at forward rapidity of elliptic 