### Create `openalex.works.work_references` table (if not exists)

In [0]:
%sql
-- Bootstrap openalex.works.work_references: one row per reference of each
-- openalex.works.locations_mapped record (posexplode yields the array position as
-- ref_ind and the element as ref).
-- The columns that later cells fill in (cited_work_id, normalized_title, parsed_doi,
-- parsed_first_author, parsed_title, title_author) are created here as typed NULLs so
-- that the explicit column list of the incremental INSERT resolves against a freshly
-- created table. Their types mirror the CAST(null AS ...) placeholders used by that INSERT.
-- IF NOT EXISTS: an already existing table is left untouched by this cell.
CREATE TABLE IF NOT EXISTS openalex.works.work_references
CLUSTER BY AUTO
AS
  WITH references_exploded AS (
    SELECT
      native_id,
      native_id_namespace,
      work_id AS citing_work_id,
      provenance,
      posexplode(references) AS (ref_ind, ref)
    FROM openalex.works.locations_mapped
  )
  SELECT
    native_id,
    native_id_namespace,
    citing_work_id,
    CAST(NULL AS BIGINT) AS cited_work_id,        -- linked later by the MERGE cells
    ref_ind,
    ref.doi AS doi,
    ref.pmid AS pmid,
    ref.arxiv AS arxiv,
    ref.title AS title,
    CAST(NULL AS STRING) AS normalized_title,     -- calculated later
    ref.authors AS authors,
    ref.year AS year,
    ref.raw AS raw,
    CAST(NULL AS STRING) AS parsed_doi,           -- calculated later
    CAST(NULL AS STRING) AS parsed_first_author,  -- calculated later
    CAST(NULL AS STRING) AS parsed_title,         -- calculated later
    CAST(NULL AS STRING) AS title_author,         -- calculated later
    provenance,
    current_timestamp() AS created_timestamp,
    current_timestamp() AS updated_timestamp
  FROM references_exploded


### Insert fresh records for Parsing into `work_references`

In [0]:
%sql
-- Incremental load of references into work_references.
-- Selection rule: a locations_mapped row is processed only when its work_id does not
--   yet occur as citing_work_id in work_references. The check is per work, not per
--   (citing_work_id, ref_ind): once a work has any reference row, none of its
--   references are inserted again.
-- Rationale: references don't change often; insert-only avoids data shift issues.
-- To fully re-process a work, delete its references first (template at the bottom of
--   this cell), then run the insert again.
-- NOTE(review): a NULL work_id never matches the existence check, so such rows would be
--   selected on every run -- confirm work_id is always populated in locations_mapped.
-- Run log 10/23:
--     84,848,991 records inserted
--     3,758,687,070 total records
--     1,526,343,813 cited_work_id is NULL

INSERT INTO openalex.works.work_references (
  native_id,
  native_id_namespace,
  citing_work_id,
  cited_work_id,
  ref_ind,
  doi,
  pmid,
  arxiv,
  title,
  normalized_title,
  authors,
  year,
  raw,
  parsed_doi,
  parsed_first_author,
  parsed_title,
  title_author,
  provenance,
  created_timestamp,
  updated_timestamp
)
WITH pending_refs AS (
  -- One output row per array element; ref_ind is the element's position in the array.
  SELECT
    loc.native_id,
    loc.native_id_namespace,
    loc.work_id AS citing_work_id,
    loc.provenance,
    posexplode(loc.references) AS (ref_ind, ref)
  FROM openalex.works.locations_mapped AS loc
  WHERE NOT EXISTS (
    SELECT 1
    FROM openalex.works.work_references AS existing
    WHERE existing.citing_work_id = loc.work_id
  )
)
SELECT
  native_id,
  native_id_namespace,
  citing_work_id,
  CAST(NULL AS BIGINT) AS cited_work_id,        -- calculated later
  ref_ind,
  ref.doi AS doi,
  ref.pmid AS pmid,
  ref.arxiv AS arxiv,
  ref.title AS title,
  CAST(NULL AS STRING) AS normalized_title,     -- calculated later
  ref.authors AS authors,
  ref.year AS year,
  ref.raw AS raw,
  CAST(NULL AS STRING) AS parsed_doi,           -- calculated later
  CAST(NULL AS STRING) AS parsed_first_author,  -- calculated later
  CAST(NULL AS STRING) AS parsed_title,         -- calculated later
  CAST(NULL AS STRING) AS title_author,         -- calculated later
  provenance,
  current_timestamp() AS created_timestamp,
  current_timestamp() AS updated_timestamp
FROM pending_refs;
-- Template for full re-processing of specific works (run before the insert):
-- DELETE FROM openalex.works.work_references
-- WHERE citing_work_id IN (SELECT work_id FROM openalex.works.locations_mapped WHERE ...);

### Merge with `work_id_map.doi`

In [0]:
%sql
-- ============================================================
-- STEP 2: Link references to cited works via DOI matching
-- ============================================================
-- Source: work_id_map collapsed to one row per lower-cased DOI
--   (MIN of paper_id / id / pmid, MAX of title_author).
-- Match:  case-insensitive DOI equality between a reference and the collapsed map.
-- Update: only references whose cited_work_id is still NULL.
--   cited_work_id  <- paper_id when present, otherwise id
--   pmid           <- the map's pmid when present, otherwise unchanged
--   title_author   <- the map's title_author when present, otherwise unchanged
-- Run log 10/23: added 29,913,749 IDs for 84,848,991 new records, recovered 39,496 pmid

MERGE INTO openalex.works.work_references AS refs
USING (
  SELECT
    LOWER(doi)        AS doi,
    MIN(paper_id)     AS paper_id,
    MIN(id)           AS work_id,
    MIN(pmid)         AS pmid,
    MAX(title_author) AS title_author
  FROM openalex.works.work_id_map
  WHERE doi IS NOT NULL
  GROUP BY LOWER(doi)
) AS doi_map
  ON LOWER(refs.doi) = doi_map.doi
WHEN MATCHED AND refs.cited_work_id IS NULL THEN
  UPDATE SET
    refs.cited_work_id     = COALESCE(doi_map.paper_id, doi_map.work_id),
    refs.pmid              = COALESCE(doi_map.pmid, refs.pmid),
    refs.title_author      = COALESCE(doi_map.title_author, refs.title_author),
    refs.updated_timestamp = current_timestamp();

### Merge with `work_id_map.pmid`

In [0]:
%sql
-- Link references to cited works via PMID matching.
-- Source: work_id_map rows that have a PMID but no DOI, collapsed to one row per
--   lower-cased PMID. Grouping uses LOWER(pmid) -- the same expression that is selected
--   and joined on -- so each join key appears exactly once in the source; grouping by the
--   raw column could emit several rows per key and make the MERGE fail on a multi-match.
-- Update: only references whose cited_work_id is still NULL.
-- Run log 10/23: added 11,311,131 IDs (where doi was null) for 84,848,991 new records
MERGE INTO openalex.works.work_references AS target
USING (
  SELECT
    LOWER(pmid) AS pmid,
    MIN(paper_id) AS paper_id,
    MIN(id) AS work_id,
    MAX(title_author) AS title_author
  FROM openalex.works.work_id_map
  WHERE pmid IS NOT NULL AND doi IS NULL -- keep doi is null because it otherwise adds a lot of erroneous refs
  GROUP BY LOWER(pmid)
) AS source
ON LOWER(target.pmid) = source.pmid
WHEN MATCHED AND target.cited_work_id IS NULL
THEN UPDATE SET
  target.cited_work_id = COALESCE(source.paper_id, source.work_id),
  target.title_author = COALESCE(source.title_author, target.title_author), -- bring it in if exists
  target.updated_timestamp = current_timestamp();

### Merge with `work_id_map.title_author`

In [0]:
%sql
-- NOTE(review): this cell sits under the "Merge with work_id_map.title_author" heading,
--   but it joins on PMID exactly like the PMID merge cell and never compares
--   title_author. Confirm whether a title_author-keyed match was intended here.
-- As written: link references to cited works via PMID, using work_id_map rows that have
--   a PMID but no DOI, collapsed to one row per lower-cased PMID. Grouping uses
--   LOWER(pmid) -- the same expression that is selected and joined on -- so each join key
--   appears exactly once in the source; grouping by the raw column could emit several
--   rows per key and make the MERGE fail on a multi-match.
-- Update: only references whose cited_work_id is still NULL.
-- Run log 10/23: added 11,311,131 IDs for 84,848,991 new records
MERGE INTO openalex.works.work_references AS target
USING (
  SELECT
    LOWER(pmid) AS pmid,
    MIN(paper_id) AS paper_id,
    MIN(id) AS work_id,
    MAX(title_author) AS title_author
  FROM openalex.works.work_id_map
  WHERE pmid IS NOT NULL AND doi IS NULL
  GROUP BY LOWER(pmid)
) AS source
ON LOWER(target.pmid) = source.pmid
WHEN MATCHED AND target.cited_work_id IS NULL
THEN UPDATE SET
  target.cited_work_id = COALESCE(source.paper_id, source.work_id),
  target.title_author = COALESCE(source.title_author, target.title_author), -- bring it in if exists
  target.updated_timestamp = current_timestamp();

In [0]:
%sql
  -- List locations_parsed records that carry a usable references array.
  -- A record is kept when the array is non-NULL and non-empty, unless it consists of a
  -- single element whose doi, pmid, title, arxiv and raw are all NULL (an empty stub).
  SELECT
    provenance,
    native_id,
    native_id_namespace,
    references
  FROM openalex.works.locations_parsed
  WHERE references IS NOT NULL
    AND size(references) > 0
    AND (
      size(references) <> 1
      OR references[0].doi IS NOT NULL
      OR references[0].pmid IS NOT NULL
      OR references[0].title IS NOT NULL
      OR references[0].arxiv IS NOT NULL
      OR references[0].raw IS NOT NULL
    )

In [0]:
%sql
-- Per-provenance coverage report for the references arrays in locations_parsed.
-- filtered_refs keeps records with a non-empty references array, excluding arrays that
-- consist of a single element whose doi, pmid, title, arxiv and raw are all NULL, and
-- counts the non-NULL values of each field per record (array_compact drops NULLs).
-- The final SELECT reports, per provenance, the record count, the total number of
-- references and field values, and how many records have at least one value per field.
-- Rows are ordered by the numeric record count: the displayed counts are
-- format_number() strings, and ordering by them would sort lexicographically.
WITH filtered_refs AS (
  SELECT
    provenance,
    native_id,
    native_id_namespace,
    references,
    size(array_compact(references.doi)) AS num_dois,
    size(array_compact(references.pmid)) AS num_pmids,
    size(array_compact(references.title)) AS num_titles,
    size(array_compact(references.raw)) AS num_raw,
    size(array_compact(references.authors)) AS num_authors
  FROM openalex.works.locations_parsed
  WHERE references IS NOT NULL AND size(references) > 0
    AND NOT (size(references) = 1 AND references[0].doi IS NULL
      AND references[0].pmid IS NULL AND references[0].title IS NULL
      AND references[0].arxiv IS NULL AND references[0].raw IS NULL)
)
SELECT
  provenance,
  format_number(count(*), 0) AS total_record_count,
  format_number(sum(size(references)), 0) AS total_reference_count,
  format_number(sum(num_dois), 0) AS total_dois,
  format_number(sum(num_pmids), 0) AS total_pmids,
  format_number(sum(num_titles), 0) AS total_titles,
  format_number(sum(num_authors), 0) AS total_authors,
  format_number(sum(num_raw), 0) AS total_raw_strings,
  format_number(count_if(num_dois > 0), 0) AS has_any_dois,
  format_number(count_if(num_pmids > 0), 0) AS has_any_pmids,
  format_number(count_if(num_titles > 0), 0) AS has_any_titles,
  format_number(count_if(num_authors > 0), 0) AS has_any_authors,
  format_number(count_if(num_raw > 0), 0) AS has_any_raw_strings
FROM filtered_refs
GROUP BY provenance
ORDER BY count(*) DESC

### Aggregate before `openalex_works`

In [0]:
%sql
-- Build openalex.works.referenced_works: one row per citing work with the array of
-- linked cited_work_id values, ordered by ref_ind (the structs are sorted on
-- (ref_ind, cited_work_id) and then projected to cited_work_id).
-- References without a cited_work_id are excluded.
-- CREATE OR REPLACE makes the cell rerunnable: the table is fully derived from
-- work_references, so each run rebuilds it instead of failing because it already exists.
-- NOTE(review): COLLECT_LIST keeps duplicates, so a cited work appears once per matching
--   reference row -- confirm whether referenced_works is expected to hold unique ids.
CREATE OR REPLACE TABLE openalex.works.referenced_works
AS
SELECT
  citing_work_id,
  TRANSFORM(
    SORT_ARRAY(COLLECT_LIST(STRUCT(ref_ind, cited_work_id))),
    x -> x.cited_work_id
  ) AS referenced_works
FROM openalex.works.work_references
WHERE cited_work_id IS NOT NULL
GROUP BY citing_work_id;
OPTIMIZE openalex.works.referenced_works ZORDER BY (citing_work_id);

### PARSING ENHANCEMENTS - TBD

In [0]:
# %python
# %pip install /Volumes/openalex/default/libraries/openalex_dlt_utils-0.2.3-py3-none-any.whl

In [0]:
# from nameparser import HumanName # Will be installed via pipeline libraries
# from openalex.dlt.normalize import normalize_title_udf, udf_last_name_only
# spark.udf.register("normalize_title_udf", normalize_title_udf)
# spark.udf.register("udf_last_name_only", udf_last_name_only)

### Extract `DOI`
More than a dozen edge cases are handled here, recovering roughly 10M additional DOIs.

In [0]:
# %python
# import re
# from pyspark.sql.types import *


# # allow spaces in the registry digits: (?:\d\s*){4,9}
# _DOI_PREFIX = r'10\s*\.\s*(?:\d\s*){4,9}\s*/\s*'
# _DOI_BODY   = r'(?:[-._;()/:A-Za-z0-9%]|\s)+'

# # https://... (tolerant of spaces, and supports (dx|doi).org)
# _URL_DOI_OPT = re.compile(
#     r'https?\s*:?\s*//\s*(?:[a-z0-9-]+\s*\.)*(?:doi|dx)\.org/\s*(' + _DOI_PREFIX + _DOI_BODY + r')',
#     re.IGNORECASE
# )

# # no-scheme "... (dx|doi).org/10..."
# _NOSCHEME_DOI = re.compile(
#     r'(?:(?<=\s)|^|[\(\[\{])(?:[a-z0-9-]+\s*\.)*(?:doi|dx)\.org/\s*(' + _DOI_PREFIX + _DOI_BODY + r')',
#     re.IGNORECASE
# )

# # doi: org/10...   OR   doi: doi.org/10...   OR   doi: dx.org/10...
# _COLON_DOI_ORG = re.compile(
#     r'\bdoi\s*:\s*(?:(?:[a-z0-9-]+\s*\.)*(?:doi|dx)\s*\.\s*)?org\s*/\s*(' + _DOI_PREFIX + _DOI_BODY + r')',
#     re.IGNORECASE
# )

# # direct "doi: 10...."
# _COLON_DOI_DIRECT = re.compile(
#     r'\bdoi\s*:\s*(' + _DOI_PREFIX + _DOI_BODY + r')',
#     re.IGNORECASE
# )

# _TRAIL = re.compile(r'[\]\)\}\.,;:]+$')

# def _normalize_after_gate(s: str) -> str:
#     s = s.replace('\r', ' ').replace('\n', ' ')
#     s = ' '.join(s.split())
#     s = re.sub(r'(?i)(doi\.)\s*org', r'\1org', s)
#     s = re.sub(r'(?i)(dx\.)\s*org', r'\1org', s)
#     s = re.sub(r'(?i)\bhttps\s*//', 'https://', s)
#     s = re.sub(r'(?i)\bhttp\s*//', 'http://', s)
#     # decode URL-encoded slashes anywhere
#     s = re.sub(r'(?i)%\s*2\s*f', '/', s)
#     return s

# def extract_doi(text: str):
#     """
#     Return DOI only if raw contains 'doi:', 'doi.' (e.g., doi.org), or 'dx.'.
#     Accepts `doi: org/10...` as well as `doi: doi.org/10...` and `doi: dx.org/10...`.
#     """
#     if not text:
#         return None
#     low = text.lower()
#     if ('doi:' not in low) and ('doi.' not in low) and ('dx.' not in low):
#         return None

#     s = _normalize_after_gate(text)

#     m = (_NOSCHEME_DOI.search(s) or
#          _URL_DOI_OPT.search(s) or
#          _COLON_DOI_ORG.search(s) or
#          _COLON_DOI_DIRECT.search(s))
#     if not m:
#         return None

#     doi = re.sub(r'\s+', '', m.group(1))   # strip internal spaces
#     doi = _TRAIL.sub('', doi)              # trim trailing .,;:)]}
#     return doi or None

#### Register UDFs

In [0]:
# extract_doi_udf = udf(extract_doi, StringType())
# spark.udf.register("extract_doi_udf", extract_doi_udf)

#### Test

In [0]:
# %python
# raw_input = [
#     "McGrath, R. (2015). Character strengths in 75 nations: An update. The Journal of Positive Psychology, 10(1), 41-52. https://doi.org/10.1080/17439760.2014.888580",
#     "Sun J, Ajwani D, Nicholson PK, Sala A and Parthasarathy S (2017) Breaking Cycles In Noisy Hierarchies. Proceedings of the 2017 ACM on Web Science Conference: 151-160 http://dx.doi. org/10.1145/3091478.3091495",
#     "Banks PA, Bollen TL, Dervenis C, Gooszen HG, Johnson CD, Sarr MG, Tsiotos GG, Vege SS; Acute Pancreatitis Classification Working Group. Classification of acute pancreatitis--2012: revision of the Atlanta classification and definitions by international consensus. Gut. 2013;62(1):102-11. doi: 10.1136/gutjnl-2012-302779. [bleh]",
#     "Araujo MCR, da Silva DA, Wilson AMMM. Nursing interven- tions in palliative care in the intensive care unit: A system- atic review. Enfermeria Intensiva. 2023 34: 156-172. https: //doi.org/10.1016/j.enfie.2023.08.008.",
#     "LEMASTER James C., \"Zwizek midzy Baconem, teleologi i analogi a doktryn naturali- zmu metodologicznego\", prze. Dariusz Sagan, Filozoficzne Aspekty Genezy 2017, t. 14, s. 99-133, http://www.nauka-a-religia.uz.zgora.pl/images/FAG/2017.t.14/art.04.pdf (14.10. 2018). MAYR Ersnt, \"Cause and Effect in Biology\", Science 1961, vol. 134, s. 1501-1506, doi:10. 1126/science.134.3489.1501.",
#     "Mazumder, M.A.R., Hossain, M. M. and Akhtar, S. 1998. Effect of levels of concentrate supplement on live weight gain and carcass characteristics in sheep on restricted grazing. Asian Aust. J. Anim. Sci., 11: 17-21. doi.org/10.5713/ajas.1998.",
#     "Lima RF, Toledo MI, Naves JOS. Avaliao de servios far- macuticos hospitalares: uma reviso integrativa. Rev Bras Farm Hosp Serv Sade, 2019; 9(2):01-08. DOI: 10.3068/ rbfhss.2018.092.005",
#     "A. Buonomo, A. Lo Schiavo. Divide-by-Three Injection-Locked Frequency Dividers with Direct Forcing Signal // Hindawi Publishing Corporation Journal of Electrical and Computer Engineering, article ID 145314, 9 p., 2013, doi: org/10.1155/2013/145314.",
#     "Fernndez, H., & Macbeth, G. (2018). Perspectiva de tiempo futuro, metas y sub-metas: su rol en la toma de decisiones. Revista Latinoamericana de Cien- cia Psicolgica, 10, https//doi.org/10.5872/psien- cia/10.2.23",
#     "Hakanson, L. (1980). An ecological risk index for aquatic pollution control.a sedimentological approach. Water ________________________ Egypt. J. Geo. Vol. 67 (2023) Research, 14(8), 975-1001. https://doi.org/10.101 6/0043-1354(80)90143-8.",
#     "Davidson MH, Christine M, Ballantyne CM, Jacobson TA, Bittner VA, Braun LT, et al. (2011). Clinical utility of inflammatory markers and advanced lipoprotein testing: advice from an expert panel of lipid specialists. J Clin Lipidol 5(5): 338-367. DOI: https://dx.org/10.1016/J.Jacl.2011.07.005",
#     "Ren J, Hong T, He C, Sun L, Li X, Ma Y, et al. Coexistence of Intracranial and Spinal Cord Cavernous Malformations Predict Aggressive Clinical Presentation. Presentation. Front. Neurol. 2019; 10:618. https://doi.org/10.3389%2Ffneur.2019.00618"
# ]

# for i in raw_input:
#     print(extract_doi(i))


#### Spotcheck

In [0]:
# %sql
# SELECT *
# FROM openalex.works.work_references
# WHERE doi IS NULL
#   AND parsed_doi IS NULL
#   AND raw IS NOT NULL
#   AND (CONTAINS(lower(raw), 'doi:') OR CONTAINS(lower(raw), 'doi.'))
# LIMIT 2000;

### Extract `FIRST AUTHOR`

In [0]:
# import re

# # One to three initials (we’ll keep only the first), allowing dots/hyphens/spaces between letters
# _INITIALS = r'(?:[A-Z](?:[.\-]?\s*)?){1,3}'

# # A LAST-NAME TOKEN must either contain a lowercase somewhere OR be ALL-CAPS (≥3 chars).
# # This prevents "AJ" / "BT" from being misread as surnames, while allowing "LEMASTER".
# _LAST_TOKEN = r"(?:[A-Z][A-Za-z'’\-]*[a-z][A-Za-z'’\-]*|[A-Z]{3,}[A-Za-z'’\-]*)"
# _LAST_MULTI = rf"(?:{_LAST_TOKEN}(?:\s+{_LAST_TOKEN}){{0,2}})"

# # A spelled given name must include a lowercase (blocks 'LEMASTER' from counting as given)
# _GIVENWORD = r"[A-Z][a-z][A-Za-z'’\-]*"

# # Patterns (anchored at start)
# P_COMMA_GIVEN  = re.compile(rf'^\s*(?P<last>{_LAST_MULTI})\s*,\s*(?P<given>{_GIVENWORD})\b')
# P_COMMA_INIT   = re.compile(rf'^\s*(?P<last>{_LAST_MULTI})\s*,\s*(?P<inits>{_INITIALS})\b')
# P_GIVEN_LAST   = re.compile(rf'^\s*(?P<inits>{_INITIALS})\s+(?P<last>{_LAST_MULTI})\b')
# P_LAST_INIT    = re.compile(rf'^\s*(?P<last>{_LAST_MULTI})\s+(?P<inits>{_INITIALS})\b')
# P_LAST_GIVENI  = re.compile(rf'^\s*(?P<last>{_LAST_MULTI})\s+(?P<given>{_GIVENWORD})(?:\s+(?P<inits>{_INITIALS}))?\b')
# P_GIVENWORD_LAST = re.compile(rf'^\s*(?P<given>{_GIVENWORD})\s+(?P<last>{_LAST_MULTI})\b')

# # Hard separators between *authors* (don’t split on comma—needed for "Last, I")
# _SEP = re.compile(r'\s*(?:;|&|\band\b)\s*', re.IGNORECASE)

# def _norm(s: str) -> str:
#     if not s:
#         return ""
#     s = s.replace("\r", " ").replace("\n", " ")
#     s = re.sub(r"([A-Za-z])\s*-\s*([A-Za-z])", r"\1\2", s)  # heal "prob- lem"
#     return " ".join(s.split()).strip()

# def _first_initial(*chunks: str) -> str | None:
#     """Return the first A–Z found across chunks (prefer given over initials)."""
#     for c in chunks:
#         if not c:
#             continue
#         m = re.search(r'[A-Z]', c)
#         if m:
#             return m.group(0)
#     return None

# def first_author_last_initial(text: str) -> str | None:
#     """
#     Return 'Last, I' (single initial, no dots) for the first author in `text`.
#     """
#     s = _norm(text or "")
#     head = _SEP.split(s, maxsplit=1)[0]  # only the first author segment

#     # Order matters:
#     # 1) "Last, Given" (e.g., "Bravetti, Margarita")
#     m = P_COMMA_GIVEN.match(head)
#     if m:
#         ini = _first_initial(m.group('given'))
#         return f"{m.group('last').strip()}, {ini}" if ini else m.group('last').strip()

#     # 2) "GivenInitials Last" (e.g., "AJ Smith", "A.J. Smith")
#     m = P_GIVEN_LAST.match(head)
#     if m:
#         ini = _first_initial(m.group('inits'))
#         return f"{m.group('last').strip()}, {ini}"

#     # 3) "Last Initials" (e.g., "Smith AJ", "Del Fabbro M")
#     m = P_LAST_INIT.match(head)
#     if m:
#         ini = _first_initial(m.group('inits'))
#         return f"{m.group('last').strip()}, {ini}"

#     # 4) "Last, Initials" (e.g., "Smith, AJ")
#     m = P_COMMA_INIT.match(head)
#     if m:
#         ini = _first_initial(m.group('inits'))
#         return f"{m.group('last').strip()}, {ini}"

#     # 5) "Last Given [Initials]" (e.g., "LEMASTER James C.", "Del Fabbro Mario C.")
#     m = P_LAST_GIVENI.match(head)
#     if m:
#         # prefer the initial from the GIVEN word, not the trailing initials
#         ini = _first_initial(m.group('given'), m.group('inits'))
#         return f"{m.group('last').strip()}, {ini}" if ini else m.group('last').strip()

#     # 6) "Given Last" (e.g., "Nicole Johnson")
#     m = P_GIVENWORD_LAST.match(head)
#     if m:
#         ini = _first_initial(m.group('given'))
#         return f"{m.group('last').strip()}, {ini}"

#     # Fallback: first capitalized token as last
#     m = re.match(r"^\s*([A-Z][A-Za-z'’\-]+)", head)
#     return m.group(1) if m else None

#### Register UDFs

In [0]:
# first_author_last_initial_udf = udf(first_author_last_initial, StringType())
# spark.udf.register("first_author_last_initial", first_author_last_initial)

#### Test

In [0]:
# tests = [
#     "A Smith",
#     "AB Smith",
#     "A Johnson, B Smith",
#     "AJ Smith, BT Johnson",
#     "A.J. Smith and B.T. Johnson",
#     "Smith AJ, Johnson BT",
#     "Smith A.J., Johnson B.T.",
#     "Smith, AJ, Johnson, BT",
#     "Del Fabbro M, Taschieri S",
#     "H. N. Gabow and R. E. Tarjan, ...",
#     "LEMASTER James C., ...",
#     "Nicole Johnson",
#     "Bravetti, Margarita",
#     "Cunha HF, Morais PPAM (2010) Relao espcie-rea em cupinzeiros de pastagem, Goinia-GO, Brasil. Entomo Brasilis. 3(3):60-63.",
#     "Glat, R., Ferreira, J. R., Oliveira, E. S. G., & Senna, L. A. G. (2003). Panorama nacional da educao inclusiva no Brasil. Relatrio de consultoria tcnica, Banco Mundial. Recuperado de www.cnotinfor.pt/projectos/worldbank/inclusiva"
# ]
# for t in tests:
#     print(t, "->", first_author_last_initial(t))


#### Spotcheck

In [0]:
# %sql
# SELECT title, authors, raw
# FROM openalex.works.work_references
# WHERE doi IS NULL and title is not null and authors is not null
# LIMIT 2000;

In [0]:
# %sql
# UPDATE openalex.works.work_references
# SET parsed_first_author = first_author_last_initial(authors)
# WHERE authors IS NOT NULL;

### Extract `TITLE`

In [0]:
# import re

# _QUOTES = "'\"“”‘’«»"
# BAD_START = re.compile(r"^(?:doi|dx|https?|www\.|vol\.?|volume|article id|issn|isbn)\b", re.IGNORECASE)
# YEAR_PAREN = re.compile(r"\(\s*(?:19|20)\d{2}\s*\)")

# def _norm(s: str) -> str:
#     if not s:
#         return ""
#     s = s.replace("\r", " ").replace("\n", " ")
#     # heal line-break hyphenation only (keep real hyphens)
#     s = re.sub(r"([A-Za-z])-\s+([A-Za-z])", r"\1\2", s)
#     return re.sub(r"\s+", " ", s).strip()

# def _next_nonspace(s: str, i: int) -> str:
#     n = len(s)
#     while i < n and s[i].isspace():
#         i += 1
#     return s[i] if i < n else ""

# def _is_initial_before(s: str, dot_idx: int) -> bool:
#     # True if just before '.' there is a single capital letter token (e.g., "A.")
#     return bool(re.search(r"[^\w][A-Z]\.$|^[A-Z]\.$", s[:dot_idx+1]))

# def _sentence_break_positions(s: str):
#     """Yield indices of '.' that are sentence breaks."""
#     for m in re.finditer(r"\.", s):
#         i = m.start()
#         if _is_initial_before(s, i):
#             continue  # don't break on initials like "A."
#         nxt = _next_nonspace(s, i+1)
#         if nxt and nxt.isalpha() and nxt.islower():
#             continue  # "control.a" or ". a" typo: not a break
#         yield i

# def _split_sentences(s: str):
#     """Return list of sentence strings, strictly cut at our sentence breaks."""
#     breaks = list(_sentence_break_positions(s))
#     if not breaks:
#         return [s] if s else []
#     out = []
#     start = 0
#     for i in breaks:
#         out.append(s[start:i].strip())
#         start = i+1
#     if start < len(s):
#         out.append(s[start:].strip())
#     return [seg for seg in out if seg]

# def _strip_leading(t: str) -> str:
#     # drop leading punctuation, brackets, quotes, and a leading year if present
#     t = t.lstrip()
#     t = re.sub(r"^\(\s*(?:19|20)\d{2}\s*\)\s*[.:,]*\s*", "", t)
#     t = re.sub(r"^(?:19|20)\d{2}\s*[.:,]*\s*", "", t)
#     t = re.sub(r"^[\(\)\[\]\.\:,\;]+", "", t).lstrip()
#     t = t.lstrip(_QUOTES)
#     return t

# def _cut_at_double_slash(t: str) -> str:
#     m = re.search(r"\s//\s|//", t)
#     return t[:m.start()].strip() if m else t.strip()

# def _first_quoted_title(s: str) -> str | None:
#     q = re.compile(r"[" + re.escape(_QUOTES) + r"]\s*([^" + re.escape(_QUOTES) + r"]{3,}?)\s*[" + re.escape(_QUOTES) + r"]")
#     for m in q.finditer(s):
#         cand = m.group(1).strip().strip(_QUOTES).strip().rstrip(",;:")
#         if " " in cand and not BAD_START.match(cand):
#             return cand
#     return None

# def _looks_like_authors(seg: str) -> bool:
#     return bool(
#         re.search(r"\b[A-Z]\.", seg) or      # initials
#         re.search(r",", seg) or              # commas delimiting authors
#         re.search(r"\band\b|&|;", seg, re.IGNORECASE)
#     )

# def extract_title(text: str) -> str | None:
#     try:
#         if text is None:
#             return None
#         s = _norm(text)
#         if not s:
#             return None

#         # 0) Quoted titles take precedence anywhere
#         qt = _first_quoted_title(s)
#         if qt:
#             return qt

#         # 1) Split into sentences (strict, first-period rule)
#         sentences = _split_sentences(s)
#         if not sentences:
#             return None

#         # 2) If a (YYYY) occurs in sentence k, prefer sentence k+1 as title
#         for idx, seg in enumerate(sentences):
#             if YEAR_PAREN.search(seg):
#                 if idx + 1 < len(sentences):
#                     cand = _cut_at_double_slash(_strip_leading(sentences[idx + 1]))
#                     return None if not cand or BAD_START.match(cand) else cand
#                 break

#         # 3) If first sentence looks like authors, pick the next one
#         if _looks_like_authors(sentences[0]) and len(sentences) > 1:
#             cand = _cut_at_double_slash(_strip_leading(sentences[1]))
#             return None if not cand or BAD_START.match(cand) else cand

#         # 4) Else use the first sentence
#         cand = _cut_at_double_slash(_strip_leading(sentences[0]))
#         return None if not cand or BAD_START.match(cand) else cand
#     except Exception as e:
#         # Handle the exception
#         return f"Exception: {e}"


#### Test

In [0]:
# %python
# raw_input = [
#     "McGrath, R. (2015). Character strengths in 75 nations: An update. The Journal of Positive Psychology, 10(1), 41-52. https://doi.org/10.1080/17439760.2014.888580",
#     "Sun J, Ajwani D, Nicholson PK, Sala A and Parthasarathy S (2017) Breaking Cycles In Noisy Hierarchies. Proceedings of the 2017 ACM on Web Science Conference: 151-160 http://dx.doi. org/10.1145/3091478.3091495",
#     "Banks PA, Bollen TL, Dervenis C, Gooszen HG, Johnson CD, Sarr MG, Tsiotos GG, Vege SS; Acute Pancreatitis Classification Working Group. Classification of acute pancreatitis--2012: revision of the Atlanta classification and definitions by international consensus. Gut. 2013;62(1):102-11. doi: 10.1136/gutjnl-2012-302779. [bleh]",
#     "Araujo MCR, da Silva DA, Wilson AMMM. Nursing interven- tions in palliative care in the intensive care unit: A system- atic review. Enfermeria Intensiva. 2023 34: 156-172. https: //doi.org/10.1016/j.enfie.2023.08.008.",
#     "LEMASTER James C., \"Zwizek midzy Baconem, teleologi i analogi a doktryn naturali- zmu metodologicznego\", prze. Dariusz Sagan, Filozoficzne Aspekty Genezy 2017, t. 14, s. 99-133, http://www.nauka-a-religia.uz.zgora.pl/images/FAG/2017.t.14/art.04.pdf (14.10. 2018). MAYR Ersnt, \"Cause and Effect in Biology\", Science 1961, vol. 134, s. 1501-1506, doi:10. 1126/science.134.3489.1501.",
#     "Mazumder, M.A.R., Hossain, M. M. and Akhtar, S. 1998. Effect of levels of concentrate supplement on live weight gain and carcass characteristics in sheep on restricted grazing. Asian Aust. J. Anim. Sci., 11: 17-21. doi.org/10.5713/ajas.1998.",
#     "Lima RF, Toledo MI, Naves JOS. Avaliao de servios far- macuticos hospitalares: uma reviso integrativa. Rev Bras Farm Hosp Serv Sade, 2019; 9(2):01-08. DOI: 10.3068/ rbfhss.2018.092.005",
#     "A. Buonomo, A. Lo Schiavo. Divide-by-Three Injection-Locked Frequency Dividers with Direct Forcing Signal // Hindawi Publishing Corporation Journal of Electrical and Computer Engineering, article ID 145314, 9 p., 2013, doi: org/10.1155/2013/145314.",
#     "Fernndez, H., & Macbeth, G. (2018). Perspectiva de tiempo futuro, metas y sub-metas: su rol en la toma de decisiones. Revista Latinoamericana de Cien- cia Psicolgica, 10, https//doi.org/10.5872/psien- cia/10.2.23",
#     "Hakanson, L. (1980). An ecological risk index for aquatic pollution control.a sedimentological approach. Water ________________________ Egypt. J. Geo. Vol. 67 (2023) Research, 14(8), 975-1001. https://doi.org/10.101 6/0043-1354(80)90143-8.",
#     "Davidson MH, Christine M, Ballantyne CM, Jacobson TA, Bittner VA, Braun LT, et al. (2011). Clinical utility of inflammatory markers and advanced lipoprotein testing: advice from an expert panel of lipid specialists. J Clin Lipidol 5(5): 338-367. DOI: https://dx.org/10.1016/J.Jacl.2011.07.005",
#     "Ren J, Hong T, He C, Sun L, Li X, Ma Y, et al. Coexistence of Intracranial and Spinal Cord Cavernous Malformations Predict Aggressive Clinical Presentation. Presentation. Front. Neurol. 2019; 10:618. https://doi.org/10.3389%2Ffneur.2019.00618"
# ]

# for i in raw_input:
#     print(extract_title(i))

#### Register UDFs

In [0]:
# import pandas as pd
# from pyspark.sql.functions import *

# extract_title_udf = udf(extract_title, StringType())
# spark.udf.register("extract_title_udf", extract_title_udf)

# @pandas_udf(StringType())
# def extract_title_pandas_udf(title_series: pd.Series) -> pd.Series:
#     # This Pandas UDF applies the original `extract_title` Python function to each value
#     return title_series.apply(extract_title)
# spark.udf.register("extract_title_pandas_udf", extract_title_pandas_udf)

#### Update `parsed_title`

In [0]:
# %sql
# MERGE INTO openalex.works.work_references AS target
# USING (
#     SELECT ref_ind, citing_work_id, 
#     extract_title_pandas_udf(substr(raw, 1, 3000)) AS new_parsed_title
#     FROM openalex.works.work_references
#     WHERE cited_work_id IS NULL AND raw IS NOT NULL
#       AND parsed_title IS NULL
#       AND title IS NULL
#       AND doi IS NULL
#       AND parsed_doi IS NULL
# ) AS source
# ON target.raw = source.raw
#   AND target.citing_work_id = source.citing_work_id
# WHEN MATCHED THEN
#   UPDATE SET target.parsed_title = source.new_parsed_title;


### NORMALIZE TITLE

In [0]:
# %sql
# UPDATE openalex.works.work_references
# SET normalized_title = normalize_title_udf(title)
# WHERE title IS NOT NULL and normalized_title IS NULL;

In [0]:
# %sql
# UPDATE openalex.works.work_references
# SET normalized_title = normalize_title_udf(coalesce(title, parsed_title))
# WHERE normalized_title is null and (parsed_title IS NOT NULL OR title IS NOT NULL);

In [0]:
# %sql 
# SELECT udf_last_name_only(array(
#   named_struct('name', "Meier, O. and Quille, G. "),
#   named_struct('name', "Wilson, M. , Stearne, A , Gray, D & Saggers, S"),
#   named_struct('name', "Schroder, F"),
#   named_struct('name', 'ROLLET, Catherine, MOREL, Marie-France'),
#   named_struct('name', 'Ohmoto A, Fuji S.'),
#   named_struct('name', 'Araujo MCR, da Silva DA, Wilson AMMM.'),
#   named_struct('name', 'S.O. Roberts, E. Mortenson')
# )).author_key

In [0]:
# %sql
# UPDATE openalex.works.work_references
# SET author_key = udf_last_name_only(array(named_struct('name', authors)))[0].author_key
# WHERE author_key IS NULL and authors is NOT NULL;

In [0]:
# %sql
# UPDATE openalex.works.work_references
# SET parsed_doi = extract_doi_udf(raw)
# WHERE doi IS NULL
#   AND parsed_doi IS NULL
#   AND raw IS NOT NULL
#   AND (CONTAINS(lower(raw), 'doi:') OR CONTAINS(lower(raw), 'doi.'));

### SAMPLE EVERYTHING

In [0]:
# %sql
# SELECT raw, openalex.agents.parse_raw_reference(raw) as parsed
# FROM openalex.agents.work_references_raw_sample
# LIMIT 1000;

In [0]:
# %sql
# -- Replace my_db.my_table with your table; expects column `raw`
# WITH norm AS (
#   SELECT
#     raw,
#     -- normalize whitespace, fix hyphenated line-break words, and remove spaces after doi.org/
#     regexp_replace(
#       regexp_replace(
#         regexp_replace(
#           regexp_replace(raw, '[\\r\\n]+', ' '),               -- newlines -> space
#           '\\s+', ' '                                          -- collapse spaces
#         ),
#         '([A-Za-z])\\s*-\\s*([A-Za-z])', '\\1\\2'              -- join "Ohmu- ra" -> "Ohmura"
#       ),
#       '(?i)(doi\\.org/)\\s+', '\\1'                            -- kill spaces after doi.org/
#     ) AS s
#   FROM openalex.agents.work_references_raw_sample
# ),
# parts AS (
#   SELECT
#     raw, s,

#     -- AUTHORS: everything left of the first 19xx/20xx (optionally in parentheses)
#     trim(
#       regexp_replace(
#         regexp_extract(s, '^(.*?)\\s*\\(?\\b(?:19|20)\\d{2}\\b', 1),
#         '\\s*[\\.,;:]+\\s*$', ''   -- strip trailing punctuation
#       )
#     ) AS authors_from_year,

#     -- TITLE (primary): after year (with (), . or :) up to the next period
#     trim(
#       regexp_extract(
#         s,
#         '(?:\\(|\\b)(?:19|20)\\d{2}(?:\\))?\\s*[\\.:]?\\s*(.+?)\\.',
#         1
#       )
#     ) AS title_after_year,

#     -- TITLE (fallback): second sentence (helps when year is after the journal block)
#     trim(regexp_extract(s, '^[^.]*\\.\\s*([^.]+?)\\.', 1)) AS title_second_sentence,

#     -- DOI to end-of-string (prefer URL, then "doi: 10...", then bare "10...")
#     trim(regexp_extract(s, '(?i)(https?://\\s*doi\\.org/\\S.*)$', 1)) AS doi_url_eos,
#     trim(regexp_extract(s, '(?i)(doi\\s*:\\s*10\\.\\d{4,9}/\\S.*)$', 1)) AS doi_colon_eos,
#     trim(regexp_extract(s, '(?i)\\b(10\\.\\d{4,9}/\\S.*)$', 1))       AS doi_bare_eos
#   FROM norm
# )
# SELECT
#   raw,

#   -- AUTHORS per your rule
#   NULLIF(authors_from_year, '') AS authors,

#   -- TITLE: prefer after-year; if too short/missing, use second sentence before year
#   CASE
#     WHEN title_after_year IS NOT NULL AND length(title_after_year) >= 8 THEN title_after_year
#     WHEN title_second_sentence IS NOT NULL AND length(title_second_sentence) >= 8 THEN title_second_sentence
#     ELSE NULL
#   END AS title,

#   -- DOI to end-of-string (normalize minor spacing)
#   CASE
#     WHEN doi_url_eos    <> '' THEN regexp_replace(doi_url_eos,    '(?i)(doi\\.org/)\\s+', '\\1')
#     WHEN doi_colon_eos  <> '' THEN regexp_replace(doi_colon_eos,  '(?i)(doi\\s*:\\s*)\\s*', '\\1')
#     WHEN doi_bare_eos   <> '' THEN doi_bare_eos
#     ELSE NULL
#   END AS doi_to_eos

# FROM parts;
