### Creates and maintains `openalex.authors.parsed_names_lookup` table

In [0]:
%pip install nameparser

In [0]:
%sql
-- Create the lookup table (if it does not exist yet) that maps a raw author
-- name string to its parsed components.
-- The catalog name is assembled at run time from 'openalex' plus the
-- :env_suffix parameter.
CREATE TABLE IF NOT EXISTS identifier('openalex' || :env_suffix || '.authors.parsed_names_lookup') (
  -- Author name string used as the lookup key.
  raw_author_name STRING,
  -- Parsed name parts; field names match the Python `parsed_name_schema`.
  parsed_name STRUCT<
      title: STRING,
      first: STRING,
      middle: STRING,
      last: STRING,
      suffix: STRING,
      nickname: STRING
  >,
  -- Timestamp recorded when the row was written.
  created_datetime TIMESTAMP
)
USING DELTA
CLUSTER BY (raw_author_name)

#### Name parser

In [0]:
import unicodedata
import re
from nameparser import HumanName

import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType

# Notebook parameter: the suffix is interpolated into catalog names as
# `openalex{env_suffix}` by the queries further down.
dbutils.widgets.text("env_suffix", "", "Environment Suffix")
env_suffix = dbutils.widgets.get("env_suffix")

# Lower bound applied to `openalex_updated_dt` when scanning works for names.
CUTOFF_DATE = "2025-12-01"

# Struct returned by the name-parsing UDF: six nullable string fields, in the
# same order as the `parsed_name` column of the lookup table.
parsed_name_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ('title', 'first', 'middle', 'last', 'suffix', 'nickname')
])

# -- Begin CJK support --

# Common two-character Chinese surnames ("fuxing", i.e. compound surnames).
# split_chinese_name() checks the first two characters of a name against this
# set before falling back to a single-character surname.
# Some surnames are written identically in simplified and traditional script
# and therefore appear in both groups below; the set literal de-duplicates
# them, so the repetition is harmless.
COMPOUND_SURNAMES = {
    # Simplified forms
    '欧阳', '太史', '端木', '上官', '司马', '东方', '独孤', '南宫', '万俟', 
    '闻人', '夏侯', '诸葛', '尉迟', '公羊', '赫连', '澹台', '皇甫', '宗政',
    '濮阳', '公冶', '太叔', '申屠', '公孙', '慕容', '仲孙', '钟离', '长孙',
    '宇文', '司徒', '鲜于', '司空', '闾丘', '子车', '亓官', '司寇', '巫马',
    '公西', '颛孙', '壤驷', '公良', '漆雕', '乐正', '宰父', '谷梁', '拓跋',
    '夹谷', '轩辕', '令狐', '段干', '百里', '呼延', '东郭', '南门', '羊舌',
    '微生', '公户', '公玉', '公仪', '梁丘', '公仲', '公上', '公门', '公山',
    '公坚', '左丘', '公伯', '西门', '公祖', '第五', '公乘', '贯丘', '公皙',
    '南荣', '东里', '东宫', '仲长', '子书', '子桑', '即墨', '达奚', '褚师',
    # Traditional variants
    '歐陽', '司馬', '東方', '獨孤', '南宮', '諸葛', '尉遲', '赫連', '澹臺',
    '皇甫', '濮陽', '慕容', '鍾離', '長孫', '宇文', '鮮于', '閭丘', '顓孫',
    '漆雕', '樂正', '穀梁', '拓跋', '夾谷', '軒轅', '段幹', '東郭', '南門',
    '羊舌', '梁丘', '左丘', '西門', '東里', '東宮', '仲長'
}

def is_cjk(char):
    """Return True when `char` is a Han ideograph in one of the covered blocks.

    Covered blocks: CJK Unified Ideographs, Extension A, Extension B,
    CJK Compatibility Ideographs and the Compatibility Ideographs Supplement.
    Kana, Hangul and CJK punctuation fall outside these ranges and yield False.
    """
    code_point = ord(char)
    ideograph_blocks = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x20000, 0x2A6DF),  # CJK Unified Ideographs Extension B
        (0xF900, 0xFAFF),    # CJK Compatibility Ideographs
        (0x2F800, 0x2FA1F),  # CJK Compatibility Ideographs Supplement
    )
    return any(low <= code_point <= high for low, high in ideograph_blocks)

def is_all_cjk(s):
    """Return True when `s` has at least one non-space character and every
    non-space character satisfies `is_cjk`.

    Only the ASCII space character is ignored; any other whitespace counts as
    a regular character and is tested like the rest.
    """
    significant = [ch for ch in s if ch != ' ']
    if not significant:
        return False
    return all(map(is_cjk, significant))

def split_chinese_name(name):
    """
    Split a Chinese name written without spaces into (surname, given_name).

    Assumes Eastern order (surname first). The first two characters are taken
    as the surname when they form a known compound surname; otherwise the
    surname is the first character.

    Args:
        name: Name string with no separators.

    Returns:
        A `(surname, given_name)` tuple. `given_name` is '' when the name
        consists of the surname only, and both parts are '' for empty input.
    """
    # Guard: indexing name[0] below would raise IndexError on an empty string.
    if not name:
        return '', ''
    # Check for compound surname first
    if len(name) >= 2 and name[:2] in COMPOUND_SURNAMES:
        return name[:2], name[2:]
    # Default: single-character surname
    return name[0], name[1:]

# -- End CJK support --


def parse_name(name_string):
    """
    Turn a raw name string into a HumanName.

    Names made up solely of CJK ideographs with no space are assumed to be in
    Eastern order and are split by `split_chinese_name`; the resulting surname
    and given name are stored as `last` and `first` on an otherwise empty
    HumanName. Every other input is handed to nameparser unchanged (apart from
    stripping surrounding whitespace).
    """
    trimmed = name_string.strip()
    is_unsplit_cjk = is_all_cjk(trimmed) and ' ' not in trimmed

    # General path: let nameparser handle it (Western order assumed).
    if not is_unsplit_cjk:
        return HumanName(trimmed)

    # Special path: unspaced all-CJK name, surname comes first.
    family_name, given_name = split_chinese_name(trimmed)
    parsed = HumanName()
    parsed.last = family_name
    parsed.first = given_name
    return parsed


def clean_name_component(s: str) -> str:
    """Normalize one name part for matching.

    Steps, in order: NFD-decompose, drop every character whose Unicode
    category is 'Mn' (nonspacing marks, which is how diacritics are removed),
    lowercase, delete periods, collapse whitespace runs to a single space and
    trim. `None` and '' both yield ''.
    """
    if s is None or s == '':
        return ''

    decomposed = unicodedata.normalize('NFD', s)
    # Keep base characters only; combining marks carry the diacritics.
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    lowered = ''.join(base_chars).lower()
    without_periods = lowered.replace('.', '')
    return re.sub(r'\s+', ' ', without_periods).strip()


def parse_human_name(name: str) -> dict:
    """
    Parse `name` with the CJK-aware `parse_name` and return its cleaned parts.

    The result always has the keys title, first, middle, last, suffix and
    nickname. Each value has been passed through `clean_name_component`.
    Every value is '' when `name` is not a string, is blank, or when parsing
    or cleaning raises.
    """
    parts = ('title', 'first', 'middle', 'last', 'suffix', 'nickname')
    blank = dict.fromkeys(parts, '')

    # `None` is not a str, so this single check also covers missing values.
    if not isinstance(name, str) or name.strip() == '':
        return blank

    try:
        parsed = parse_name(name)  # use our CJK-aware parser
        return {part: clean_name_component(getattr(parsed, part)) for part in parts}
    except Exception:
        # Best effort: one unparseable name must not fail the whole batch.
        return blank

@F.pandas_udf(parsed_name_schema)
def parse_names_batch(names: pd.Series) -> pd.DataFrame:
    """Pandas UDF: run `parse_human_name` over a batch of raw names.

    Builds a DataFrame from the per-name dicts, one row per input value in
    input order.
    """
    parsed_rows = list(map(parse_human_name, names))
    return pd.DataFrame(parsed_rows)

#### Get new names to process

In [0]:
# Build the set of author names that still need parsing.
#   1. Author names exploded from works.locations_mapped, restricted to rows
#      whose openalex_updated_dt is on or after CUTOFF_DATE.
#   2. longest_name from authors.openalex_authors (no date filter).
# Both sources are TRIMmed and filtered for NULL/empty; UNION removes
# duplicates across them. The LEFT ANTI JOIN keeps only names that have no
# row in parsed_names_lookup yet.
# NOTE(review): authors.openalex_authors is read from the un-suffixed
# `openalex` catalog, while the other two tables use `openalex{env_suffix}`.
# Confirm this is intentional.
new_names_df = spark.sql(f"""
    WITH distinct_names AS (
        SELECT DISTINCT TRIM(author.name) AS raw_author_name
        FROM openalex{env_suffix}.works.locations_mapped
        LATERAL VIEW explode(authors) AS author
        WHERE author.name IS NOT NULL 
          AND TRIM(author.name) != ''
          AND openalex_updated_dt >= '{CUTOFF_DATE}'
        
        UNION
        
        SELECT DISTINCT TRIM(longest_name) AS raw_author_name
        FROM openalex.authors.openalex_authors
        WHERE longest_name IS NOT NULL 
          AND TRIM(longest_name) != ''
    )
    SELECT dn.raw_author_name
    FROM distinct_names dn
    LEFT ANTI JOIN openalex{env_suffix}.authors.parsed_names_lookup existing
        ON dn.raw_author_name = existing.raw_author_name
""")

# Counting executes the query; the result is reused by the write step to
# decide whether there is anything to do and for its log message.
new_count = new_names_df.count()
print(f"Found {new_count:,} new distinct author names to parse")

#### Parse the new names and append them to the lookup table

In [0]:
if new_count <= 0:
    print("No new names to parse")
else:
    # Keep the raw name and add the parsed struct plus a load timestamp.
    parsed_df = new_names_df.select(
        "*",
        parse_names_batch(F.col("raw_author_name")).alias("parsed_name"),
        F.current_timestamp().alias("created_datetime"),
    )

    # Append only: names already in the table were excluded upstream.
    (
        parsed_df.write
        .format("delta")
        .mode("append")
        .saveAsTable(f"openalex{env_suffix}.authors.parsed_names_lookup")
    )

    print(f"Added {new_count:,} parsed names to lookup table")