### Seed `parsed_pages` from `taxicab_enriched_new`

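One-time migration: extracts the Parseland response fields and the taxicab
metadata columns from `openalex.landing_page.taxicab_enriched_new`
(~59M records) and appends them to `openalex.parseland.parsed_pages`.
Records whose `taxicab_id` is already present in `parsed_pages` are skipped.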

This ensures zero data loss when the DLT pipeline switches from calling
Parseland directly to reading from `parsed_pages`.

**Run once, then delete or archive this notebook.**

In [None]:
from pyspark.sql import functions as F

In [None]:
# Load the source table for the migration and log its size.
# count() is a Spark action, so a job runs here purely for the log line.
enriched = spark.read.table("openalex.landing_page.taxicab_enriched_new")
enriched_total = enriched.count()
print(f"taxicab_enriched_new: {enriched_total:,} records")

In [None]:
# Collect the taxicab_ids that already exist in the target table; the seed
# step anti-joins against this set so those records are not appended again.
already_seeded = (
    spark.read
    .table("openalex.parseland.parsed_pages")
    .select("taxicab_id")
)
already_count = already_seeded.count()
print(f"Already in parsed_pages: {already_count:,} records")

In [None]:
# Build the rows to seed into parsed_pages:
#   1. keep only enriched rows whose taxicab_id is not yet in parsed_pages
#      (left anti join against already_seeded);
#   2. carry the taxicab metadata columns over unchanged;
#   3. lift the selected parser_response fields to top-level columns;
#   4. expose created_date under the name parsed_date.
# NOTE(review): the anti join only filters against the target table; repeated
# taxicab_id values inside the source itself would pass through -- confirm the
# source is unique on taxicab_id.
metadata_columns = [
    "taxicab_id",
    "url",
    "resolved_url",
    "native_id",
    "native_id_namespace",
]
parser_fields = ["authors", "urls", "license", "version", "abstract", "had_error"]

not_yet_seeded = enriched.join(already_seeded, on="taxicab_id", how="left_anti")
seed_data = not_yet_seeded.select(
    *metadata_columns,
    *[F.col(f"parser_response.{name}").alias(name) for name in parser_fields],
    F.col("created_date").alias("parsed_date"),
)

new_count = seed_data.count()
print(f"New records to seed: {new_count:,}")

In [None]:
# Append the prepared rows to the target Delta table, then re-read the table
# to report its total row count after the seed.
target_table = "openalex.parseland.parsed_pages"
seed_data.write.format("delta").mode("append").saveAsTable(target_table)

final_count = spark.read.table(target_table).count()
print(f"parsed_pages total after seed: {final_count:,}")