In [0]:
-- STEP 1: Create the production table on first run; this is a no-op when it already exists.
-- The catalog name is 'openalex' followed by the :env_suffix parameter.
CREATE TABLE IF NOT EXISTS identifier('openalex' || :env_suffix || '.unpaywall.unpaywall') (
    doi           STRING PRIMARY KEY,  -- key the MERGE below matches on
    json_response STRING,              -- serialized response, including the "updated" stamp
    updated_date  TIMESTAMP,           -- time the row was last written by the MERGE below
    content_hash  STRING               -- SHA-256 of the response JSON built without z_authors
)
CLUSTER BY (doi, content_hash);

-- STEP 2: Build the dataset and MERGE it into production.
-- Possible optimisation: restrict openalex_works to recently changed rows, for example
--   WHERE updated_date >= (SELECT MAX(updated_date) FROM openalex.unpaywall.unpaywall)
WITH openalex_base AS (
    -- Works flagged as indexed in Crossref. Only the columns that wunpaywall_base
    -- reads are projected, so the dependency on the works schema is explicit.
    SELECT
        doi,
        title,
        type,
        publication_date,
        publication_year,
        locations,
        publisher,
        updated_date,
        authorships
    FROM identifier('openalex' || :env_suffix || '.works.openalex_works')
    WHERE indexed_in_crossref
),

original_crossref_types AS (
    -- One row per DOI carrying the type of the Crossref record with the
    -- greatest indexed.`date-time`; used downstream as the preferred genre.
    SELECT
        native_id AS doi,
        type      AS original_type
    FROM identifier('openalex' || :env_suffix || '.crossref.crossref_exploded')
    QUALIFY 1 = ROW_NUMBER() OVER (
        PARTITION BY doi
        ORDER BY indexed.`date-time` DESC
    )
),

oa_manual_override AS (
    -- DOIs whose manual override payload is the empty JSON object '{}'.
    -- Downstream, remove_oa_locations = true strips every OA location from the work.
    -- DISTINCT keeps one row per DOI so the later LEFT JOIN cannot multiply works.
    -- NOTE(review): this is the only table reference that hard-codes the 'openalex'
    -- catalog instead of using :env_suffix -- confirm that is intentional.
    SELECT DISTINCT
        doi,
        true AS remove_oa_locations,
        response_jsonb
    FROM openalex.unpaywall.oa_manual
    -- Single-quoted literal: double quotes are parsed as an identifier when
    -- spark.sql.ansi.doubleQuotedIdentifiers is enabled.
    WHERE response_jsonb = '{}'
),

wunpaywall_base AS (
    -- Projects every row of openalex_base into the raw building blocks of an
    -- Unpaywall-style record: bibliographic fields, journal fields, the filtered and
    -- sorted OA locations, the derived OA status and a truncated author list.
    -- Several expressions reference aliases defined earlier in this same SELECT list
    -- (published_date, sorted_raw_locations, oa_status_num), so the order of the
    -- select items matters.
    SELECT
        -- Bare DOI: the "https://doi.org/" prefix is removed; doi_url keeps the original value.
        REPLACE(doi, "https://doi.org/", "") AS doi,
        doi AS doi_url,
        title,
        type,
        publication_date AS published_date,
        publication_year AS year,
        -- Journal fields come from the source of the first location flagged is_unpaywall_record.
        GET(FILTER(locations, x -> x.is_unpaywall_record = TRUE), 0).source.display_name AS journal_name,
        -- ISSNs are sorted and joined into one comma-separated string.
        ARRAY_JOIN(ARRAY_SORT(GET(FILTER(locations, x -> x.is_unpaywall_record = TRUE), 0).source.issn), ',') AS journal_issns,
        GET(FILTER(locations, x -> x.is_unpaywall_record = TRUE), 0).source.issn_l AS journal_issn_l,
        GET(FILTER(locations, x -> x.is_unpaywall_record = TRUE), 0).source.is_oa AS journal_is_oa,
        GET(FILTER(locations, x -> x.is_unpaywall_record = TRUE), 0).source.is_in_doaj AS journal_is_in_doaj,
        publisher,
        (
            SELECT ARRAY_SORT(
                FILTER(
                  locations,
                  -- Keep a location only when it is OA, has a provenance other than
                  -- 'datacite', has at least one URL, and its landing page is not on doaj.org.
                  x -> (x.is_oa)
                    AND (x.provenance IS NOT NULL AND x.provenance != 'datacite')
                    AND (x.pdf_url IS NOT NULL OR x.landing_page_url IS NOT NULL)
                    AND (x.landing_page_url IS NULL OR x.landing_page_url NOT LIKE '%doaj.org%')
                ),
                -- Comparator: first by status rank (oa_status 1 and 2 share rank 1, then
                -- 3, 4, 5; anything else, including NULL, ranks last), then by URL,
                -- repository institution, pmh_id and endpoint_id as tie-breakers.
                (a, b) -> CASE 
                    WHEN (CASE WHEN a.oa_status IN (1, 2) THEN 1 WHEN a.oa_status = 3 THEN 2 WHEN a.oa_status = 4 THEN 3 WHEN a.oa_status = 5 THEN 4 ELSE 5 END)
                       < (CASE WHEN b.oa_status IN (1, 2) THEN 1 WHEN b.oa_status = 3 THEN 2 WHEN b.oa_status = 4 THEN 3 WHEN b.oa_status = 5 THEN 4 ELSE 5 END) THEN -1
                    WHEN (CASE WHEN a.oa_status IN (1, 2) THEN 1 WHEN a.oa_status = 3 THEN 2 WHEN a.oa_status = 4 THEN 3 WHEN a.oa_status = 5 THEN 4 ELSE 5 END)
                       > (CASE WHEN b.oa_status IN (1, 2) THEN 1 WHEN b.oa_status = 3 THEN 2 WHEN b.oa_status = 4 THEN 3 WHEN b.oa_status = 5 THEN 4 ELSE 5 END) THEN 1
                    ELSE CASE
                        -- Same status rank: compare the preferred URL (pdf_url, else landing_page_url).
                        WHEN COALESCE(a.pdf_url, a.landing_page_url) < COALESCE(b.pdf_url, b.landing_page_url) THEN -1
                        WHEN COALESCE(a.pdf_url, a.landing_page_url) > COALESCE(b.pdf_url, b.landing_page_url) THEN 1
                        -- Then the repository institution; non-repository sources compare as ''.
                        WHEN COALESCE(CASE WHEN a.source.type = 'repository' THEN a.source.host_organization_name ELSE '' END, '') 
                           < COALESCE(CASE WHEN b.source.type = 'repository' THEN b.source.host_organization_name ELSE '' END, '') THEN -1
                        WHEN COALESCE(CASE WHEN a.source.type = 'repository' THEN a.source.host_organization_name ELSE '' END, '') 
                           > COALESCE(CASE WHEN b.source.type = 'repository' THEN b.source.host_organization_name ELSE '' END, '') THEN 1
                        WHEN COALESCE(a.pmh_id, '') < COALESCE(b.pmh_id, '') THEN -1
                        WHEN COALESCE(a.pmh_id, '') > COALESCE(b.pmh_id, '') THEN 1
                        WHEN COALESCE(a.endpoint_id, '') < COALESCE(b.endpoint_id, '') THEN -1
                        WHEN COALESCE(a.endpoint_id, '') > COALESCE(b.endpoint_id, '') THEN 1
                        ELSE 0
                    END
                END
            )
        ) AS sorted_raw_locations,
        -- Best location = first element of the sorted array, NULL when no location survived the filter.
        CASE WHEN SIZE(sorted_raw_locations) > 0 THEN GET(sorted_raw_locations, 0) END AS best_oa_location,
        -- Reshape each retained location into the output location struct.
        TRANSFORM(sorted_raw_locations, y -> STRUCT(
            COALESCE(y.pdf_url, y.landing_page_url) AS url,
            y.pdf_url AS url_for_pdf,
            y.landing_page_url AS url_for_landing_page,
            'deprecated' AS evidence,
            y.license,
            CASE
                -- A missing version is inferred from the host type; other host types keep NULL.
                WHEN y.version is null AND y.host_type = 'repository' THEN 'submittedVersion' -- fallback if version is null
                WHEN y.version is null AND y.host_type = 'publisher' THEN 'publishedVersion'
                ELSE y.version
            END AS version,
            y.host_type,
            -- TRUE for entries equal to the first sorted location.
            CASE WHEN SIZE(sorted_raw_locations) > 0 AND y = GET(sorted_raw_locations, 0) THEN TRUE ELSE FALSE END AS is_best,
            y.pmh_id,
            y.endpoint_id,
            CASE WHEN y.source.type = 'repository' THEN y.source.host_organization_name ELSE NULL END AS repository_institution,
            -- oa_date is the work's published_date alias defined above, not a per-location date.
            published_date AS oa_date,
            'deprecated' AS updated
        )) AS sorted_oa_locations,
        -- Smallest numeric oa_status among the retained locations (NULL when there are none).
        ARRAY_MIN(TRANSFORM(sorted_raw_locations, z -> z.oa_status)) AS oa_status_num,
        -- Text label for the numeric status; this output column is a string, unlike the
        -- numeric oa_status field on each location.
        CASE
            WHEN oa_status_num = 1 THEN 'gold'
            WHEN oa_status_num = 2 THEN 'gold'
            WHEN oa_status_num = 3 THEN 'hybrid'
            WHEN oa_status_num = 4 THEN 'bronze'
            WHEN oa_status_num = 5 THEN 'green'
            ELSE 'closed'
        END AS oa_status,
        updated_date AS updated,
        2 AS data_standard,
        -- At most the first 100 authorships, reduced to four fields per author.
        TRANSFORM(
            SLICE(authorships, 1, 100), 
            author -> STRUCT(
                author.author_position,
                author.raw_author_name,
                author.is_corresponding,
                author.raw_affiliation_strings
            )
        ) AS z_authors
    FROM openalex_base
),

wunpaywall AS (
    -- Shapes each work into the columns of the Unpaywall response.
    -- Inside the two STRUCT expressions, best_oa_location refers to the raw location
    -- column coming from wunpaywall_base (it exposes pdf_url / landing_page_url),
    -- not to the output alias of the same name.
    SELECT
        doi,
        doi_url,
        title,
        -- Prefer the Crossref type when one was found for this DOI.
        COALESCE(original_type, type) AS genre,
        genre IN (
            'book-series', 'component', 'journal', 'journal-issue', 'journal-volume',
            'proceedings', 'proceedings-series', 'report-series'
        ) AS is_paratext,
        published_date,
        year,
        journal_name,
        journal_issns,
        journal_issn_l,
        journal_is_oa,
        journal_is_in_doaj,
        publisher,
        SIZE(sorted_oa_locations) > 0 AS is_oa,
        oa_status,
        ARRAY_CONTAINS(sorted_oa_locations.host_type, 'repository') AS has_repository_copy,
        CASE WHEN best_oa_location IS NOT NULL THEN STRUCT(
            COALESCE(best_oa_location.pdf_url, best_oa_location.landing_page_url) AS url,
            best_oa_location.pdf_url AS url_for_pdf,
            best_oa_location.landing_page_url AS url_for_landing_page,
            'deprecated' AS evidence,
            best_oa_location.license,
            -- Same null-version fallback as the entries of oa_locations, so the best
            -- location never disagrees with its own entry in that array.
            CASE
                WHEN best_oa_location.version IS NULL AND best_oa_location.host_type = 'repository' THEN 'submittedVersion'
                WHEN best_oa_location.version IS NULL AND best_oa_location.host_type = 'publisher' THEN 'publishedVersion'
                ELSE best_oa_location.version
            END AS version,
            best_oa_location.host_type,
            TRUE AS is_best,
            best_oa_location.pmh_id,
            best_oa_location.endpoint_id,
            CASE WHEN best_oa_location.source.type = 'repository' THEN best_oa_location.source.host_organization_name ELSE NULL END AS repository_institution,
            published_date AS oa_date,
            'deprecated' AS updated
        ) END AS best_oa_location,
        -- first_oa_location is built from the same raw location as best_oa_location.
        CASE WHEN best_oa_location IS NOT NULL THEN STRUCT(
            COALESCE(best_oa_location.pdf_url, best_oa_location.landing_page_url) AS url,
            best_oa_location.pdf_url AS url_for_pdf,
            best_oa_location.landing_page_url AS url_for_landing_page,
            'deprecated' AS evidence,
            best_oa_location.license,
            CASE
                WHEN best_oa_location.version IS NULL AND best_oa_location.host_type = 'repository' THEN 'submittedVersion'
                WHEN best_oa_location.version IS NULL AND best_oa_location.host_type = 'publisher' THEN 'publishedVersion'
                ELSE best_oa_location.version
            END AS version,
            best_oa_location.host_type,
            TRUE AS is_best,
            best_oa_location.pmh_id,
            best_oa_location.endpoint_id,
            CASE WHEN best_oa_location.source.type = 'repository' THEN best_oa_location.source.host_organization_name ELSE NULL END AS repository_institution,
            published_date AS oa_date,
            'deprecated' AS updated
        ) END AS first_oa_location,
        sorted_oa_locations AS oa_locations,
        ARRAY() AS oa_locations_embargoed,
        updated,
        data_standard,
        z_authors
    FROM wunpaywall_base
    LEFT JOIN original_crossref_types USING (doi)
),

wunpaywall_manual_overwrite AS (
    -- Applies manual overrides: a work whose DOI appears in oa_manual_override is
    -- reported as closed, with every OA-derived field cleared. remove_oa_locations is
    -- NULL for works without an override, which falls through to the ELSE branches.
    -- NOTE(review): the join uses doi before it is lower-cased below -- confirm both
    -- sides use the same casing.
    SELECT
        LOWER(doi) AS doi,
        LOWER(doi_url) AS doi_url,
        title,
        genre,
        is_paratext,
        published_date,
        year,
        journal_name,
        journal_issns,
        journal_issn_l,
        journal_is_oa,
        journal_is_in_doaj,
        publisher,
        CASE WHEN remove_oa_locations THEN FALSE ELSE is_oa END AS is_oa,
        CASE WHEN remove_oa_locations THEN 'closed' ELSE oa_status END AS oa_status,
        -- A record stripped of all OA locations cannot report a repository copy.
        CASE WHEN remove_oa_locations THEN FALSE ELSE has_repository_copy END AS has_repository_copy,
        CASE WHEN remove_oa_locations THEN NULL ELSE best_oa_location END AS best_oa_location,
        CASE WHEN remove_oa_locations THEN NULL ELSE first_oa_location END AS first_oa_location,
        -- NOTE(review): overridden works get NULL here, while works with no OA location
        -- get an empty array -- confirm consumers accept both.
        CASE WHEN remove_oa_locations THEN NULL ELSE oa_locations END AS oa_locations,
        ARRAY() AS oa_locations_embargoed,
        updated,
        data_standard,
        z_authors
    FROM wunpaywall
    LEFT JOIN oa_manual_override USING (doi)
),

json_payloads AS (
  -- Serializes each record twice: the full response, and a copy without z_authors
  -- that is used only for change detection.
  SELECT
    doi,

    -- Source work's update time; used only to pick one row per DOI in the next CTE.
    updated AS source_updated,

    -- response
    TO_JSON(NAMED_STRUCT(
      'doi', doi,
      'doi_url', doi_url,
      'title', title,
      'genre', genre,
      'is_paratext', is_paratext,
      'published_date', published_date,
      'year', year,
      'journal_name', journal_name,
      'journal_issns', journal_issns,
      'journal_issn_l', journal_issn_l,
      'journal_is_oa', journal_is_oa,
      'journal_is_in_doaj', journal_is_in_doaj,
      'publisher', publisher,
      'is_oa', is_oa,
      'oa_status', oa_status,
      'has_repository_copy', has_repository_copy,
      'best_oa_location', best_oa_location,
      'first_oa_location', first_oa_location,
      'oa_locations', oa_locations,
      'oa_locations_embargoed', oa_locations_embargoed,
      'data_standard', data_standard,
      'z_authors', z_authors
    ), MAP('ignoreNullFields', 'false')) AS json_response,

    -- for hash (no z_authors): author changes alone do not alter the hash
    TO_JSON(NAMED_STRUCT(
      'doi', doi,
      'doi_url', doi_url,
      'title', title,
      'genre', genre,
      'is_paratext', is_paratext,
      'published_date', published_date,
      'year', year,
      'journal_name', journal_name,
      'journal_issns', journal_issns,
      'journal_issn_l', journal_issn_l,
      'journal_is_oa', journal_is_oa,
      'journal_is_in_doaj', journal_is_in_doaj,
      'publisher', publisher,
      'is_oa', is_oa,
      'oa_status', oa_status,
      'has_repository_copy', has_repository_copy,
      'best_oa_location', best_oa_location,
      'first_oa_location', first_oa_location,
      'oa_locations', oa_locations,
      'oa_locations_embargoed', oa_locations_embargoed,
      'data_standard', data_standard
    ), MAP('ignoreNullFields', 'false')) AS json_for_hash
  FROM wunpaywall_manual_overwrite
),

unpaywall_staging_prepared AS (
    -- One row per DOI, ready for the MERGE: the response JSON with an "updated"
    -- stamp appended, the load timestamp, and the SHA-256 content hash.
    SELECT
        doi,
        -- Appends "updated" by rewriting the closing brace of the JSON object.
        -- NOTE(review): the 'Z' suffix is a literal; it is only accurate if the
        -- session time zone is UTC -- confirm.
        REGEXP_REPLACE(json_response, '}$', ', "updated":"' || DATE_FORMAT(CURRENT_TIMESTAMP(), 'yyyy-MM-dd\'T\'HH:mm:ss\'Z\'') || '"}') AS json_with_ts,
        CURRENT_TIMESTAMP() AS updated_date,
        SHA2(json_for_hash, 256) AS content_hash
    FROM json_payloads
    -- Deterministic dedup: keep the most recently updated source work per DOI and
    -- break ties on the hash JSON. Ordering by this CTE's own updated_date would
    -- order by a constant and pick an arbitrary row.
    QUALIFY ROW_NUMBER() OVER (
        PARTITION BY doi
        ORDER BY source_updated DESC NULLS LAST, json_for_hash
    ) = 1
)

-- Upsert the staged rows into production, keyed on doi.
-- Existing rows are rewritten only when the content hash differs, so unchanged
-- records keep their previous json_response and updated_date.
MERGE INTO identifier('openalex' || :env_suffix || '.unpaywall.unpaywall') AS tgt
USING unpaywall_staging_prepared AS src
ON tgt.doi = src.doi
-- Null-safe comparison: with <>, a target row whose content_hash is NULL would
-- evaluate to NULL and never be refreshed.
WHEN MATCHED AND NOT (tgt.content_hash <=> src.content_hash) THEN
  UPDATE SET
    tgt.json_response = src.json_with_ts,
    tgt.updated_date = src.updated_date,
    tgt.content_hash = src.content_hash
WHEN NOT MATCHED THEN
  INSERT (doi, json_response, updated_date, content_hash)
  VALUES (src.doi, src.json_with_ts, src.updated_date, src.content_hash);