In [0]:
-- Working set: every column of the OpenAlex works table, restricted to rows
-- whose indexed_in_crossref flag is true.
CREATE OR REPLACE TEMP VIEW openalex_base AS
SELECT *
FROM openalex.works.openalex_works
WHERE indexed_in_crossref;

-- Most recent Crossref `type` per DOI.
-- crossref_exploded holds duplicates because records update over time and are
-- not deduplicated until crossref_works, so each DOI's rows are ranked by
-- indexed.`date-time` (newest first) and only the top row is kept.
CREATE OR REPLACE TEMP VIEW original_crossref_types AS
SELECT
    doi,
    original_type
FROM (
    SELECT
        doi,
        type AS original_type,
        ROW_NUMBER() OVER (
            PARTITION BY doi
            ORDER BY indexed.`date-time` DESC
        ) AS recency_rank
    FROM openalex.crossref.crossref_exploded
) AS ranked
WHERE recency_rank = 1;

-- Manual override table: keep only the rows whose stored response is the
-- literal string {} and tag them with remove_oa_locations = true so that
-- wunpaywall_manual_overwrite can strip their OA data.
CREATE OR REPLACE TEMP VIEW oa_manual_override AS
SELECT
    doi,
    TRUE AS remove_oa_locations,
    response_jsonb
FROM openalex.unpaywall.oa_manual
WHERE response_jsonb = '{}';

-- Per-work base for the Unpaywall-style export, built from openalex_base.
-- Produces: the bare DOI and its URL, journal fields taken from the first
-- location flagged is_unpaywall_record, the OA locations sorted best-first
-- (raw and reshaped), the derived oa_status label, and the first 100 authorships.
-- Several select items reference aliases defined earlier in this same select
-- list (e.g. sorted_raw_locations, oa_status_num), so the statement depends on
-- lateral column alias resolution.
create or replace temp view wunpaywall_base as (
  select
    -- Strip the resolver prefix to get the bare DOI; the original value is kept as doi_url.
    replace(doi, "https://doi.org/", "") as doi,
    doi as doi_url,
    title,
    type,
    publication_date as published_date,
    publication_year as year,
    -- These five journal fields are only captured in OpenAlex on the level of granularity of the location.
    -- Each one repeats the same lookup: first element of `locations` whose
    -- is_unpaywall_record flag is true, then a field of its `source` struct.
    get(
      filter(
        locations,
        x -> x.is_unpaywall_record = True
      ),
      0
    ).source.display_name as journal_name,
    -- ISSNs are sorted before joining so the comma-separated string is stable.
    array_join(
      array_sort(
        get(
          filter(
            locations,
            x -> x.is_unpaywall_record = True
          ),
          0
        ).source.issns
      ),
      ','
    ) as journal_issns,
    get(
      filter(
        locations,
        x -> x.is_unpaywall_record = True
      ),
      0
    ).source.issn_l as journal_issn_l,
    get(
      filter(
        locations,
        x -> x.is_unpaywall_record = True
      ),
      0
    ).source.is_oa as journal_is_oa,
    get(
      filter(
        locations,
        x -> x.is_unpaywall_record = True
      ),
      0
    ).source.is_in_doaj as journal_is_in_doaj,
    publisher,
    
    -- First, sort all OA locations just once by priority and URL
    -- Only locations that are OA and have at least one URL (pdf or landing page) are kept.
    (
      select
        array_sort(
          filter(
            locations,
            x -> (x.is_oa) AND (x.pdf_url IS NOT NULL OR x.landing_page_url IS NOT NULL)
          ),
          -- Sort using the priority directly, then URL for stability
          -- Comparator contract: -1 when a sorts before b, 1 when after, 0 when tied.
          -- The oa_status -> rank mapping below is spelled out four times (a and b,
          -- for both the "<" and ">" tests); all four copies must stay identical.
          -- Note that oa_status 1 and 2 share rank 1, so they tie on the primary key.
          (a, b) -> case 
            -- Primary sort: Use the priority directly
            when (case
              when a.oa_status = 1 then 1 -- gold (diamond converted to gold)
              when a.oa_status = 2 then 1 -- gold
              when a.oa_status = 3 then 2 -- hybrid
              when a.oa_status = 4 then 3 -- bronze
              when a.oa_status = 5 then 4 -- green
              else 5 -- closed gets lowest priority
            end) < (case
              when b.oa_status = 1 then 1 -- gold (diamond converted to gold)
              when b.oa_status = 2 then 1 -- gold
              when b.oa_status = 3 then 2 -- hybrid
              when b.oa_status = 4 then 3 -- bronze
              when b.oa_status = 5 then 4 -- green
              else 5 -- closed gets lowest priority
            end) then -1
            when (case
              when a.oa_status = 1 then 1 -- gold (diamond converted to gold)
              when a.oa_status = 2 then 1 -- gold
              when a.oa_status = 3 then 2 -- hybrid
              when a.oa_status = 4 then 3 -- bronze
              when a.oa_status = 5 then 4 -- green
              else 5 -- closed gets lowest priority
            end) > (case
              when b.oa_status = 1 then 1 -- gold (diamond converted to gold)
              when b.oa_status = 2 then 1 -- gold
              when b.oa_status = 3 then 2 -- hybrid
              when b.oa_status = 4 then 3 -- bronze
              when b.oa_status = 5 then 4 -- green
              else 5 -- closed gets lowest priority
            end) then 1
            
            -- Same priority rank: fall through a chain of tie-breakers so the
            -- resulting order does not depend on the input array order.
            else case
              -- by url
              -- (the filter above guarantees at least one of the two URLs is non-null)
              when coalesce(a.pdf_url, a.landing_page_url) < coalesce(b.pdf_url, b.landing_page_url) then -1
              when coalesce(a.pdf_url, a.landing_page_url) > coalesce(b.pdf_url, b.landing_page_url) then 1

              -- by repository institution name
              -- (non-repository sources and null names compare as the empty string)
              when coalesce(
                case when a.source.type = 'repository' then a.source.host_organization_name else '' end, 
                ''
              ) < coalesce(
                case when b.source.type = 'repository' then b.source.host_organization_name else '' end, 
                ''
              ) then -1
              when coalesce(
                case when a.source.type = 'repository' then a.source.host_organization_name else '' end, 
                ''
              ) > coalesce(
                case when b.source.type = 'repository' then b.source.host_organization_name else '' end, 
                ''
              ) then 1

              -- by pmh_id
              when coalesce(a.pmh_id, '') < coalesce(b.pmh_id, '') then -1
              when coalesce(a.pmh_id, '') > coalesce(b.pmh_id, '') then 1

              -- by endpoint_id
              when coalesce(a.endpoint_id, '') < coalesce(b.endpoint_id, '') then -1
              when coalesce(a.endpoint_id, '') > coalesce(b.endpoint_id, '') then 1

              else 0
            end
            
          end
        )
    ) as sorted_raw_locations,
    
    -- Use the first element as the best_oa_location
    -- (no else branch: the result is null when there are no OA locations)
    case 
      when size(sorted_raw_locations) > 0 then 
        get(sorted_raw_locations, 0)
    end as best_oa_location,
    
    -- Transform the sorted locations into our desired format
    -- evidence and updated are filled with the constant 'deprecated'.
    transform(
      sorted_raw_locations,
      y -> struct(
        coalesce(y.pdf_url, y.landing_page_url) as url,
        y.pdf_url as url_for_pdf,
        y.landing_page_url as url_for_landing_page,
        'deprecated' as evidence,
        y.license,
        y.version,
        y.host_type,
        -- Mark as best if it's the first element in the sorted array
        -- The test is whole-struct equality with element 0, so any later
        -- location that is an exact duplicate of the first is flagged as well.
        case when size(sorted_raw_locations) > 0 and y = get(sorted_raw_locations, 0) then true else false end as is_best,
        y.pmh_id,
        y.endpoint_id,
        case
          when y.source.type = 'repository' then y.source.host_organization_name
          else null
        end as repository_institution,
        -- Every location receives the work-level published_date as its oa_date.
        published_date as oa_date,
        'deprecated' as updated
      )
    ) as sorted_oa_locations,
    
    -- Smallest oa_status number among the kept OA locations, then mapped to a label.
    -- When there are no OA locations none of the numbered branches match and
    -- the label falls through to 'closed'.
    array_min(transform(sorted_raw_locations, z -> z.oa_status)) as oa_status_num,
    case
      when oa_status_num = 1 then 'gold' -- no diamond in unpaywall, convert to gold.
      when oa_status_num = 2 then 'gold'
      when oa_status_num = 3 then 'hybrid'
      when oa_status_num = 4 then 'bronze'
      when oa_status_num = 5 then 'green'
      else 'closed'
    end as oa_status,
    updated_date as updated,
    2 as data_standard,
    slice(authorships, 1, 100) as z_authors -- limit to first 100
  from
    openalex_base
);

-- Unpaywall-shaped record per work: wunpaywall_base left-joined to the latest
-- Crossref type per DOI, so works without a Crossref row keep their own type.
--
-- NOTE(review): best_oa_location and first_oa_location are output aliases that
-- are built from the wunpaywall_base column also named best_oa_location. The
-- field accesses below (.pdf_url, .source, ...) only exist on that source
-- column, so this presumably relies on source columns taking precedence over
-- same-named select-list aliases -- keep the names and order as they are.
CREATE OR REPLACE TEMP VIEW wunpaywall AS
SELECT
    doi,
    doi_url,
    title,
    COALESCE(original_type, type) AS genre,
    -- The list that defines 'paratext' was taken from Casey's code in the
    -- Crossref dlt notebook.
    genre IN (
        'book-series',
        'component',
        'journal',
        'journal-issue',
        'journal-volume',
        'proceedings',
        'proceedings-series',
        'report-series'
    ) AS is_paratext,
    published_date,
    year,
    journal_name,
    journal_issns,
    journal_issn_l,
    journal_is_oa,
    journal_is_in_doaj,
    publisher,
    SIZE(sorted_oa_locations) > 0 AS is_oa,
    oa_status,
    ARRAY_CONTAINS(sorted_oa_locations.host_type, 'repository') AS has_repository_copy,

    -- Best location reshaped into the output struct; null when the work has
    -- no OA location.
    CASE
        WHEN best_oa_location IS NOT NULL THEN STRUCT(
            COALESCE(best_oa_location.pdf_url, best_oa_location.landing_page_url) AS url,
            best_oa_location.pdf_url AS url_for_pdf,
            best_oa_location.landing_page_url AS url_for_landing_page,
            'deprecated' AS evidence,
            best_oa_location.license,
            best_oa_location.version,
            best_oa_location.host_type,
            TRUE AS is_best,
            best_oa_location.pmh_id,
            best_oa_location.endpoint_id,
            CASE
                WHEN best_oa_location.source.type = 'repository'
                    THEN best_oa_location.source.host_organization_name
            END AS repository_institution,
            published_date AS oa_date,
            'deprecated' AS updated
        )
    END AS best_oa_location,

    -- first_oa_location deliberately mirrors best_oa_location field for field.
    CASE
        WHEN best_oa_location IS NOT NULL THEN STRUCT(
            COALESCE(best_oa_location.pdf_url, best_oa_location.landing_page_url) AS url,
            best_oa_location.pdf_url AS url_for_pdf,
            best_oa_location.landing_page_url AS url_for_landing_page,
            'deprecated' AS evidence,
            best_oa_location.license,
            best_oa_location.version,
            best_oa_location.host_type,
            TRUE AS is_best,
            best_oa_location.pmh_id,
            best_oa_location.endpoint_id,
            CASE
                WHEN best_oa_location.source.type = 'repository'
                    THEN best_oa_location.source.host_organization_name
            END AS repository_institution,
            published_date AS oa_date,
            'deprecated' AS updated
        )
    END AS first_oa_location,

    sorted_oa_locations AS oa_locations,
    ARRAY() AS oa_locations_embargoed,
    updated,
    data_standard,
    z_authors
FROM wunpaywall_base
LEFT JOIN original_crossref_types USING (doi);

-- Apply the manual overrides and normalise DOI casing.
--
-- For a DOI present in oa_manual_override (remove_oa_locations = true) the
-- record is forced closed: is_oa = false, oa_status = 'closed',
-- has_repository_copy = false, and the three location fields become null.
-- DOIs without an override have remove_oa_locations = null, which makes every
-- CASE below fall through to its ELSE branch and keep the computed value.
--
-- NOTE(review): the join runs on the DOI as it comes out of wunpaywall, i.e.
-- before lower() is applied, so it assumes both sides already use the same
-- casing -- confirm against oa_manual.
-- NOTE(review): more than one override row for a DOI would duplicate the
-- work's row here; the later per-DOI dedup step collapses them.
create or replace temp view wunpaywall_manual_overwrite as (
  select
    lower(doi) as doi,
    -- all dois should be lowercase
    lower(doi_url) as doi_url,
    -- doi's are case insensitive, so we aren't breaking any links here.
    title,
    genre,
    is_paratext,
    published_date,
    year,
    journal_name,
    journal_issns,
    journal_issn_l,
    journal_is_oa,
    journal_is_in_doaj,
    publisher,
    case
      when remove_oa_locations then false
      else is_oa
    end as is_oa,
    case
      when remove_oa_locations then "closed"
      else oa_status
    end as oa_status,
    -- An overridden record has no OA locations left, so it cannot report a
    -- repository copy; without this the flag could stay true on a closed record.
    case
      when remove_oa_locations then false
      else has_repository_copy
    end as has_repository_copy,
    case
      when remove_oa_locations then null
      else best_oa_location
    end as best_oa_location,
    case
      when remove_oa_locations then null
      else first_oa_location
    end as first_oa_location,
    case
      when remove_oa_locations then null
      else oa_locations
    end as oa_locations,
    array() as oa_locations_embargoed,
    updated,
    data_standard,
    z_authors
  from
    wunpaywall
    left join oa_manual_override using(doi)
);

-- Serialization, staging, and merge into the production table

-- Serialise every record to a JSON string. The `updated` timestamp is left out
-- of the struct on purpose; it is appended to the JSON text in a later step.
-- ignoreNullFields = false keeps null-valued keys in the output.
CREATE OR REPLACE TEMP VIEW to_json AS
WITH response_records AS (
    SELECT
        doi,
        named_struct(
            'doi', doi,
            'doi_url', doi_url,
            'title', title,
            'genre', genre,
            'is_paratext', is_paratext,
            'published_date', published_date,
            'year', year,
            'journal_name', journal_name,
            'journal_issns', journal_issns,
            'journal_issn_l', journal_issn_l,
            'journal_is_oa', journal_is_oa,
            'journal_is_in_doaj', journal_is_in_doaj,
            'publisher', publisher,
            'is_oa', is_oa,
            'oa_status', oa_status,
            'has_repository_copy', has_repository_copy,
            'best_oa_location', best_oa_location,
            'first_oa_location', first_oa_location,
            'oa_locations', oa_locations,
            'oa_locations_embargoed', oa_locations_embargoed,
            'data_standard', data_standard,
            'z_authors', z_authors
        ) AS response_struct
    FROM wunpaywall_manual_overwrite
)
SELECT
    doi,
    to_json(response_struct, map('ignoreNullFields', 'false')) AS json_response
FROM response_records;

-- Staging table, rebuilt on every run: one row per serialised record, stamped
-- with the time of this load. The JSON stored here carries no timestamp yet.
CREATE OR REPLACE TABLE openalex.unpaywall.unpaywall_staging AS
SELECT
    doi,
    json_response AS json_payload,
    current_timestamp() AS stage_ts
FROM to_json;

-- Production table, created only if missing. json_response is the final JSON
-- (timestamp included), updated_date is when the row last changed, and
-- content_hash is the hash of the JSON without the timestamp.
CREATE TABLE IF NOT EXISTS openalex.unpaywall.unpaywall (
    doi STRING PRIMARY KEY,
    json_response STRING,
    updated_date TIMESTAMP,
    content_hash STRING
)
CLUSTER BY AUTO; -- liquid clustering with automatically chosen keys

-- Collapse staging to exactly one row per DOI before the merge, so each target
-- row is matched by at most one source row.
--
-- stage_ts is written by a single CREATE TABLE AS SELECT using
-- current_timestamp(), so every staging row carries the same value and
-- ordering by it alone cannot separate duplicates. json_payload is added as a
-- tiebreaker so the surviving row (and therefore its content hash) is the same
-- on every run instead of being picked arbitrarily.
CREATE OR REPLACE TEMP VIEW unpaywall_staging_deduplicated AS
SELECT
    doi,
    json_payload,
    stage_ts
FROM (
    SELECT
        doi,
        json_payload,
        stage_ts,
        ROW_NUMBER() OVER (
            PARTITION BY doi
            ORDER BY stage_ts DESC, json_payload
        ) AS row_num
    FROM openalex.unpaywall.unpaywall_staging
)
WHERE row_num = 1;

-- Final shape of each staged row ahead of the merge:
--   json_response : the staged JSON with an "updated" key spliced in before
--                   the closing brace
--   updated_date  : the staging timestamp
--   content_hash  : SHA-256 of the JSON *without* the "updated" key, so the
--                   hash only moves when the actual content changes
--
-- NOTE(review): the pattern appends a literal 'Z' while date_format renders
-- stage_ts in the session time zone; this presumably assumes the session runs
-- in UTC -- confirm.
CREATE OR REPLACE TEMP VIEW unpaywall_staging_prepared AS
SELECT
    doi,
    regexp_replace(
        json_payload,
        '}$',
        concat(
            ', "updated":"',
            date_format(stage_ts, 'yyyy-MM-dd\'T\'HH:mm:ss\'Z\''),
            '"}'
        )
    ) AS json_response,
    stage_ts AS updated_date,
    sha2(json_payload, 256) AS content_hash
FROM unpaywall_staging_deduplicated;

-- MERGE  (stage → production)
-- Existing DOIs are rewritten only when their content hash differs, so
-- updated_date moves only on a real content change; unseen DOIs are inserted.
-- The comparison is null-safe (<=>): with a plain <> a target row whose
-- content_hash is NULL would evaluate to NULL and never be refreshed.
-- Target rows whose DOI is absent from staging are left untouched.
MERGE INTO openalex.unpaywall.unpaywall      AS tgt
USING      unpaywall_staging_prepared         AS src
ON tgt.doi = src.doi

WHEN MATCHED AND NOT (tgt.content_hash <=> src.content_hash) THEN
  UPDATE SET
        tgt.json_response = src.json_response,
        tgt.updated_date  = src.updated_date,
        tgt.content_hash  = src.content_hash

WHEN NOT MATCHED THEN
  INSERT (doi, json_response, updated_date, content_hash)
  VALUES (src.doi, src.json_response, src.updated_date, src.content_hash);