### Combine Crossref, Backfill, Gateway to Research, and NIH awards into a single Awards table

In [0]:
%sql
-- Rebuilds openalex.awards.openalex_awards from four source tables
-- (backfill, crossref, gateway-to-research, NIH).
--
-- Steps:
--   1. award_aggregates    : per award, the works that reference it (first 100
--                            as URLs) and the distinct count of such works.
--   2. combined            : UNION ALL of the four sources, projected to a common
--                            column list, each tagged with a `priority`.
--   3. deduplicated_awards : keeps one row per hashed award id, lowest
--                            `priority` value first.
--   4. final SELECT        : explicit output column list, works_api_url derived
--                            from the final id, aggregates attached by LEFT JOIN.
--
-- NOTE(review): award ids are abs(xxhash64(id)) % 9000000000, so two different
-- source ids can map to the same id and collapse into one row in step 3.
-- Confirm this is acceptable for the expected volume.
CREATE OR REPLACE TABLE openalex.awards.openalex_awards
USING delta
AS
WITH award_aggregates AS (
  -- Grouped by the parsed numeric key (the same value the final join uses) so
  -- there is exactly one aggregate row per join key and the LEFT JOIN below
  -- cannot multiply award rows.
  SELECT
    CAST(REPLACE(award.id, 'https://openalex.org/G', '') AS BIGINT) AS award_id,
    -- Sort before capping so the retained 100 works are the same on every
    -- rebuild; cap before TRANSFORM so URLs are only built for kept elements.
    TRANSFORM(
      SLICE(ARRAY_SORT(COLLECT_LIST(DISTINCT id)), 1, 100),
      w -> CONCAT('https://openalex.org/W', w)
    ) AS funded_outputs,
    COUNT(DISTINCT id) AS funded_outputs_count
  FROM openalex.works.openalex_works
  LATERAL VIEW explode(awards) as award
  WHERE size(awards) > 0
  GROUP BY CAST(REPLACE(award.id, 'https://openalex.org/G', '') AS BIGINT)
),
combined AS (
  -- Every branch must emit the same columns in the same order (UNION ALL is
  -- positional). `source_id` and `priority` are helper columns used only for
  -- deduplication and are not written to the output table.

  -- Backfill awards: only funder fields are populated.
  SELECT
    abs(xxhash64(b.id)) % 9000000000 AS id,
    NULL AS display_name,
    NULL AS description,
    funder_id,
    funder_award_id,
    NULL AS amount,
    NULL AS currency,
    struct(
      funder.id,
      funder.display_name,
      funder.ror_id,
      funder.doi
    ) AS funder,
    NULL AS funding_type,
    NULL AS funder_scheme,
    'crossref_work.grants' AS provenance,
    NULL AS start_date,
    NULL AS end_date,
    NULL AS start_year,
    NULL AS end_year,
    NULL AS lead_investigator,
    NULL AS co_lead_investigator,
    NULL AS investigators,
    NULL AS landing_page_url,
    NULL AS doi,
    created_date,
    updated_date,
    CAST(b.id AS STRING) AS source_id,
    2 AS priority  -- lower priority for backfill
  FROM openalex.awards.backfill_awards AS b

  UNION ALL

  -- Crossref awards: full metadata passed through.
  SELECT
    abs(xxhash64(c.id)) % 9000000000 AS id,
    display_name,
    description,
    funder_id,
    funder_award_id,
    amount,
    currency,
    funder,
    funding_type,
    funder_scheme,
    'crossref_work' AS provenance,
    start_date,
    end_date,
    start_year,
    end_year,
    lead_investigator,
    co_lead_investigator,
    investigators,
    landing_page_url,
    doi,
    created_date,
    updated_date,
    CAST(c.id AS STRING) AS source_id,
    1 AS priority  -- higher priority for crossref
  FROM openalex.awards.crossref_awards AS c

  UNION ALL

  -- Gateway to Research awards: only funder fields are populated.
  SELECT
    abs(xxhash64(g.id)) % 9000000000 AS id,
    NULL AS display_name,
    NULL AS description,
    funder_id,
    funder_award_id,
    NULL AS amount,
    NULL AS currency,
    funder,
    NULL AS funding_type,
    NULL AS funder_scheme,
    'gateway_to_research' AS provenance,
    NULL AS start_date,
    NULL AS end_date,
    NULL AS start_year,
    NULL AS end_year,
    NULL AS lead_investigator,
    NULL AS co_lead_investigator,
    NULL AS investigators,
    NULL AS landing_page_url,
    NULL AS doi,
    created_date,
    updated_date,
    CAST(g.id AS STRING) AS source_id,
    3 AS priority  -- lowest priority of the four sources
  FROM openalex.awards.gtr_awards AS g

  UNION ALL

  -- NIH awards: full metadata passed through, including the table's own provenance.
  SELECT
    abs(xxhash64(n.id)) % 9000000000 AS id,
    display_name,
    description,
    funder_id,
    funder_award_id,
    amount,
    currency,
    funder,
    funding_type,
    funder_scheme,
    provenance,
    start_date,
    end_date,
    start_year,
    end_year,
    lead_investigator,
    co_lead_investigator,
    investigators,
    landing_page_url,
    doi,
    created_date,
    updated_date,
    CAST(n.id AS STRING) AS source_id,
    0 AS priority  -- high priority for NIH (rich metadata)
  FROM openalex.awards.nih_awards AS n
),
deduplicated_awards AS (
  -- One row per hashed id. Lowest `priority` wins; ties within the same
  -- priority (duplicate source rows or hash collisions) are broken by the most
  -- recent updated_date and then by source_id, so the surviving row does not
  -- depend on execution order.
  SELECT
    *
  FROM (
    SELECT
      c.*,
      ROW_NUMBER() OVER (
        PARTITION BY c.id
        ORDER BY c.priority, c.updated_date DESC NULLS LAST, c.source_id
      ) AS row_num
    FROM combined AS c
  )
  WHERE row_num = 1
)
SELECT
  da.id,
  da.display_name,
  da.description,
  da.funder_id,
  da.funder_award_id,
  da.amount,
  da.currency,
  da.funder,
  da.funding_type,
  da.funder_scheme,
  da.provenance,
  da.start_date,
  da.end_date,
  da.start_year,
  da.end_year,
  da.lead_investigator,
  da.co_lead_investigator,
  da.investigators,
  da.landing_page_url,
  da.doi,
  -- Derived once from the final id instead of repeating the hash in every branch.
  CONCAT('https://api.openalex.org/works?filter=awards.id:G', da.id) AS works_api_url,
  da.created_date,
  da.updated_date,
  -- Awards with no matching works get an empty list and a zero count.
  COALESCE(aa.funded_outputs, ARRAY()) AS funded_outputs,
  COALESCE(aa.funded_outputs_count, 0) AS funded_outputs_count
FROM
  deduplicated_awards AS da
  LEFT JOIN award_aggregates AS aa
    ON da.id = aa.award_id