### Ingest awards from "Gateway to Research" parquet file

In [None]:
%sql
-- Build the Gateway to Research (GtR) award table with work linkage.
-- Each publication row in the source parquet is exploded over its grant
-- references, restricted to the funders in target_funder_list, matched to a
-- funder record by alternate title, and left-joined to work locations by DOI.
-- Result: one row per distinct (award, funder, linked work) combination.
CREATE OR REPLACE TABLE openalex.awards.gtr_awards
USING delta
AS
WITH target_funder_list AS (
    -- Funder acronyms accepted from the source; also used to select funder rows.
    SELECT explode(array(
        'AHRC', 'BBSRC', 'EPSRC', 'ESRC',
        'MRC', 'NERC', 'STFC', 'INNOVATE UK'
    )) AS funder
),
work_locations AS (
    -- NOTE(review): native_id_namespace is not filtered here, so any location
    -- whose native_id equals the DOI string will link, whatever its namespace.
    -- Confirm whether a namespace filter is wanted.
    SELECT
        work_id,
        native_id
    FROM openalex.works.locations_mapped
),
exploded_awards AS (
    SELECT
        -- Keep everything after the last literal 'doi.org/'. Unlike
        -- split(...)[1], substring_index does not treat the delimiter as a
        -- regex and returns the whole value when the prefix is absent, so a
        -- bare DOI is kept instead of becoming NULL (or raising an
        -- out-of-bounds index error under ANSI mode). An empty remainder is
        -- mapped to NULL so it can never act as a join key.
        NULLIF(substring_index(doi, 'doi.org/', -1), '') AS doi,
        explode(grant_reference) AS raw_grant
    FROM
        parquet.`s3a://openalex-ingest/awards/GatewayToResearch_2025-11-24.parquet`
    WHERE
        -- `doi` here is the raw source column, not the alias defined above.
        doi IS NOT NULL
        AND size(grant_reference) > 0
),
awards AS (
    SELECT
        doi,
        -- raw_grant is split on ':' into ID (index 0) and funder (index 1)
        split(raw_grant, ':')[0] AS funder_award_id,
        split(raw_grant, ':')[1] AS funder_name
    FROM
        exploded_awards
    WHERE
        split(raw_grant, ':')[1] IN (SELECT funder FROM target_funder_list)
),
funders AS (
    -- One row per (funder, alternate title) for United Kingdom funders whose
    -- alternate title is one of the target acronyms.
    SELECT DISTINCT funder_id, alternate_title, display_name, ror_id, doi
    FROM openalex.common.funder
    LATERAL VIEW explode(from_json(alternate_titles, 'array<string>')) as alternate_title
    WHERE
        alternate_title IN (SELECT funder FROM target_funder_list)
        AND location = 'United Kingdom'
),
award_works AS (
    -- DISTINCT collapses identical linkage rows that would otherwise be
    -- produced when the source repeats a (doi, grant) pair or when several
    -- location rows carry the same (work_id, native_id).
    SELECT DISTINCT
        -- Award ID: xxhash64 of funder_id:lower(funder_award_id), computed
        -- once here so the id and works_api_url columns cannot drift apart.
        abs(xxhash64(CONCAT(f.funder_id, ':', lower(a.funder_award_id)))) % 9000000000 AS award_id,
        f.funder_id,
        a.funder_award_id,
        f.display_name AS funder_display_name,
        f.ror_id AS funder_ror_id,
        f.doi AS funder_doi,
        lm.work_id,
        a.doi AS work_doi
    FROM awards AS a
    INNER JOIN funders AS f
        ON a.funder_name = f.alternate_title
    LEFT JOIN work_locations AS lm
        ON lm.native_id = a.doi
)
SELECT
    aw.award_id AS id,

    -- Display name (not available in this source)
    CAST(NULL AS STRING) AS display_name,

    -- Description (not available in this source)
    CAST(NULL AS STRING) AS description,

    -- Funder info
    aw.funder_id AS funder_id,
    aw.funder_award_id,

    -- Amount (not available in this source)
    CAST(NULL AS DOUBLE) AS amount,
    CAST(NULL AS STRING) AS currency,

    -- Funder struct
    struct(
        concat('https://openalex.org/F', aw.funder_id) AS id,
        aw.funder_display_name AS display_name,
        aw.funder_ror_id AS ror_id,
        aw.funder_doi AS doi
    ) AS funder,

    -- Funding type (not available in this source)
    CAST(NULL AS STRING) AS funding_type,

    -- Funder scheme (not available in this source)
    CAST(NULL AS STRING) AS funder_scheme,

    -- Provenance
    'gateway_to_research' AS provenance,

    -- Dates (not available in this source)
    CAST(NULL AS DATE) AS start_date,
    CAST(NULL AS DATE) AS end_date,
    CAST(NULL AS INT) AS start_year,
    CAST(NULL AS INT) AS end_year,

    -- Lead investigator (not available in this source)
    CAST(NULL AS STRUCT<
        given_name:STRING,
        family_name:STRING,
        orcid:STRING,
        role_start:DATE,
        affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
    >) AS lead_investigator,

    -- Co-lead investigator (not available in this source)
    CAST(NULL AS STRUCT<
        given_name:STRING,
        family_name:STRING,
        orcid:STRING,
        role_start:DATE,
        affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
    >) AS co_lead_investigator,

    -- Investigators (not available in this source)
    CAST(NULL AS ARRAY<STRUCT<
        given_name:STRING,
        family_name:STRING,
        orcid:STRING,
        role_start:DATE,
        affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
    >>) AS investigators,

    -- Landing page URL (not available in this source)
    CAST(NULL AS STRING) AS landing_page_url,

    -- DOI (not available in this source)
    CAST(NULL AS STRING) AS doi,

    -- Works API URL, built from the same award ID as the id column
    concat('https://api.openalex.org/works?filter=awards.id:G', aw.award_id) AS works_api_url,

    -- Timestamps
    current_timestamp() AS created_date,
    current_timestamp() AS updated_date,

    -- Keep work linkage info for reference (legacy columns)
    aw.work_id,
    aw.work_doi

FROM award_works AS aw

In [None]:
%sql
-- Insert into openalex_awards (excluding legacy work_id and work_doi columns).
-- gtr_awards carries the work linkage columns, so an award can appear on
-- several rows there; only award-level columns are inserted here, so exactly
-- one row per award id is kept to avoid inserting the same award repeatedly.
INSERT INTO openalex.awards.openalex_awards
SELECT
    id,
    display_name,
    description,
    funder_id,
    funder_award_id,
    amount,
    currency,
    funder,
    funding_type,
    funder_scheme,
    provenance,
    start_date,
    end_date,
    start_year,
    end_year,
    lead_investigator,
    co_lead_investigator,
    investigators,
    landing_page_url,
    doi,
    works_api_url,
    created_date,
    updated_date,
    ARRAY() as funded_outputs,
    0 as funded_outputs_count
FROM (
    SELECT
        id,
        display_name,
        description,
        funder_id,
        funder_award_id,
        amount,
        currency,
        funder,
        funding_type,
        funder_scheme,
        provenance,
        start_date,
        end_date,
        start_year,
        end_year,
        lead_investigator,
        co_lead_investigator,
        investigators,
        landing_page_url,
        doi,
        works_api_url,
        created_date,
        updated_date,
        -- Rank rows sharing an award id; the ordering only makes the kept row
        -- deterministic when funder_award_id differs (e.g. by letter case).
        ROW_NUMBER() OVER (
            PARTITION BY id
            ORDER BY funder_id, funder_award_id
        ) AS award_row_rank
    FROM openalex.awards.gtr_awards
) AS ranked_awards
WHERE award_row_rank = 1;