# Create GTR Project Awards from Gateway to Research API

Creates the `openalex.awards.gtr_project_awards` table from Gateway to Research (GtR) API data: ~171K unique awards with full metadata.

**Prerequisites:**
- Run `scripts/local/gtr_to_s3.py` to download and upload the data first.

**Data source:** https://gtr.ukri.org/gtr/api  
**S3 location:** `s3a://openalex-ingest/awards/gtr/gtr_projects.parquet`

**UKRI funders covered (the seven research councils plus Innovate UK):**
- AHRC (Arts and Humanities Research Council)
- BBSRC (Biotechnology and Biological Sciences Research Council)
- EPSRC (Engineering and Physical Sciences Research Council)
- ESRC (Economic and Social Research Council)
- MRC (Medical Research Council)
- NERC (Natural Environment Research Council)
- STFC (Science and Technology Facilities Council)
- Innovate UK

## Step 1: Create Staging Table from S3

In [None]:
%sql
-- Staging step: materialize the GtR Parquet export from S3 as a Delta table.
-- Every source column is kept as-is; databricks_ingested_at records when this
-- load ran.
CREATE OR REPLACE TABLE openalex.awards.gtr_projects_raw
USING delta
AS
SELECT
    *,
    current_timestamp() AS databricks_ingested_at
FROM
    parquet.`s3a://openalex-ingest/awards/gtr/gtr_projects.parquet`;

In [None]:
%sql
-- Sanity check: the staging table is expected to hold roughly 171K projects
SELECT
    COUNT(*) AS total_projects
FROM openalex.awards.gtr_projects_raw;

In [None]:
%sql
-- Peek at a few raw rows to eyeball the staged columns and values
SELECT *
FROM openalex.awards.gtr_projects_raw
LIMIT 5;

## Step 2: Create GTR Project Awards Table

In [None]:
%sql
-- Builds the GtR awards table from the raw staging table.
-- Pipeline: parse the grant reference -> resolve an OpenAlex funder_id (by
-- reference prefix first, then by lead_funder) -> join funder metadata ->
-- project into the awards schema. Rows whose funder cannot be resolved are
-- excluded by the inner join in awards_transformed.
CREATE OR REPLACE TABLE openalex.awards.gtr_project_awards
USING delta
AS
WITH
-- Map grant ID prefix directly to OpenAlex funder_id.
-- Using explicit funder_ids avoids alternate_title collisions (e.g., "NERC" matching both
-- Natural Environment Research Council AND Sight Research UK, which was formerly
-- "National Eye Research Centre").
prefix_to_funder AS (
    SELECT
        prefix,
        funder_id
    FROM (VALUES
        ('EP/', 4320334627),  -- EPSRC
        ('MR/', 4320334626),  -- MRC
        ('ST/', 4320334632),  -- STFC
        ('BB/', 4320334629),  -- BBSRC
        ('NE/', 4320334631),  -- NERC
        ('ES/', 4320334630),  -- ESRC
        ('AH/', 4320334609)   -- AHRC
    ) AS t(prefix, funder_id)
),

-- Exact lead_funder values that denote a funder itself (acronym or full name)
-- rather than a programme, each with its OpenAlex funder_id. This one list
-- drives both programme detection and the exact-name funder fallback, so the
-- two cannot disagree about which names count as a council.
council_to_funder AS (
    SELECT
        council_name,
        funder_id
    FROM (VALUES
        ('AHRC',                                                   4320334609),
        ('BBSRC',                                                  4320334629),
        ('EPSRC',                                                  4320334627),
        ('ESRC',                                                   4320334630),
        ('MRC',                                                    4320334626),
        ('NERC',                                                   4320334631),
        ('STFC',                                                   4320334632),
        ('Innovate UK',                                            4320335087),
        ('Arts and Humanities Research Council',                   4320334609),
        ('Biotechnology and Biological Sciences Research Council', 4320334629),
        ('Engineering and Physical Sciences Research Council',     4320334627),
        ('Economic and Social Research Council',                   4320334630),
        ('Medical Research Council',                               4320334626),
        ('Natural Environment Research Council',                   4320334631),
        ('Science and Technology Facilities Council',              4320334632)
    ) AS t(council_name, funder_id)
),

-- Get OpenAlex funder records for UK research councils using explicit funder_ids
funders AS (
    SELECT
        funder_id,
        display_name,
        ror_id,
        doi
    FROM openalex.common.funder
    WHERE funder_id IN (
        4320334609,  -- Arts and Humanities Research Council (AHRC)
        4320334629,  -- Biotechnology and Biological Sciences Research Council (BBSRC)
        4320334627,  -- Engineering and Physical Sciences Research Council (EPSRC)
        4320334630,  -- Economic and Social Research Council (ESRC)
        4320334626,  -- Medical Research Council (MRC)
        4320334631,  -- Natural Environment Research Council (NERC)
        4320334632,  -- Science and Technology Facilities Council (STFC)
        4320335087   -- Innovate UK
    )
),

-- Select the raw columns this pipeline uses and derive the grant prefix.
-- Columns are listed explicitly so a raw column that happens to share a name
-- with a derived one (grant_prefix, programme, matched_funder_id) cannot clash.
parsed AS (
    SELECT
        g.grant_reference,
        g.lead_funder,
        g.title,
        g.abstract,
        g.amount,
        g.grant_category,
        g.start_date,
        g.end_date,
        g.pi_given_name,
        g.pi_family_name,
        g.lead_org_name,
        -- Prefix = first two characters plus '/' (e.g., "EP/" for "EP/Y036530/1").
        -- NOTE(review): the '/' is appended, not read from the reference, so any
        -- reference that merely starts with a mapped two-letter code (e.g. one
        -- beginning "BBS") also resolves through prefix_to_funder. Confirm this
        -- is intended before tightening it.
        CASE
            WHEN LENGTH(g.grant_reference) >= 3
            THEN CONCAT(SUBSTRING(g.grant_reference, 1, 2), '/')
            ELSE NULL
        END AS grant_prefix
    FROM openalex.awards.gtr_projects_raw AS g
    WHERE g.grant_reference IS NOT NULL
),

-- Resolve the funder and programme for every parsed grant.
-- Both lookup tables are unique on their join key, so neither LEFT JOIN can
-- multiply rows.
with_funder AS (
    SELECT
        p.grant_reference,
        p.title,
        p.abstract,
        p.amount,
        p.grant_category,
        p.start_date,
        p.end_date,
        p.pi_given_name,
        p.pi_family_name,
        p.lead_org_name,
        -- A lead_funder that is not a known council name is treated as a
        -- programme name (UKRI FLF, GCRF, etc.); council names yield NULL.
        CASE
            WHEN ctf.council_name IS NULL THEN p.lead_funder
            ELSE NULL
        END AS programme,
        -- Funder resolution order:
        --   1. grant-reference prefix
        --   2. exact lead_funder match (acronym, full council name, 'Innovate UK')
        --   3. council acronym appearing anywhere inside lead_funder
        COALESCE(
            ptf.funder_id,
            ctf.funder_id,
            CASE
                WHEN p.lead_funder LIKE '%EPSRC%' THEN 4320334627
                WHEN p.lead_funder LIKE '%MRC%'   THEN 4320334626
                WHEN p.lead_funder LIKE '%BBSRC%' THEN 4320334629
                WHEN p.lead_funder LIKE '%NERC%'  THEN 4320334631
                WHEN p.lead_funder LIKE '%ESRC%'  THEN 4320334630
                WHEN p.lead_funder LIKE '%AHRC%'  THEN 4320334609
                WHEN p.lead_funder LIKE '%STFC%'  THEN 4320334632
                ELSE NULL
            END
        ) AS matched_funder_id
    FROM parsed AS p
    LEFT JOIN prefix_to_funder AS ptf
        ON p.grant_prefix = ptf.prefix
    LEFT JOIN council_to_funder AS ctf
        ON p.lead_funder = ctf.council_name
),

-- Join funder metadata and shape each grant into the awards schema.
-- The inner join drops grants with no resolved funder (a NULL
-- matched_funder_id never satisfies the equality).
awards_transformed AS (
    SELECT
        -- Unique ID: xxhash64 of "funder_id:lower(grant_reference)", folded into
        -- [0, 9000000000). Computed only here; works_api_url is derived from this
        -- column in the final projection so the two can never drift apart.
        abs(xxhash64(CONCAT(f.funder_id, ':', LOWER(w.grant_reference)))) % 9000000000 AS id,

        -- Display name = project title
        w.title AS display_name,

        -- Description = abstract
        w.abstract AS description,

        -- Funder info
        f.funder_id,
        w.grant_reference AS funder_award_id,

        -- Funding amount
        CAST(w.amount AS DOUBLE) AS amount,
        'GBP' AS currency,

        -- Funder struct
        struct(
            CONCAT('https://openalex.org/F', f.funder_id) AS id,
            f.display_name,
            f.ror_id,
            f.doi
        ) AS funder,

        -- Map grant_category to funding_type. Unlisted categories are
        -- lower-cased with spaces replaced by underscores; a NULL category
        -- falls through to ELSE and stays NULL.
        CASE w.grant_category
            WHEN 'Research Grant' THEN 'research'
            WHEN 'Fellowship'     THEN 'fellowship'
            WHEN 'Training Grant' THEN 'training'
            WHEN 'Studentship'    THEN 'studentship'
            WHEN 'Vouchers'       THEN 'voucher'
            ELSE LOWER(REPLACE(w.grant_category, ' ', '_'))
        END AS funding_type,

        -- Programme name as funder_scheme (UKRI FLF, GCRF, etc.) or NULL
        w.programme AS funder_scheme,

        -- Provenance
        'gateway_to_research' AS provenance,

        -- Dates
        CAST(w.start_date AS DATE) AS start_date,
        CAST(w.end_date AS DATE) AS end_date,
        YEAR(w.start_date) AS start_year,
        YEAR(w.end_date) AS end_year,

        -- Lead investigator (only when a family name is present)
        CASE
            WHEN w.pi_family_name IS NOT NULL THEN
                struct(
                    w.pi_given_name AS given_name,
                    w.pi_family_name AS family_name,
                    CAST(NULL AS STRING) AS orcid,
                    CAST(NULL AS DATE) AS role_start,
                    struct(
                        w.lead_org_name AS name,
                        'United Kingdom' AS country,
                        CAST(NULL AS ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>) AS ids
                    ) AS affiliation
                )
            ELSE NULL
        END AS lead_investigator,

        -- Co-lead and other investigators (not available in basic API);
        -- typed NULLs keep the column types explicit.
        CAST(NULL AS STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >) AS co_lead_investigator,

        CAST(NULL AS ARRAY<STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >>) AS investigators,

        -- Landing page URL
        CONCAT('https://gtr.ukri.org/projects?ref=', w.grant_reference) AS landing_page_url,

        -- No DOI for GTR grants
        CAST(NULL AS STRING) AS doi,

        -- Timestamps
        current_timestamp() AS created_date,
        current_timestamp() AS updated_date

    FROM with_funder AS w
    INNER JOIN funders AS f
        ON w.matched_funder_id = f.funder_id
)

-- Final projection: explicit column order for the target table, with the
-- Works API URL built from the already-computed award id.
SELECT
    id,
    display_name,
    description,
    funder_id,
    funder_award_id,
    amount,
    currency,
    funder,
    funding_type,
    funder_scheme,
    provenance,
    start_date,
    end_date,
    start_year,
    end_year,
    lead_investigator,
    co_lead_investigator,
    investigators,
    landing_page_url,
    doi,
    CONCAT('https://api.openalex.org/works?filter=awards.id:G', id) AS works_api_url,
    created_date,
    updated_date
FROM awards_transformed;

## Verification Queries

In [None]:
%sql
-- How many awards ended up in the final table?
SELECT
    COUNT(*) AS total_gtr_project_awards
FROM openalex.awards.gtr_project_awards;

In [None]:
%sql
-- Peek at a few finished award rows
SELECT *
FROM openalex.awards.gtr_project_awards
LIMIT 10;

In [None]:
%sql
-- Awards per funder, largest first
SELECT
    funder.display_name,
    COUNT(*) AS cnt
FROM openalex.awards.gtr_project_awards
GROUP BY
    funder.display_name
ORDER BY
    cnt DESC;

In [None]:
%sql
-- Awards per funding_type, largest first
SELECT
    funding_type,
    COUNT(*) AS cnt
FROM openalex.awards.gtr_project_awards
GROUP BY
    funding_type
ORDER BY
    cnt DESC;

In [None]:
%sql
-- Top 30 programmes (funder_scheme) by award count; awards with no
-- programme are left out
SELECT
    funder_scheme,
    COUNT(*) AS cnt
FROM openalex.awards.gtr_project_awards
WHERE funder_scheme IS NOT NULL
GROUP BY
    funder_scheme
ORDER BY
    cnt DESC
LIMIT 30;

In [None]:
%sql
-- Check data completeness.
-- COUNT(col) counts only non-NULL values, so each has_* figure is the number
-- of rows where that field is populated.
-- NULLIF guards the percentage columns: on an empty table COUNT(*) is 0, and
-- dividing by it raises DIVIDE_BY_ZERO when ANSI mode is enabled. With the
-- guard the percentages come back as NULL instead.
SELECT
    COUNT(*) AS total,
    COUNT(display_name) AS has_title,
    COUNT(description) AS has_abstract,
    COUNT(amount) AS has_amount,
    COUNT(start_date) AS has_start_date,
    COUNT(lead_investigator) AS has_pi,
    COUNT(funder_scheme) AS has_programme,
    ROUND(COUNT(amount) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_with_amount,
    ROUND(COUNT(lead_investigator) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_with_pi
FROM openalex.awards.gtr_project_awards;

In [None]:
%sql
-- Awards per start year for the 20 most recent years; rows with no
-- start_year are left out
SELECT
    start_year,
    COUNT(*) AS cnt
FROM openalex.awards.gtr_project_awards
WHERE start_year IS NOT NULL
GROUP BY
    start_year
ORDER BY
    start_year DESC
LIMIT 20;