# Create Wellcome Trust Awards

Creates awards from the Wellcome Trust grants database. ~20K grants.

**Prerequisites:**
- Run `scripts/local/wellcome_to_s3.py` first to download the data and upload it to the S3 location below.

**Data source:** https://wellcome.org/grant-funding/funded-people-and-projects  
**S3 location:** `s3a://openalex-ingest/awards/wellcome/wellcome_projects.parquet`

**Wellcome Trust funder:**
- funder_id: 4320311904
- display_name: "Wellcome Trust"
- ROR: https://ror.org/029chgv08
- DOI: 10.13039/100010269

**Notes:**
- Data follows the 360Giving standard
- Lead applicant name is available for most grants
- No ORCID available in the data
- Grant programme maps to funder_scheme (e.g., "PhD Studentship", "Investigator Award")
- Currency is GBP (GBP is also used as the fallback when a row has no currency value)

## Step 1: Create Staging Table from S3

In [None]:
%sql
-- Stage the Wellcome parquet export from S3 as a Delta table.
-- Every source column is carried over unchanged; the extra column records
-- the time at which this statement ran.
CREATE OR REPLACE TABLE openalex.awards.wellcome_raw
USING DELTA
AS
SELECT
    src.*,
    CURRENT_TIMESTAMP() AS databricks_ingested_at
FROM parquet.`s3a://openalex-ingest/awards/wellcome/wellcome_projects.parquet` AS src;

In [None]:
%sql
-- Row-count sanity check on the staging table (expect roughly 20K rows)
SELECT
    COUNT(*) AS total_grants
FROM openalex.awards.wellcome_raw;

In [None]:
%sql
-- Peek at a handful of staged rows to eyeball the values and column names
SELECT
    src.*
FROM openalex.awards.wellcome_raw AS src
LIMIT 5;

In [None]:
%sql
-- List the staging table's columns and types before running the transformation
DESCRIBE TABLE openalex.awards.wellcome_raw;

## Step 2: Create Wellcome Awards Table

In [None]:
%sql
-- Build openalex.awards.wellcome_awards from the staged Wellcome rows.
-- Steps: fetch the funder row -> split applicant names -> shape each grant
-- into the award columns selected at the bottom.
CREATE OR REPLACE TABLE openalex.awards.wellcome_awards
USING DELTA
AS
WITH
-- The funder row, looked up by its explicit funder_id
funder_row AS (
    SELECT
        funder_id,
        display_name,
        ror_id,
        doi
    FROM openalex.common.funder
    WHERE funder_id = 4320311904  -- Wellcome Trust
),

-- Staged rows plus a family/given name split of the lead applicant
applicant_names AS (
    SELECT
        src.*,
        -- The surname is supplied as its own column, so it is used directly
        src.applicant_surname AS parsed_family_name,
        -- The given name is whatever remains of the full name; branches are
        -- tried in order and the first matching one wins
        CASE
            -- "Surname, Given": keep the text after the surname and comma
            WHEN src.lead_applicant_name LIKE CONCAT(src.applicant_surname, ',%')
                THEN TRIM(SUBSTRING(src.lead_applicant_name, LENGTH(src.applicant_surname) + 2))
            -- "Given Surname": keep the text before the trailing surname
            WHEN src.lead_applicant_name LIKE CONCAT('% ', src.applicant_surname)
                THEN TRIM(SUBSTRING(
                    src.lead_applicant_name,
                    1,
                    LENGTH(src.lead_applicant_name) - LENGTH(src.applicant_surname) - 1
                ))
            -- Surname did not line up: fall back to the text after the first comma
            WHEN INSTR(src.lead_applicant_name, ',') > 0
                THEN TRIM(SUBSTRING(src.lead_applicant_name, INSTR(src.lead_applicant_name, ',') + 1))
            -- ...or to the text before the first space
            WHEN INSTR(src.lead_applicant_name, ' ') > 0
                THEN TRIM(SUBSTRING(src.lead_applicant_name, 1, INSTR(src.lead_applicant_name, ' ') - 1))
            ELSE NULL
        END AS parsed_given_name
    FROM openalex.awards.wellcome_raw AS src
),

-- One output row per staged grant that has a non-blank grant_id
shaped_awards AS (
    SELECT
        -- Award id: absolute xxhash64 of "<funder_id>:<lowercased grant_id>", modulo 9000000000
        ABS(XXHASH64(CONCAT(fnd.funder_id, ':', LOWER(g.grant_id)))) % 9000000000 AS id,

        -- Title and description are passed through
        g.title AS display_name,
        g.description AS description,

        -- Funder identifiers
        fnd.funder_id,
        g.grant_id AS funder_award_id,

        -- Amount; currency falls back to GBP when the source value is NULL
        TRY_CAST(g.amount AS DOUBLE) AS amount,
        COALESCE(g.currency, 'GBP') AS currency,

        -- Nested funder record
        NAMED_STRUCT(
            'id', CONCAT('https://openalex.org/F', fnd.funder_id),
            'display_name', fnd.display_name,
            'ror_id', fnd.ror_id,
            'doi', fnd.doi
        ) AS funder,

        -- Funding type from keywords in the grant programme. Arm order matters:
        -- the first matching arm wins, and a NULL programme falls through to 'grant'.
        CASE
            WHEN LOWER(g.grant_programme) LIKE '%phd%'
              OR LOWER(g.grant_programme) LIKE '%studentship%'
              OR LOWER(g.grant_programme) LIKE '%fellowship%'
              OR LOWER(g.grant_programme) LIKE '%scholarship%'
                THEN 'fellowship'
            WHEN LOWER(g.grant_programme) LIKE '%training%'
                THEN 'training'
            WHEN LOWER(g.grant_programme) LIKE '%career%'
                THEN 'fellowship'
            WHEN LOWER(g.grant_programme) LIKE '%investigator%'
              OR LOWER(g.grant_programme) LIKE '%project%'
              OR LOWER(g.grant_programme) LIKE '%strategic%'
              OR LOWER(g.grant_programme) LIKE '%collaborative%'
              OR LOWER(g.grant_programme) LIKE '%seed%'
              OR LOWER(g.grant_programme) LIKE '%discovery%'
                THEN 'research'
            WHEN LOWER(g.grant_programme) LIKE '%equipment%'
              OR LOWER(g.grant_programme) LIKE '%capital%'
              OR LOWER(g.grant_programme) LIKE '%building%'
                THEN 'infrastructure'
            WHEN LOWER(g.grant_programme) LIKE '%open access%'
                THEN 'other'
            WHEN LOWER(g.grant_programme) LIKE '%vacation%'
                THEN 'fellowship'
            ELSE 'grant'
        END AS funding_type,

        -- The programme itself is kept as the funder scheme
        g.grant_programme AS funder_scheme,

        -- Source tag for these rows
        'wellcome_trust' AS provenance,

        -- Dates parsed as yyyy-MM-dd, plus their calendar years
        TRY_TO_DATE(g.start_date, 'yyyy-MM-dd') AS start_date,
        TRY_TO_DATE(g.end_date, 'yyyy-MM-dd') AS end_date,
        YEAR(TRY_TO_DATE(g.start_date, 'yyyy-MM-dd')) AS start_year,
        YEAR(TRY_TO_DATE(g.end_date, 'yyyy-MM-dd')) AS end_year,

        -- Lead investigator: populated only when an applicant name is present
        CASE
            WHEN g.lead_applicant_name IS NOT NULL THEN
                NAMED_STRUCT(
                    'given_name', g.parsed_given_name,
                    'family_name', g.parsed_family_name,
                    'orcid', CAST(NULL AS STRING),
                    'role_start', CAST(NULL AS DATE),
                    'affiliation', NAMED_STRUCT(
                        'name', g.recipient_org_name,
                        'country', g.recipient_orgcountry,
                        'ids', CAST(NULL AS ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>)
                    )
                )
            ELSE NULL
        END AS lead_investigator,

        -- Co-lead and additional investigators are typed NULL placeholders
        CAST(NULL AS STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >) AS co_lead_investigator,

        CAST(NULL AS ARRAY<STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >>) AS investigators,

        -- Landing page: Wellcome grants search URL with the grant id as the query
        CONCAT('https://wellcome.org/grant-funding/people-and-projects/grants-awarded?q=', g.grant_id) AS landing_page_url,

        -- DOI is left NULL
        CAST(NULL AS STRING) AS doi,

        -- Works API URL; repeats the id expression above with a "G" prefix
        CONCAT('https://api.openalex.org/works?filter=awards.id:G', ABS(XXHASH64(CONCAT(fnd.funder_id, ':', LOWER(g.grant_id)))) % 9000000000) AS works_api_url,

        -- Both timestamps are set to the time of this run
        CURRENT_TIMESTAMP() AS created_date,
        CURRENT_TIMESTAMP() AS updated_date

    FROM applicant_names AS g
    CROSS JOIN funder_row AS fnd
    WHERE g.grant_id IS NOT NULL
      AND TRIM(g.grant_id) <> ''
)

SELECT * FROM shaped_awards;

In [None]:
%sql
-- Refresh the Wellcome rows in openalex_awards_raw: delete what this source
-- loaded previously (provenance 'wellcome_trust' at priority 22), then insert
-- the current contents of wellcome_awards at the same priority.
DELETE FROM openalex.awards.openalex_awards_raw
WHERE provenance = 'wellcome_trust' AND priority = 22;

-- The target columns are named explicitly so values bind by name rather than
-- by position; a reordered target table then cannot silently receive
-- misaligned data.
-- NOTE(review): the names below assume the target table uses the same column
-- names as the SELECT aliases -- confirm against the table definition.
INSERT INTO openalex.awards.openalex_awards_raw (
    id,
    display_name,
    description,
    funder_id,
    funder_award_id,
    amount,
    currency,
    funder,
    funding_type,
    funder_scheme,
    provenance,
    start_date,
    end_date,
    start_year,
    end_year,
    lead_investigator,
    co_lead_investigator,
    investigators,
    landing_page_url,
    doi,
    works_api_url,
    created_date,
    updated_date,
    priority
)
SELECT
    id,
    display_name,
    description,
    funder_id,
    funder_award_id,
    amount,
    currency,
    funder,
    funding_type,
    funder_scheme,
    provenance,
    start_date,
    end_date,
    start_year,
    end_year,
    lead_investigator,
    co_lead_investigator,
    investigators,
    landing_page_url,
    doi,
    works_api_url,
    created_date,
    updated_date,
    22 AS priority
FROM openalex.awards.wellcome_awards;

## Verification Queries

In [None]:
%sql
-- Row-count sanity check on the awards table (expect roughly 20K rows)
SELECT
    COUNT(*) AS total_wellcome_awards
FROM openalex.awards.wellcome_awards;

In [None]:
%sql
-- Show ten awards with the nested investigator fields flattened for reading
SELECT
    a.id,
    a.display_name,
    a.funder_award_id,
    a.funder_scheme,
    a.funding_type,
    a.amount,
    a.currency,
    a.start_date,
    a.end_date,
    a.lead_investigator.given_name AS pi_given_name,
    a.lead_investigator.family_name AS pi_family_name,
    a.lead_investigator.affiliation.name AS institution,
    a.lead_investigator.affiliation.country AS country
FROM openalex.awards.wellcome_awards AS a
LIMIT 10;

In [None]:
%sql
-- Awards per funder display name (only Wellcome Trust is expected)
SELECT
    funder.display_name AS display_name,
    COUNT(*) AS cnt
FROM openalex.awards.wellcome_awards
GROUP BY
    funder.display_name
ORDER BY
    cnt DESC;

In [None]:
%sql
-- Awards per derived funding_type, most common first
SELECT
    a.funding_type,
    COUNT(*) AS cnt
FROM openalex.awards.wellcome_awards AS a
GROUP BY
    a.funding_type
ORDER BY
    cnt DESC;

In [None]:
%sql
-- The 20 most common grant programmes (funder_scheme), with summed amounts
-- expressed in billions
SELECT
    a.funder_scheme,
    COUNT(*) AS cnt,
    ROUND(SUM(a.amount) / 1e9, 2) AS total_amount_billions_gbp
FROM openalex.awards.wellcome_awards AS a
WHERE a.funder_scheme IS NOT NULL
GROUP BY
    a.funder_scheme
ORDER BY
    cnt DESC
LIMIT 20;

In [None]:
%sql
-- Check data completeness: non-NULL counts per field, selected percentages,
-- and the summed amount in billions.
-- The percentages divide by NULLIF(COUNT(*), 0) so that an empty table yields
-- NULL percentages next to total = 0, rather than a divide-by-zero error when
-- ANSI mode is enabled.
SELECT
    COUNT(*) AS total,
    COUNT(display_name) AS has_title,
    COUNT(description) AS has_description,
    COUNT(amount) AS has_amount,
    COUNT(start_date) AS has_start_date,
    COUNT(end_date) AS has_end_date,
    COUNT(lead_investigator) AS has_pi,
    COUNT(lead_investigator.family_name) AS has_pi_surname,
    ROUND(COUNT(display_name) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_title,
    ROUND(COUNT(amount) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_amount,
    ROUND(COUNT(lead_investigator) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_pi,
    ROUND(SUM(amount) / 1e9, 2) AS total_amount_billions_gbp
FROM openalex.awards.wellcome_awards;

In [None]:
%sql
-- Awards and summed amounts (in billions) for the 20 most recent start years
SELECT
    a.start_year,
    COUNT(*) AS cnt,
    ROUND(SUM(a.amount) / 1e9, 2) AS total_amount_billions_gbp
FROM openalex.awards.wellcome_awards AS a
WHERE a.start_year IS NOT NULL
GROUP BY
    a.start_year
ORDER BY
    a.start_year DESC
LIMIT 20;

In [None]:
%sql
-- The 25 recipient institutions with the largest summed amounts (in millions),
-- grouped by institution name and country
SELECT
    a.lead_investigator.affiliation.name AS institution,
    a.lead_investigator.affiliation.country AS country,
    COUNT(*) AS grant_count,
    ROUND(SUM(a.amount) / 1e6, 1) AS total_amount_millions_gbp
FROM openalex.awards.wellcome_awards AS a
WHERE a.lead_investigator.affiliation.name IS NOT NULL
GROUP BY
    a.lead_investigator.affiliation.name,
    a.lead_investigator.affiliation.country
ORDER BY
    total_amount_millions_gbp DESC
LIMIT 25;

In [None]:
%sql
-- The 20 affiliation countries with the largest summed amounts (in billions)
SELECT
    a.lead_investigator.affiliation.country AS country,
    COUNT(*) AS grant_count,
    ROUND(SUM(a.amount) / 1e9, 2) AS total_amount_billions_gbp
FROM openalex.awards.wellcome_awards AS a
WHERE a.lead_investigator.affiliation.country IS NOT NULL
GROUP BY
    a.lead_investigator.affiliation.country
ORDER BY
    total_amount_billions_gbp DESC
LIMIT 20;