# Create NHMRC Awards

Creates awards from the Australian National Health and Medical Research Council (NHMRC) grant outcomes data.

**Prerequisites:**
- Run `scripts/local/nhmrc_to_s3.py` first to download the NHMRC grant outcomes data and upload it to S3.

**Data source:** https://www.nhmrc.gov.au/funding/data-research/outcomes  
**S3 location:** `s3a://openalex-ingest/awards/nhmrc/nhmrc_projects.parquet`

**NHMRC funder:**
- funder_id: 4320334705
- display_name: "National Health and Medical Research Council"
- ROR: https://ror.org/011kf5r70
- DOI: 10.13039/501100000925

**Notes:**
- NHMRC grants include Chief Investigator A (CIA) as the lead PI
- Grants are administered through Australian universities and research institutes
- Currency is Australian Dollars (AUD)
- Grant types include Investigator Grants, Ideas Grants, Synergy Grants, etc.

## Step 1: Create Staging Table from S3

In [None]:
%sql
-- Staging table: every column of the NHMRC parquet file on S3, plus the load timestamp
CREATE OR REPLACE TABLE openalex.awards.nhmrc_raw
USING DELTA AS
WITH source_rows AS (
    SELECT *
    FROM parquet.`s3a://openalex-ingest/awards/nhmrc/nhmrc_projects.parquet`
)
SELECT
    source_rows.*,
    CURRENT_TIMESTAMP() AS databricks_ingested_at
FROM source_rows;

In [None]:
%sql
-- How many grant rows were staged
SELECT
    COUNT(*) AS total_grants
FROM openalex.awards.nhmrc_raw;

In [None]:
%sql
-- Preview a handful of staged rows to eyeball column names and values
SELECT
    *
FROM openalex.awards.nhmrc_raw
LIMIT 5;

In [None]:
%sql
-- List the staging table's columns and their types
DESCRIBE TABLE openalex.awards.nhmrc_raw;

## Step 2: Create NHMRC Awards Table

In [None]:
%sql
-- Builds openalex.awards.nhmrc_awards: one row per staged NHMRC grant that has a usable
-- grant_id or app_id, mapped to the column set consumed by the insert into openalex_awards_raw.
CREATE OR REPLACE TABLE openalex.awards.nhmrc_awards
USING delta
AS
WITH
-- Get NHMRC funder from OpenAlex by explicit funder_id.
-- If this lookup returns no row, the CROSS JOIN below produces an empty awards table.
nhmrc_funder AS (
    SELECT
        funder_id,
        display_name,
        ror_id,
        doi
    FROM openalex.common.funder
    WHERE funder_id = 4320334705  -- National Health and Medical Research Council
),

-- Values that are needed more than once downstream, computed a single time per staged row
raw_prepared AS (
    SELECT
        r.*,
        -- CIA name with surrounding whitespace and one leading honorific removed
        TRIM(REGEXP_REPLACE(TRIM(r.cia_name), '^(Dr|Prof|Professor|Associate Professor|A/Prof|Assoc Prof|Mr|Ms|Mrs)\\s+', '')) AS cia_name_untitled,
        -- Award key: grant_id, else app_id. A blank value is treated like NULL so that a blank
        -- grant_id no longer hides a usable app_id. The chosen value itself is left untouched,
        -- so keys (and the IDs hashed from them) are unchanged for rows that were already kept.
        COALESCE(
            CASE WHEN TRIM(r.grant_id) != '' THEN r.grant_id END,
            CASE WHEN TRIM(r.app_id) != '' THEN r.app_id END
        ) AS oa_award_key,
        -- Lower-cased grant type (sub type when type is NULL) used for funding_type matching
        LOWER(COALESCE(r.grant_type, r.grant_sub_type)) AS oa_grant_type_key
    FROM openalex.awards.nhmrc_raw AS r
),

-- Attach the funder, generate the award ID, and parse the CIA name into given/family names.
-- NHMRC format is typically "Family, Given" or "Dr Given Family".
awards_keyed AS (
    SELECT
        p.*,
        f.funder_id AS oa_funder_id,
        struct(
            CONCAT('https://openalex.org/F', f.funder_id) AS id,
            f.display_name,
            f.ror_id,
            f.doi
        ) AS oa_funder,
        -- Unique ID: xxhash64 of "funder_id:lower(award key)". Expression kept as it was so that
        -- the same key always hashes to the same ID.
        abs(xxhash64(CONCAT(f.funder_id, ':', LOWER(p.oa_award_key)))) % 9000000000 AS oa_award_id,
        CASE
            WHEN p.cia_name LIKE '%,%' THEN TRIM(SUBSTRING_INDEX(p.cia_name, ',', -1))
            WHEN p.cia_name_untitled LIKE '% %' THEN TRIM(SUBSTRING_INDEX(p.cia_name_untitled, ' ', 1))
            -- Single token (possibly after removing a title): there is no given name to report
            ELSE NULL
        END AS cia_given_name,
        CASE
            WHEN p.cia_name LIKE '%,%' THEN TRIM(SUBSTRING_INDEX(p.cia_name, ',', 1))
            WHEN p.cia_name_untitled LIKE '% %' THEN TRIM(SUBSTRING_INDEX(p.cia_name_untitled, ' ', -1))
            -- "Title Family" or padded single name: the cleaned token is the family name
            WHEN p.cia_name LIKE '% %' THEN p.cia_name_untitled
            ELSE p.cia_name
        END AS cia_family_name
    FROM raw_prepared AS p
    CROSS JOIN nhmrc_funder AS f
    -- Rows without any usable identifier cannot be keyed and are dropped
    WHERE p.oa_award_key IS NOT NULL
),

awards_transformed AS (
    SELECT
        k.oa_award_id AS id,

        -- Display name = grant title
        COALESCE(k.grant_title, k.simplified_title) AS display_name,

        -- Description from plain_description field
        k.plain_description AS description,

        -- Funder info
        k.oa_funder_id AS funder_id,
        k.oa_award_key AS funder_award_id,

        -- Amount in AUD - TRY_CAST yields NULL instead of failing on non-numeric values
        TRY_CAST(k.total AS DOUBLE) AS amount,
        'AUD' AS currency,

        -- Funder struct
        k.oa_funder AS funder,

        -- Funding type - map NHMRC grant types; the first matching pattern wins
        CASE
            WHEN k.oa_grant_type_key LIKE '%investigator%' THEN 'research'
            WHEN k.oa_grant_type_key LIKE '%ideas%' THEN 'research'
            WHEN k.oa_grant_type_key LIKE '%synergy%' THEN 'research'
            WHEN k.oa_grant_type_key LIKE '%fellowship%' THEN 'fellowship'
            WHEN k.oa_grant_type_key LIKE '%scholarship%' THEN 'fellowship'
            WHEN k.oa_grant_type_key LIKE '%postgraduate%' THEN 'fellowship'
            WHEN k.oa_grant_type_key LIKE '%clinical trial%' THEN 'research'
            WHEN k.oa_grant_type_key LIKE '%cohort%' THEN 'research'
            WHEN k.oa_grant_type_key LIKE '%project%' THEN 'research'
            WHEN k.oa_grant_type_key LIKE '%program%' THEN 'research'
            WHEN k.oa_grant_type_key LIKE '%centre%' THEN 'infrastructure'
            WHEN k.oa_grant_type_key LIKE '%equipment%' THEN 'infrastructure'
            WHEN k.oa_grant_type_key LIKE '%development%' THEN 'research'
            ELSE 'grant'
        END AS funding_type,

        -- Funder scheme = grant type + sub type.
        -- CONCAT_WS returns '' when both parts are NULL; NULLIF turns that into NULL so
        -- "funder_scheme IS NOT NULL" filters behave as expected.
        NULLIF(CONCAT_WS(' - ', k.grant_type, k.grant_sub_type), '') AS funder_scheme,

        -- Provenance
        'nhmrc' AS provenance,

        -- Dates - use start_date/end_date if parseable, else derive from start_year/end_year.
        -- The year is cast to INT before being rendered, so a value such as 2020.0 becomes
        -- '2020-01-01', and TRY_TO_DATE returns NULL rather than raising on anything unparseable.
        COALESCE(
            TRY_TO_DATE(k.start_date, 'yyyy-MM-dd HH:mm:ss'),
            TRY_TO_DATE(k.start_date, 'yyyy-MM-dd'),
            TRY_TO_DATE(CONCAT(CAST(TRY_CAST(k.start_year AS INT) AS STRING), '-01-01'), 'yyyy-MM-dd')
        ) AS start_date,
        COALESCE(
            TRY_TO_DATE(k.end_date, 'yyyy-MM-dd HH:mm:ss'),
            TRY_TO_DATE(k.end_date, 'yyyy-MM-dd'),
            TRY_TO_DATE(CONCAT(CAST(TRY_CAST(k.end_year AS INT) AS STRING), '-12-31'), 'yyyy-MM-dd')
        ) AS end_date,
        -- Years - use start_year/end_year if numeric, else take the year of the parsed date
        COALESCE(
            TRY_CAST(k.start_year AS INT),
            YEAR(TRY_TO_DATE(k.start_date, 'yyyy-MM-dd HH:mm:ss')),
            YEAR(TRY_TO_DATE(k.start_date, 'yyyy-MM-dd'))
        ) AS start_year,
        COALESCE(
            TRY_CAST(k.end_year AS INT),
            YEAR(TRY_TO_DATE(k.end_date, 'yyyy-MM-dd HH:mm:ss')),
            YEAR(TRY_TO_DATE(k.end_date, 'yyyy-MM-dd'))
        ) AS end_year,

        -- Lead investigator (CIA) - now includes ORCID from 2025 data.
        -- NULL when the CIA name is missing or blank.
        CASE
            WHEN k.cia_name IS NOT NULL AND TRIM(k.cia_name) != '' THEN
                struct(
                    k.cia_given_name AS given_name,
                    k.cia_family_name AS family_name,
                    k.cia_orcid AS orcid,
                    CAST(NULL AS DATE) AS role_start,
                    struct(
                        k.administering_institution AS name,
                        'Australia' AS country,
                        -- Include ROR ID if available (from 2025 data)
                        CASE
                            WHEN k.administering_institution_ror_id IS NOT NULL
                            THEN ARRAY(
                                struct(
                                    k.administering_institution_ror_id AS id,
                                    'ror' AS type,
                                    'nhmrc' AS asserted_by
                                )
                            )
                            ELSE CAST(NULL AS ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>)
                        END AS ids
                    ) AS affiliation
                )
            ELSE NULL
        END AS lead_investigator,

        -- Co-lead and other investigators (not available in public data): typed NULLs
        CAST(NULL AS STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >) AS co_lead_investigator,

        CAST(NULL AS ARRAY<STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >>) AS investigators,

        -- Landing page URL
        CAST(NULL AS STRING) AS landing_page_url,

        -- No DOI for NHMRC grants
        CAST(NULL AS STRING) AS doi,

        -- Works API URL, built from the same award ID as the id column
        CONCAT('https://api.openalex.org/works?filter=awards.id:G', k.oa_award_id) AS works_api_url,

        -- Timestamps
        current_timestamp() AS created_date,
        current_timestamp() AS updated_date

    FROM awards_keyed AS k
)

SELECT * FROM awards_transformed;

In [None]:
%sql
-- Load the NHMRC awards into openalex_awards_raw with priority 12.
-- REPLACE WHERE deletes the rows previously loaded with provenance = 'nhmrc' before inserting,
-- so re-running this cell refreshes the NHMRC rows instead of appending a second copy.
-- Values are matched to the target's columns by position, so the select order must not change.
-- NOTE(review): assumes openalex_awards_raw is a Delta table with a column named `provenance` - confirm.
INSERT INTO openalex.awards.openalex_awards_raw
REPLACE WHERE provenance = 'nhmrc'
SELECT
    id,
    display_name,
    description,
    funder_id,
    funder_award_id,
    amount,
    currency,
    funder,
    funding_type,
    funder_scheme,
    provenance,
    start_date,
    end_date,
    start_year,
    end_year,
    lead_investigator,
    co_lead_investigator,
    investigators,
    landing_page_url,
    doi,
    works_api_url,
    created_date,
    updated_date,
    12 AS priority
FROM openalex.awards.nhmrc_awards;

## Verification Queries

In [None]:
%sql
-- How many awards were produced
SELECT
    COUNT(*) AS total_nhmrc_awards
FROM openalex.awards.nhmrc_awards;

In [None]:
%sql
-- Preview ten awards, flattening the lead investigator's name and institution
SELECT
    a.id,
    a.display_name,
    a.funder_award_id,
    a.funder_scheme,
    a.funding_type,
    a.amount,
    a.currency,
    a.start_date,
    a.end_date,
    a.lead_investigator.given_name       AS pi_given,
    a.lead_investigator.family_name      AS pi_family,
    a.lead_investigator.affiliation.name AS institution
FROM openalex.awards.nhmrc_awards AS a
LIMIT 10;

In [None]:
%sql
-- Awards per funder display name (a single NHMRC row is expected)
SELECT
    funder.display_name,
    COUNT(*) AS cnt
FROM openalex.awards.nhmrc_awards
GROUP BY
    funder.display_name
ORDER BY
    cnt DESC;

In [None]:
%sql
-- Awards per mapped funding_type, most common first
SELECT
    funding_type,
    COUNT(*) AS cnt
FROM openalex.awards.nhmrc_awards
GROUP BY
    funding_type
ORDER BY
    cnt DESC;

In [None]:
%sql
-- The 20 most frequent funder schemes (grant types) with their summed amount in billions of AUD
SELECT
    funder_scheme,
    COUNT(*)                    AS cnt,
    ROUND(SUM(amount) / 1e9, 2) AS total_amount_billions_aud
FROM openalex.awards.nhmrc_awards
WHERE funder_scheme IS NOT NULL
GROUP BY
    funder_scheme
ORDER BY
    cnt DESC
LIMIT 20;

In [None]:
%sql
-- Check data completeness: non-NULL counts per key field, the same as a percentage of all
-- rows, and the summed amount in billions of AUD.
-- NULLIF(COUNT(*), 0) makes the percentages NULL when the table is empty, so the check
-- reports total = 0 instead of failing with a division-by-zero error.
SELECT
    COUNT(*) AS total,
    COUNT(display_name) AS has_title,
    COUNT(amount) AS has_amount,
    COUNT(start_date) AS has_start_date,
    COUNT(end_date) AS has_end_date,
    COUNT(lead_investigator) AS has_pi,
    ROUND(COUNT(display_name) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_title,
    ROUND(COUNT(amount) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_amount,
    ROUND(COUNT(lead_investigator) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_pi,
    ROUND(SUM(amount) / 1e9, 2) AS total_amount_billions_aud
FROM openalex.awards.nhmrc_awards;

In [None]:
%sql
-- Award count and summed amount (billions of AUD) for the 20 most recent start years
SELECT
    start_year,
    COUNT(*)                    AS cnt,
    ROUND(SUM(amount) / 1e9, 2) AS total_amount_billions_aud
FROM openalex.awards.nhmrc_awards
WHERE start_year IS NOT NULL
GROUP BY
    start_year
ORDER BY
    start_year DESC
LIMIT 20;

In [None]:
%sql
-- The 25 administering institutions with the largest summed award amount (millions of AUD)
SELECT
    lead_investigator.affiliation.name AS institution,
    COUNT(*)                           AS grant_count,
    ROUND(SUM(amount) / 1e6, 1)        AS total_amount_millions_aud
FROM openalex.awards.nhmrc_awards
WHERE lead_investigator.affiliation.name IS NOT NULL
GROUP BY
    lead_investigator.affiliation.name
ORDER BY
    total_amount_millions_aud DESC
LIMIT 25;