# Create NIH Awards from NIH ExPORTER

Creates NIH awards from NIH ExPORTER data (FY1985-2024), yielding ~2.28M unique awards.

**Prerequisites:**
- Run `scripts/local/nih_exporter_to_s3.py` to download and upload the data first.

**Data source:** https://reporter.nih.gov/exporter  
**S3 location:** `s3a://openalex-ingest/awards/nih/nih_projects_combined.parquet`

**NIH funder in OpenAlex:**
- funder_id: 4320332161
- display_name: "National Institutes of Health"
- ror_id: "https://ror.org/01cwqze88"
- doi: "10.13039/100000002"

## Step 1: Create Staging Table

In [None]:
%sql
-- Materialize the combined ExPORTER parquet file from S3 as a Delta staging
-- table, stamping every row with the time it was loaded.
CREATE OR REPLACE TABLE openalex.awards.nih_grants_raw
USING DELTA
AS
SELECT
    src.*,
    CURRENT_TIMESTAMP() AS ingested_at
FROM parquet.`s3a://openalex-ingest/awards/nih/nih_projects_combined.parquet` AS src;

In [None]:
%sql
-- Sanity check: the staging table is expected to hold roughly 2.28M rows.
SELECT
    COUNT(*) AS total_grants
FROM openalex.awards.nih_grants_raw;

## Step 2: Create NIH Awards Table

In [None]:
%sql
-- Build the NIH awards table from the raw ExPORTER staging table.
--
-- Pipeline:
--   nih_funder         -> the single NIH funder row from OpenAlex
--   parsed_grants      -> raw rows with a usable project number, plus the
--                         first PI name and the parsed start/end dates
--   keyed_grants       -> parsed rows joined to the funder, with the numeric
--                         award id and funder struct derived exactly once
--   awards_transformed -> final column list, in the order later cells expect
--
-- Values that are reused (award id, parsed dates) are computed in one place so
-- the id column and works_api_url, or start_date and start_year, cannot drift
-- apart when one copy of an expression is edited.
CREATE OR REPLACE TABLE openalex.awards.nih_awards
USING delta
AS
WITH nih_funder AS (
    -- NIH funder from OpenAlex (confirmed funder_id = 4320332161)
    SELECT
        funder_id,
        display_name,
        ror_id,
        doi
    FROM openalex.common.funder
    WHERE funder_id = 4320332161  -- National Institutes of Health
),

-- Parse PI names (format: "LASTNAME, FIRSTNAME (contact); LASTNAME2, FIRSTNAME2")
-- and dates once per raw row.
parsed_grants AS (
    SELECT
        r.*,
        -- First PI (text before any semicolon). NULLIF turns an empty or
        -- whitespace-only result into NULL so no investigator struct with a
        -- blank family name is produced downstream.
        NULLIF(TRIM(split(r.pi_names, ';')[0]), '') AS first_pi_name,
        -- Dates (format varies, try multiple patterns; NULL if none match)
        COALESCE(
            TRY_TO_DATE(r.project_start, 'yyyy-MM-dd'),
            TRY_TO_DATE(r.project_start, 'MM/dd/yyyy'),
            TRY_TO_DATE(r.project_start, 'M/d/yyyy')
        ) AS parsed_start_date,
        COALESCE(
            TRY_TO_DATE(r.project_end, 'yyyy-MM-dd'),
            TRY_TO_DATE(r.project_end, 'MM/dd/yyyy'),
            TRY_TO_DATE(r.project_end, 'M/d/yyyy')
        ) AS parsed_end_date
    FROM openalex.awards.nih_grants_raw r
    -- Rows without a project number cannot be keyed, so they are dropped.
    WHERE r.full_project_num IS NOT NULL
      AND TRIM(r.full_project_num) != ''
),

-- Attach the funder and derive the award id a single time.
keyed_grants AS (
    SELECT
        g.*,
        f.funder_id AS nih_funder_id,
        -- Unique ID: xxhash64 of "<funder_id>:<lowercased project number>",
        -- folded into the range [0, 9000000000).
        abs(xxhash64(CONCAT(f.funder_id, ':', LOWER(g.full_project_num)))) % 9000000000 AS award_numeric_id,
        struct(
            CONCAT('https://openalex.org/F', f.funder_id) AS id,
            f.display_name AS display_name,
            f.ror_id AS ror_id,
            f.doi AS doi
        ) AS funder_struct
    FROM parsed_grants g
    -- nih_funder is filtered to one funder_id; if that row is missing the
    -- result is empty rather than an error.
    CROSS JOIN nih_funder f
),

awards_transformed AS (
    SELECT
        g.award_numeric_id AS id,

        -- Display name = project title
        g.project_title AS display_name,

        -- Description = PHR (Public Health Relevance statement)
        g.phr AS description,

        -- Funder info
        g.nih_funder_id AS funder_id,
        LOWER(g.full_project_num) AS funder_award_id,

        -- Funding amount (total cost for this fiscal year)
        CAST(g.total_cost AS DOUBLE) AS amount,
        'USD' AS currency,

        -- Funder struct
        g.funder_struct AS funder,

        -- Funding type based on the first letter of the activity code
        CASE
            WHEN g.activity LIKE 'R%' THEN 'research'
            WHEN g.activity LIKE 'K%' THEN 'career_development'
            WHEN g.activity LIKE 'T%' THEN 'training'
            WHEN g.activity LIKE 'F%' THEN 'fellowship'
            WHEN g.activity LIKE 'P%' THEN 'program'
            WHEN g.activity LIKE 'U%' THEN 'cooperative_agreement'
            WHEN g.activity LIKE 'N%' THEN 'contract'
            WHEN g.activity LIKE 'Z%' THEN 'intramural'
            ELSE 'grant'
        END AS funding_type,

        -- Funder scheme = activity code (R01, K08, etc)
        g.activity AS funder_scheme,

        -- Provenance
        'nih_exporter' AS provenance,

        -- Dates and their years, all taken from the single parsed value
        g.parsed_start_date AS start_date,
        g.parsed_end_date AS end_date,
        YEAR(g.parsed_start_date) AS start_year,
        YEAR(g.parsed_end_date) AS end_year,

        -- Lead investigator (parse "LASTNAME, FIRSTNAME" or "LASTNAME, FIRSTNAME (contact)")
        CASE
            WHEN g.first_pi_name IS NOT NULL THEN
                struct(
                    -- Given name: text after the first comma with any
                    -- parenthetical such as "(contact)" removed; NULL when
                    -- nothing is left.
                    CASE
                        WHEN CONTAINS(g.first_pi_name, ',')
                        THEN NULLIF(TRIM(REGEXP_REPLACE(split(g.first_pi_name, ',')[1], '\\s*\\(.*\\)\\s*', '')), '')
                        ELSE NULL
                    END AS given_name,
                    -- Family name: text before the first comma, or the whole
                    -- name when there is no comma.
                    CASE
                        WHEN CONTAINS(g.first_pi_name, ',')
                        THEN TRIM(split(g.first_pi_name, ',')[0])
                        ELSE TRIM(g.first_pi_name)
                    END AS family_name,
                    CAST(NULL AS STRING) AS orcid,
                    CAST(NULL AS DATE) AS role_start,
                    struct(
                        g.org_name AS name,
                        g.org_country AS country,
                        CAST(NULL AS ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>) AS ids
                    ) AS affiliation
                )
            ELSE NULL
        END AS lead_investigator,

        -- Co-lead and other investigators (NULL for now, could be enriched from pi_names)
        CAST(NULL AS STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >) AS co_lead_investigator,

        CAST(NULL AS ARRAY<STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >>) AS investigators,

        -- Landing page URL (NIH RePORTER project details page)
        CONCAT('https://reporter.nih.gov/project-details/', g.application_id) AS landing_page_url,

        -- No DOI for NIH grants typically
        CAST(NULL AS STRING) AS doi,

        -- Works API URL, built from the same numeric id as the id column
        concat('https://api.openalex.org/works?filter=awards.id:G', g.award_numeric_id) AS works_api_url,

        -- Timestamps
        current_timestamp() AS created_date,
        current_timestamp() AS updated_date

    FROM keyed_grants g
)

SELECT * FROM awards_transformed;

In [None]:
%sql
-- Append the NIH awards to the shared openalex_awards table.
-- No target column list is given, so values are matched to the target by
-- position; the order below must follow the target table's column order.
-- funded_outputs starts as an empty array with a count of 0.
-- NOTE(review): this is a plain append - re-running the cell would presumably
-- insert the same awards again; confirm whether the target is cleared first.
INSERT INTO openalex.awards.openalex_awards
SELECT
    nih.id,
    nih.display_name,
    nih.description,
    nih.funder_id,
    nih.funder_award_id,
    nih.amount,
    nih.currency,
    nih.funder,
    nih.funding_type,
    nih.funder_scheme,
    nih.provenance,
    nih.start_date,
    nih.end_date,
    nih.start_year,
    nih.end_year,
    nih.lead_investigator,
    nih.co_lead_investigator,
    nih.investigators,
    nih.landing_page_url,
    nih.doi,
    nih.works_api_url,
    nih.created_date,
    nih.updated_date,
    ARRAY() AS funded_outputs,
    0 AS funded_outputs_count
FROM openalex.awards.nih_awards AS nih;

## Verification Queries

In [None]:
%sql
-- Sanity check: the awards table is expected to hold roughly 2.28M rows.
SELECT
    COUNT(*) AS total_nih_awards
FROM openalex.awards.nih_awards;

In [None]:
%sql
-- Eyeball ten rows of the transformed output (no ordering, so the rows
-- returned are arbitrary).
SELECT *
FROM openalex.awards.nih_awards
LIMIT 10;

In [None]:
%sql
-- Number of awards per funding_type, most frequent first.
SELECT
    funding_type,
    COUNT(*) AS cnt
FROM openalex.awards.nih_awards
GROUP BY
    funding_type
ORDER BY
    COUNT(*) DESC;

In [None]:
%sql
-- The 20 most frequent funder_scheme values (NIH activity codes) with their
-- award counts.
SELECT
    funder_scheme,
    COUNT(*) AS cnt
FROM openalex.awards.nih_awards
GROUP BY
    funder_scheme
ORDER BY
    COUNT(*) DESC
LIMIT 20;

In [None]:
%sql
-- Check date parsing success rate for both start and end dates.
-- COUNT(col) counts only non-NULL values, i.e. rows where one of the date
-- patterns matched. NULLIF guards the divisor so an empty table yields NULL
-- percentages instead of a divide-by-zero error when ANSI mode is enabled.
SELECT
    COUNT(*) AS total,
    COUNT(start_date) AS has_start_date,
    COUNT(end_date) AS has_end_date,
    ROUND(COUNT(start_date) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_with_start_date,
    ROUND(COUNT(end_date) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_with_end_date
FROM openalex.awards.nih_awards;