# Create NSERC Awards from Canada Open Data

Creates NSERC (Natural Sciences and Engineering Research Council of Canada) awards from
Canada's Open Government Portal data (FY1991-2024).

**Prerequisites:**
- Run `scripts/local/nserc_to_s3.py` first to download the data and upload it to S3.

**Data source:** https://open.canada.ca/data/en/dataset/c1b0f627-8c29-427c-ab73-33968ad9176e  
**S3 location:** `s3a://openalex-ingest/awards/nserc/`

**NSERC funder in OpenAlex:**
- funder_id: 4320334593
- display_name: "Natural Sciences and Engineering Research Council of Canada"
- ror_id: "https://ror.org/01h531d29"
- doi: "10.13039/501100000038"

## Step 1: Create Staging Tables

In [None]:
%sql
-- Stage the NSERC awards parquet file from S3 as a Delta table.
-- Every source column is carried over unchanged; ingested_at records the load time.
CREATE OR REPLACE TABLE openalex.awards.nserc_awards_raw
USING DELTA
AS
SELECT
    src.*,
    CURRENT_TIMESTAMP() AS ingested_at
FROM parquet.`s3a://openalex-ingest/awards/nserc/nserc_awards.parquet` AS src;

In [None]:
%sql
-- Stage the NSERC co-applicants parquet file from S3 as a Delta table.
-- Every source column is carried over unchanged; ingested_at records the load time.
CREATE OR REPLACE TABLE openalex.awards.nserc_coapplicants_raw
USING DELTA
AS
SELECT
    src.*,
    CURRENT_TIMESTAMP() AS ingested_at
FROM parquet.`s3a://openalex-ingest/awards/nserc/nserc_coapplicants.parquet` AS src;

In [None]:
%sql
-- Stage the NSERC partners parquet file from S3 as a Delta table.
-- Every source column is carried over unchanged; ingested_at records the load time.
CREATE OR REPLACE TABLE openalex.awards.nserc_partners_raw
USING DELTA
AS
SELECT
    src.*,
    CURRENT_TIMESTAMP() AS ingested_at
FROM parquet.`s3a://openalex-ingest/awards/nserc/nserc_partners.parquet` AS src;

In [None]:
%sql
-- Row count of each staging table, one result row per table
SELECT
    'awards' AS table_name,
    COUNT(*) AS row_count
FROM openalex.awards.nserc_awards_raw

UNION ALL

SELECT
    'coapplicants' AS table_name,
    COUNT(*) AS row_count
FROM openalex.awards.nserc_coapplicants_raw

UNION ALL

SELECT
    'partners' AS table_name,
    COUNT(*) AS row_count
FROM openalex.awards.nserc_partners_raw;

In [None]:
%sql
-- Eyeball a handful of staged award rows (all columns)
SELECT awards_raw.*
FROM openalex.awards.nserc_awards_raw AS awards_raw
LIMIT 5;

In [None]:
%sql
-- Eyeball a handful of staged co-applicant rows (all columns).
-- Column names here differ from the awards table, e.g.
-- coapplicantname_nomcoapplicant, coappinstitution_etablissement, countryen.
SELECT coapplicants_raw.*
FROM openalex.awards.nserc_coapplicants_raw AS coapplicants_raw
LIMIT 5;

In [None]:
%sql
-- Build openalex.awards.nserc_awards from the NSERC staging tables.
--
-- One output row is produced per staged award row that has a usable applicationid
-- (not NULL, not blank, not 'NA'). Co-applicants are aggregated per application and
-- attached as the `investigators` array. The award key used throughout is
-- LOWER(TRIM(applicationid)), so the id hash, funder_award_id and the co-applicant
-- join all agree on the same normalized value.
CREATE OR REPLACE TABLE openalex.awards.nserc_awards
USING delta
AS
WITH nserc_funder AS (
    -- NSERC funder from OpenAlex.
    -- NOTE: this CTE is CROSS JOINed below, so if the lookup returns no row the
    -- resulting awards table is empty.
    SELECT
        funder_id,
        display_name,
        ror_id,
        doi
    FROM openalex.common.funder
    WHERE funder_id = 4320334593  -- Natural Sciences and Engineering Research Council of Canada
),

-- Aggregate co-applicants per award
-- Co-applicants columns: applicationid, coapplicantname_nomcoapplicant, coapporganizationid,
--                        coappinstitution_etablissement, provinceen, provincefr, countryen, countryfr
coapplicants_agg AS (
    SELECT
        -- Normalize the key the same way the awards side does, so padded or
        -- differently-cased ids still match their award.
        LOWER(TRIM(applicationid)) AS award_key,
        COLLECT_LIST(
            struct(
                -- Parse name (format: "LastName, FirstName")
                CASE
                    WHEN CONTAINS(coapplicantname_nomcoapplicant, ',')
                    THEN TRIM(split(coapplicantname_nomcoapplicant, ',')[1])
                    ELSE NULL
                END as given_name,
                CASE
                    WHEN CONTAINS(coapplicantname_nomcoapplicant, ',')
                    THEN TRIM(split(coapplicantname_nomcoapplicant, ',')[0])
                    ELSE TRIM(coapplicantname_nomcoapplicant)
                END as family_name,
                CAST(NULL AS STRING) as orcid,
                CAST(NULL AS DATE) as role_start,
                struct(
                    coappinstitution_etablissement as name,
                    countryen as country,
                    CAST(NULL AS ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>) as ids
                ) as affiliation
            )
        ) as investigators
    FROM openalex.awards.nserc_coapplicants_raw
    WHERE applicationid IS NOT NULL
    GROUP BY LOWER(TRIM(applicationid))
),

awards_transformed AS (
    SELECT
        -- Generate unique ID using xxhash64 of funder_id:applicationid
        abs(xxhash64(CONCAT(f.funder_id, ':', LOWER(TRIM(g.applicationid))))) % 9000000000 as id,

        -- Display name = project title
        g.applicationtitle as display_name,

        -- Description = summary
        g.applicationsummary as description,

        -- Funder info
        f.funder_id,
        LOWER(TRIM(g.applicationid)) as funder_award_id,

        -- Funding amount: strip everything except digits and '.', then convert.
        -- TRY_CAST yields NULL (instead of failing the whole build under ANSI mode)
        -- when nothing numeric is left.
        TRY_CAST(REGEXP_REPLACE(g.awardamount, '[^0-9.]', '') AS DOUBLE) as amount,
        'CAD' as currency,

        -- Funder struct
        struct(
            CONCAT('https://openalex.org/F', f.funder_id) as id,
            f.display_name,
            f.ror_id,
            f.doi
        ) as funder,

        -- Funding type based on group (first matching pattern wins)
        CASE
            WHEN LOWER(g.groupen) LIKE '%discovery%' THEN 'research'
            WHEN LOWER(g.groupen) LIKE '%fellowship%' THEN 'fellowship'
            WHEN LOWER(g.groupen) LIKE '%scholarship%' THEN 'fellowship'
            WHEN LOWER(g.groupen) LIKE '%research partnership%' THEN 'research'
            WHEN LOWER(g.groupen) LIKE '%training%' THEN 'training'
            WHEN LOWER(g.groupen) LIKE '%equipment%' THEN 'equipment'
            WHEN LOWER(g.groupen) LIKE '%supplement%' THEN 'supplement'
            ELSE 'grant'
        END as funding_type,

        -- Funder scheme = program name (note: source has typo "ProgramNane")
        COALESCE(g.programnameen, g.programnaneen) as funder_scheme,

        -- Provenance
        'nserc_open_data' as provenance,

        -- Dates - NSERC only provides fiscal year, so use April 1 start
        -- Canadian fiscal year runs April 1 to March 31
        CASE
            WHEN g.`fiscalyear_exercice_financier` IS NOT NULL
            THEN TRY_TO_DATE(CONCAT(g.`fiscalyear_exercice_financier`, '-04-01'))
            ELSE NULL
        END as start_date,
        CASE
            WHEN g.`fiscalyear_exercice_financier` IS NOT NULL
            THEN TRY_TO_DATE(CONCAT(CAST(TRY_CAST(g.`fiscalyear_exercice_financier` AS INT) + 1 AS STRING), '-03-31'))
            ELSE NULL
        END as end_date,
        TRY_CAST(g.`fiscalyear_exercice_financier` AS INT) as start_year,
        -- The fiscal year ends on March 31 of the following calendar year,
        -- so end_year is kept consistent with end_date.
        TRY_CAST(g.`fiscalyear_exercice_financier` AS INT) + 1 as end_year,

        -- Lead investigator (parse "LastName, FirstName" format)
        CASE
            WHEN g.name_nom IS NOT NULL AND TRIM(g.name_nom) != '' THEN
                struct(
                    -- Extract given name (after comma)
                    CASE
                        WHEN CONTAINS(g.name_nom, ',')
                        THEN TRIM(split(g.name_nom, ',')[1])
                        ELSE NULL
                    END as given_name,
                    -- Extract family name (before comma)
                    CASE
                        WHEN CONTAINS(g.name_nom, ',')
                        THEN TRIM(split(g.name_nom, ',')[0])
                        ELSE TRIM(g.name_nom)
                    END as family_name,
                    CAST(NULL AS STRING) as orcid,
                    CAST(NULL AS DATE) as role_start,
                    struct(
                        g.institution_etablissement as name,
                        g.countryen as country,
                        CAST(NULL AS ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>) as ids
                    ) as affiliation
                )
            ELSE NULL
        END as lead_investigator,

        -- Co-lead investigator (NULL for NSERC)
        CAST(NULL AS STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >) as co_lead_investigator,

        -- Investigators from co-applicants (NULL when the award has none)
        c.investigators,

        -- Landing page URL (NSERC Awards Database detail page).
        -- The id is trimmed so stray whitespace never ends up inside the URL.
        CONCAT('https://www.nserc-crsng.gc.ca/ase-oro/Details-Detailles_eng.asp?id=', TRIM(g.applicationid)) as landing_page_url,

        -- No DOI for NSERC grants
        CAST(NULL AS STRING) as doi,

        -- Timestamps
        current_timestamp() as created_date,
        current_timestamp() as updated_date

    FROM openalex.awards.nserc_awards_raw g
    CROSS JOIN nserc_funder f
    LEFT JOIN coapplicants_agg c ON LOWER(TRIM(g.applicationid)) = c.award_key
    WHERE g.applicationid IS NOT NULL
      AND TRIM(g.applicationid) != ''
      AND TRIM(g.applicationid) != 'NA'
)

-- Explicit column list pins the table schema and column order.
-- works_api_url is derived from the already-computed id rather than re-hashing.
SELECT
    id,
    display_name,
    description,
    funder_id,
    funder_award_id,
    amount,
    currency,
    funder,
    funding_type,
    funder_scheme,
    provenance,
    start_date,
    end_date,
    start_year,
    end_year,
    lead_investigator,
    co_lead_investigator,
    investigators,
    landing_page_url,
    doi,
    concat('https://api.openalex.org/works?filter=awards.id:G', id) as works_api_url,
    created_date,
    updated_date
FROM awards_transformed;

## Verification Queries

In [None]:
%sql
-- How many NSERC awards were written to the final table
SELECT
    COUNT(*) AS total_nserc_awards
FROM openalex.awards.nserc_awards;

In [None]:
%sql
-- Look at a few finished award rows (all columns)
SELECT awards.*
FROM openalex.awards.nserc_awards AS awards
LIMIT 10;

In [None]:
%sql
-- Number of awards per derived funding_type, most common first
SELECT
    awards.funding_type,
    COUNT(*) AS cnt
FROM openalex.awards.nserc_awards AS awards
GROUP BY awards.funding_type
ORDER BY cnt DESC;

In [None]:
%sql
-- The 20 most frequent funder_scheme (program) values
SELECT
    awards.funder_scheme,
    COUNT(*) AS cnt
FROM openalex.awards.nserc_awards AS awards
GROUP BY awards.funder_scheme
ORDER BY cnt DESC
LIMIT 20;

In [None]:
%sql
-- Check data completeness: how many awards have each key field populated,
-- plus the share (in percent, one decimal) for title, description and dates.
-- Counts are computed once; NULLIF guards the divisions so an empty table
-- yields NULL percentages instead of a divide-by-zero error under ANSI mode.
WITH field_counts AS (
    SELECT
        COUNT(*) as total,
        COUNT(display_name) as has_title,
        COUNT(description) as has_description,
        COUNT(amount) as has_amount,
        COUNT(start_date) as has_start_date,
        COUNT(lead_investigator) as has_pi
    FROM openalex.awards.nserc_awards
)
SELECT
    total,
    has_title,
    has_description,
    has_amount,
    has_start_date,
    has_pi,
    ROUND(has_title * 100.0 / NULLIF(total, 0), 1) as pct_title,
    ROUND(has_description * 100.0 / NULLIF(total, 0), 1) as pct_description,
    ROUND(has_start_date * 100.0 / NULLIF(total, 0), 1) as pct_dates
FROM field_counts;

In [None]:
%sql
-- Award counts for the 20 most recent start years (rows without a year are skipped)
SELECT
    awards.start_year,
    COUNT(*) AS cnt
FROM openalex.awards.nserc_awards AS awards
WHERE awards.start_year IS NOT NULL
GROUP BY awards.start_year
ORDER BY awards.start_year DESC
LIMIT 20;

In [None]:
%sql
-- The 20 lead-investigator institutions with the most awards
WITH award_institutions AS (
    -- One row per award that has a lead-investigator institution name
    SELECT
        awards.lead_investigator.affiliation.name AS institution
    FROM openalex.awards.nserc_awards AS awards
    WHERE awards.lead_investigator.affiliation.name IS NOT NULL
)
SELECT
    institution,
    COUNT(*) AS cnt
FROM award_institutions
GROUP BY institution
ORDER BY cnt DESC
LIMIT 20;

In [None]:
%sql
-- Check awards with co-applicants: total awards, how many carry a non-empty
-- investigators array, and that share in percent (one decimal).
-- The conditional count is computed once; NULLIF guards the division so an
-- empty table yields a NULL percentage instead of a divide-by-zero error
-- under ANSI mode.
WITH coverage AS (
    SELECT
        COUNT(*) as total,
        COUNT(CASE WHEN investigators IS NOT NULL AND SIZE(investigators) > 0 THEN 1 END) as has_coinvestigators
    FROM openalex.awards.nserc_awards
)
SELECT
    total,
    has_coinvestigators,
    ROUND(has_coinvestigators * 100.0 / NULLIF(total, 0), 1) as pct_with_coinvestigators
FROM coverage;

In [None]:
%sql
-- Headline numbers for the final table: volume, id uniqueness,
-- year range, and total / mean / median funding in CAD.
-- total_awards vs unique_award_ids shows whether funder_award_id repeats.
SELECT
    COUNT(*)                              AS total_awards,
    COUNT(DISTINCT awards.funder_award_id) AS unique_award_ids,
    MIN(awards.start_year)                AS earliest_year,
    MAX(awards.start_year)                AS latest_year,
    SUM(awards.amount)                    AS total_funding_cad,
    AVG(awards.amount)                    AS avg_award_amount,
    PERCENTILE(awards.amount, 0.5)        AS median_award_amount
FROM openalex.awards.nserc_awards AS awards;