# Create NWO Awards from NWOpen API

Creates awards for NWO (the Dutch Research Council) from NWOpen API data. The source contains roughly 14.6K projects.

**Prerequisites:**
- Run `scripts/local/nwo_to_s3.py` to download and upload the data first.

**Data source:** https://nwopen-api.nwo.nl/NWOpen-API/api/Projects  
**API docs:** https://data.nwo.nl/en/how-to-use-the-nwopen-api  
**S3 location:** `s3a://openalex-ingest/awards/nwo/nwo_projects.parquet`

**NWO funder:**
- ROR: https://ror.org/04jsz6e67
- DOI: 10.13039/501100003246
- display_name: "Dutch Research Council" (or "Nederlandse Organisatie voor Wetenschappelijk Onderzoek")

**NWO Departments:**
- Sociale en Geesteswetenschappen (Social Sciences and Humanities)
- Exacte en Natuurwetenschappen (Science)
- Toegepaste en Technische Wetenschappen (Applied and Engineering Sciences)
- NWO-breed (NWO-wide)

## Step 1: Create Staging Table from S3

In [None]:
%sql
-- Staging load: snapshot the NWO projects parquet file from S3 into a Delta
-- table. Every source column is carried over unchanged, and each row is
-- stamped with the time of this ingest run.
CREATE OR REPLACE TABLE openalex.awards.nwo_projects_raw
USING DELTA
AS
SELECT
    *,
    CURRENT_TIMESTAMP() AS databricks_ingested_at
FROM
    parquet.`s3a://openalex-ingest/awards/nwo/nwo_projects.parquet`;

In [None]:
%sql
-- Sanity check: the staging table is expected to hold roughly 14.6K projects.
SELECT
    COUNT(*) AS total_projects
FROM openalex.awards.nwo_projects_raw;

In [None]:
%sql
-- Eyeball a handful of raw rows to confirm the columns loaded as expected.
SELECT
    *
FROM openalex.awards.nwo_projects_raw
LIMIT 5;

## Step 2: Create NWO Awards Table

In [None]:
%sql
-- Build the NWO awards table from the staged NWOpen projects.
-- One output row is produced per staged project that has a non-blank
-- project_id; every row is attached to the single NWO funder record.
CREATE OR REPLACE TABLE openalex.awards.nwo_awards
USING delta
AS
WITH
-- NWO funder record from OpenAlex, looked up by ROR ID.
-- ORDER BY makes the LIMIT 1 pick deterministic should the ROR ID ever match
-- more than one funder row.
nwo_funder AS (
    SELECT
        funder_id,
        display_name,
        ror_id,
        doi
    FROM openalex.common.funder
    WHERE ror_id = 'https://ror.org/04jsz6e67'  -- NWO (Dutch Research Council)
    ORDER BY funder_id
    LIMIT 1
),

-- Staged projects with a usable project_id, normalised once so that every
-- derived column below sees the same values:
--   * project_id is trimmed, matching the TRIM() used by the blank-ID filter
--   * grant_category is lower-cased once for the funding_type mapping
--   * start/end dates are parsed once (TRY_TO_DATE yields NULL when the raw
--     value does not parse as yyyy-MM-dd)
nwo_projects AS (
    SELECT
        TRIM(r.project_id) AS project_id,
        r.title,
        r.abstract,
        r.grant_category,
        LOWER(r.grant_category) AS grant_category_lc,
        TRY_TO_DATE(r.start_date, 'yyyy-MM-dd') AS start_date,
        TRY_TO_DATE(r.end_date, 'yyyy-MM-dd') AS end_date,
        r.pi_given_name,
        r.pi_family_name,
        r.pi_id,
        r.lead_org_name
    FROM openalex.awards.nwo_projects_raw AS r
    WHERE r.project_id IS NOT NULL
      AND TRIM(r.project_id) != ''
),

awards_transformed AS (
    SELECT
        -- Award ID: funder_id:project_id (project_id lower-cased).
        -- NOTE(review): uniqueness relies on project_id being unique in the
        -- staging table; no de-duplication is performed here.
        CONCAT(f.funder_id, ':', LOWER(g.project_id)) AS id,

        -- Display name = project title
        g.title AS display_name,

        -- Description = the staged `abstract` column, taken as-is
        g.abstract AS description,

        -- Funder info
        f.funder_id,
        g.project_id AS funder_award_id,

        -- No amount is populated for NWO awards; currency is fixed to EUR
        CAST(NULL AS DOUBLE) AS amount,
        'EUR' AS currency,

        -- Funder struct
        struct(
            CONCAT('https://openalex.org/F', f.funder_id) AS id,
            f.display_name,
            f.ror_id,
            f.doi
        ) AS funder,

        -- Funding type: substring match of well-known NWO scheme names in
        -- grant_category; the first matching branch wins, otherwise 'grant'.
        -- NOTE(review): these are unanchored LIKE patterns, so e.g. '%veni%'
        -- also matches any longer word containing "veni".
        CASE
            WHEN g.grant_category_lc LIKE '%veni%' THEN 'fellowship'
            WHEN g.grant_category_lc LIKE '%vidi%' THEN 'fellowship'
            WHEN g.grant_category_lc LIKE '%vici%' THEN 'fellowship'
            WHEN g.grant_category_lc LIKE '%fellowship%' THEN 'fellowship'
            WHEN g.grant_category_lc LIKE '%promotiebeurs%' THEN 'fellowship'
            WHEN g.grant_category_lc LIKE '%onderzoekstalent%' THEN 'fellowship'
            WHEN g.grant_category_lc LIKE '%graduate%' THEN 'training'
            WHEN g.grant_category_lc LIKE '%spinoza%' THEN 'prize'
            WHEN g.grant_category_lc LIKE '%stevin%' THEN 'prize'
            WHEN g.grant_category_lc LIKE '%gravitation%' THEN 'research'
            WHEN g.grant_category_lc LIKE '%zwaartekracht%' THEN 'research'
            WHEN g.grant_category_lc LIKE '%infrastructuur%' THEN 'infrastructure'
            WHEN g.grant_category_lc LIKE '%infrastructure%' THEN 'infrastructure'
            ELSE 'grant'
        END AS funding_type,

        -- Funder scheme = the raw grant_category value (original casing)
        g.grant_category AS funder_scheme,

        -- Provenance
        'nwopen' AS provenance,

        -- Dates (parsed to DATE in nwo_projects) and their calendar years
        g.start_date,
        g.end_date,
        YEAR(g.start_date) AS start_year,
        YEAR(g.end_date) AS end_year,

        -- Lead investigator: only populated when a family name is present
        CASE
            WHEN g.pi_family_name IS NOT NULL THEN
                struct(
                    g.pi_given_name AS given_name,
                    g.pi_family_name AS family_name,
                    -- presumably an ORCID URL or NULL; verify against nwo_to_s3.py
                    g.pi_id AS orcid,
                    CAST(NULL AS DATE) AS role_start,
                    struct(
                        g.lead_org_name AS name,
                        -- country is hard-coded for every lead organisation
                        'Netherlands' AS country,
                        CAST(NULL AS ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>) AS ids
                    ) AS affiliation
                )
            ELSE NULL
        END AS lead_investigator,

        -- Co-lead and other investigators are not populated for NWO; typed
        -- NULLs keep the column types fixed
        CAST(NULL AS STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >) AS co_lead_investigator,

        CAST(NULL AS ARRAY<STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >>) AS investigators,

        -- Landing page URL: NWO project page, with dots in the project_id
        -- replaced by dashes
        CONCAT('https://www.nwo.nl/projecten/', REPLACE(g.project_id, '.', '-')) AS landing_page_url,

        -- No DOI is populated for NWO awards
        CAST(NULL AS STRING) AS doi,

        -- Timestamps (both reset on every rebuild, since the table is replaced)
        current_timestamp() AS created_date,
        current_timestamp() AS updated_date

    FROM nwo_projects AS g
    -- If the funder lookup returns no row, this CROSS JOIN yields zero awards;
    -- check the row count after building.
    CROSS JOIN nwo_funder AS f
)

SELECT * FROM awards_transformed;

## Verification Queries

In [None]:
%sql
-- Sanity check: the awards table is expected to hold roughly 14.6K rows.
SELECT
    COUNT(*) AS total_nwo_awards
FROM openalex.awards.nwo_awards;

In [None]:
%sql
-- Spot-check a few awards: identifiers, scheme/type mapping, dates and PI.
SELECT
    a.id,
    a.display_name,
    a.funder_award_id,
    a.funder_scheme,
    a.funding_type,
    a.start_date,
    a.end_date,
    a.lead_investigator
FROM openalex.awards.nwo_awards AS a
LIMIT 10;

In [None]:
%sql
-- Awards per funder name; a single NWO row is the expected result.
SELECT
    funder.display_name,
    COUNT(*) AS cnt
FROM openalex.awards.nwo_awards
GROUP BY
    funder.display_name
ORDER BY
    cnt DESC;

In [None]:
%sql
-- Awards per derived funding_type, most common first.
SELECT
    funding_type,
    COUNT(*) AS cnt
FROM openalex.awards.nwo_awards
GROUP BY
    funding_type
ORDER BY
    cnt DESC;

In [None]:
%sql
-- The 20 most frequent funder_scheme values (awards without a scheme are skipped).
SELECT
    funder_scheme,
    COUNT(*) AS cnt
FROM openalex.awards.nwo_awards
WHERE funder_scheme IS NOT NULL
GROUP BY
    funder_scheme
ORDER BY
    cnt DESC
LIMIT 20;

In [None]:
%sql
-- Check data completeness: how many awards carry each optional field.
-- The percentage columns divide by NULLIF(COUNT(*), 0) so that an empty awards
-- table yields NULL percentages instead of a divide-by-zero error; an empty
-- table is exactly the failure this check should be able to show.
SELECT
    COUNT(*) AS total,
    COUNT(display_name) AS has_title,
    COUNT(description) AS has_abstract,
    COUNT(start_date) AS has_start_date,
    COUNT(end_date) AS has_end_date,
    COUNT(lead_investigator) AS has_pi,
    -- COUNT skips NULLs, so this counts awards whose PI has an ORCID and,
    -- like the other has_* columns, returns 0 (not NULL) on an empty table
    COUNT(lead_investigator.orcid) AS has_orcid,
    ROUND(COUNT(lead_investigator) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_with_pi,
    ROUND(COUNT(description) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_with_abstract
FROM openalex.awards.nwo_awards;

In [None]:
%sql
-- Awards per start year for the 20 most recent years that have a start date.
SELECT
    start_year,
    COUNT(*) AS cnt
FROM openalex.awards.nwo_awards
WHERE start_year IS NOT NULL
GROUP BY
    start_year
ORDER BY
    start_year DESC
LIMIT 20;

In [None]:
%sql
-- The 20 lead institutions with the most awards, by PI affiliation name.
SELECT
    a.lead_investigator.affiliation.name AS institution,
    COUNT(*) AS cnt
FROM openalex.awards.nwo_awards AS a
WHERE a.lead_investigator.affiliation.name IS NOT NULL
GROUP BY
    a.lead_investigator.affiliation.name
ORDER BY
    cnt DESC
LIMIT 20;

In [None]:
%sql
-- Reference only: peek at the JSON payload columns that remain in the raw
-- staging table and are not carried into the awards table.
SELECT
    r.project_id,
    r.products_json,
    r.summary_updates_json
FROM openalex.awards.nwo_projects_raw AS r
WHERE r.products_json IS NOT NULL
LIMIT 3;