# Create Independent Research Fund Denmark Awards

Creates Independent Research Fund Denmark (DFF) awards from Research Portal Denmark data (~4K projects).

**Prerequisites:**
- The data has been downloaded from forskningsportal.dk via Playwright browser automation
- The upload script has been run to push the data to S3

**Data source:** https://grants.forskningsportal.dk/  
**S3 location:** `s3a://openalex-ingest/awards/independent_research_fund_denmark/independent_research_fund_denmark.jsonl`

**Independent Research Fund Denmark funder:**
- funder_id: 4320322928
- ROR: https://ror.org/02sptwz63
- DOI: 10.13039/501100004836
- display_name: "Danmarks Frie Forskningsfond"

## Step 1: Create Staging Table from S3

In [None]:
%sql
-- Stage the JSON Lines export from S3 as a Delta table.
-- Every source column is kept as-is; databricks_ingested_at records when the
-- row was loaded by this statement.
CREATE OR REPLACE TABLE openalex.awards.independent_research_fund_denmark_raw
USING DELTA AS
SELECT
    src.*,
    current_timestamp() AS databricks_ingested_at
FROM json.`s3a://openalex-ingest/awards/independent_research_fund_denmark/independent_research_fund_denmark.jsonl` AS src;

In [None]:
%sql
-- Sanity check: the staging table is expected to hold roughly 4K rows
SELECT
    COUNT(*) AS total_projects
FROM openalex.awards.independent_research_fund_denmark_raw;

In [None]:
%sql
-- List the staging table's columns and their data types
DESCRIBE TABLE openalex.awards.independent_research_fund_denmark_raw;

In [None]:
%sql
-- Peek at a handful of staged rows (no ordering, so the rows shown may vary)
SELECT *
FROM openalex.awards.independent_research_fund_denmark_raw
LIMIT 5;

## Step 2: Create Independent Research Fund Denmark Awards Table

In [None]:
%sql
-- Build the DFF awards table from the staged Research Portal Denmark rows.
--
-- Inputs:
--   openalex.awards.independent_research_fund_denmark_raw  (staging rows)
--   openalex.common.funder                                 (funder metadata)
-- Output:
--   openalex.awards.independent_research_fund_denmark_awards, at most one row
--   per grant id, built from that grant's 'Principal investigator' row.
--
-- NOTE(review): grants that have no 'Principal investigator' row are excluded
-- by the WHERE clause below -- confirm that this is intended.
CREATE OR REPLACE TABLE openalex.awards.independent_research_fund_denmark_awards
USING delta
AS
WITH
-- Funder metadata for Independent Research Fund Denmark. The CROSS JOIN below
-- relies on this returning exactly one row; if the funder is missing the
-- resulting awards table is empty.
dff_funder AS (
    SELECT
        funder_id,
        display_name,
        ror_id,
        doi
    FROM openalex.common.funder
    WHERE funder_id = 4320322928  -- Independent Research Fund Denmark
),

awards_transformed AS (
    SELECT
        -- Award id: xxhash64 of '<funder_id>:<lower-cased grant id>', made
        -- non-negative with abs() and reduced modulo 9000000000
        abs(xxhash64(CONCAT(f.funder_id, ':', LOWER(CAST(g.`Grant Id` AS STRING))))) % 9000000000 as id,

        -- Display name = project title
        TRIM(g.`Title`) as display_name,

        -- Description = abstract
        TRIM(g.`Abstract`) as description,

        -- Funder info
        f.funder_id,
        g.`Grant Id` as funder_award_id,

        -- Amount in DKK; values that cannot be parsed as a number become NULL
        TRY_CAST(g.`Amount Granted` AS DOUBLE) as amount,
        'DKK' as currency,

        -- Funder struct
        struct(
            CONCAT('https://openalex.org/F', f.funder_id) as id,
            f.display_name,
            f.ror_id,
            f.doi
        ) as funder,

        -- Funding type: fellowship-like instruments map to 'fellowship';
        -- everything else (including 'individual' and 'research project'
        -- types, and a NULL Funding Type) falls through to 'grant'
        CASE
            WHEN LOWER(g.`Funding Type`) LIKE '%fellowship%'
              OR LOWER(g.`Funding Type`) LIKE '%postdoctoral%'
              OR LOWER(g.`Funding Type`) LIKE '%sapere aude%' THEN 'fellowship'
            ELSE 'grant'
        END as funding_type,

        -- Funder scheme = funding instrument, falling back to the call
        COALESCE(g.`Funder Specific Instrument`, g.`Call`) as funder_scheme,

        -- Provenance
        'forskningsportal_dk' as provenance,

        -- Dates
        TRY_TO_DATE(g.`Grant Start Date`, 'yyyy-MM-dd') as start_date,
        TRY_TO_DATE(g.`Grant End Date`, 'yyyy-MM-dd') as end_date,
        TRY_CAST(g.`Grant Year` AS INT) as start_year,
        TRY_CAST(YEAR(TRY_TO_DATE(g.`Grant End Date`, 'yyyy-MM-dd')) AS INT) as end_year,

        -- Lead investigator. The WHERE clause already restricts rows to
        -- 'Principal investigator', so only the last name needs checking here.
        CASE
            WHEN g.`Last name` IS NOT NULL THEN
                struct(
                    g.`First name` as given_name,
                    g.`Last name` as family_name,
                    NULLIF(TRIM(g.`ORCID`), '') as orcid,
                    CAST(NULL AS DATE) as role_start,
                    struct(
                        g.`Organisation name` as name,
                        g.`Organisation country` as country,
                        CASE
                            WHEN g.`Organisation ROR` IS NOT NULL THEN
                                array(struct(
                                    g.`Organisation ROR` as id,
                                    'ror' as type,
                                    'funder' as asserted_by
                                ))
                            ELSE CAST(NULL AS ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>)
                        END as ids
                    ) as affiliation
                )
            ELSE NULL
        END as lead_investigator,

        -- Co-lead and other investigators are not populated by this notebook;
        -- typed NULLs keep the column types explicit
        CAST(NULL AS STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >) as co_lead_investigator,

        CAST(NULL AS ARRAY<STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >>) as investigators,

        -- Landing page URL
        g.`Project URL` as landing_page_url,

        -- DOI for grant (blank strings become NULL)
        NULLIF(TRIM(g.`Grant DOI`), '') as doi,

        -- Timestamps
        current_timestamp() as created_date,
        current_timestamp() as updated_date

    FROM openalex.awards.independent_research_fund_denmark_raw g
    CROSS JOIN dff_funder f
    WHERE g.`Grant Id` IS NOT NULL
      AND g.`Person role` = 'Principal investigator'  -- Only PI rows carry the award
    -- Keep exactly one PI row per grant so the hashed id stays unique even when
    -- a grant has several 'Principal investigator' rows. The partition key is
    -- the same normalised grant id that feeds the hash. Rows with an ORCID,
    -- then rows with an organisation ROR, are preferred; the remaining keys
    -- only make the choice repeatable.
    QUALIFY ROW_NUMBER() OVER (
        PARTITION BY LOWER(CAST(g.`Grant Id` AS STRING))
        ORDER BY
            CASE WHEN NULLIF(TRIM(g.`ORCID`), '') IS NOT NULL THEN 0 ELSE 1 END,
            CASE WHEN g.`Organisation ROR` IS NOT NULL THEN 0 ELSE 1 END,
            g.`Last name`,
            g.`First name`,
            g.`Organisation name`
    ) = 1
)

-- Explicit column list (same names and order as before); works_api_url is
-- derived from the id computed above instead of repeating the hash expression.
SELECT
    id,
    display_name,
    description,
    funder_id,
    funder_award_id,
    amount,
    currency,
    funder,
    funding_type,
    funder_scheme,
    provenance,
    start_date,
    end_date,
    start_year,
    end_year,
    lead_investigator,
    co_lead_investigator,
    investigators,
    landing_page_url,
    doi,
    concat('https://api.openalex.org/works?filter=awards.id:G', id) as works_api_url,
    created_date,
    updated_date
FROM awards_transformed;

In [None]:
%sql
-- Refresh this source's rows in openalex_awards_raw (priority 30).
--
-- INSERT ... REPLACE WHERE removes the rows matching the predicate and inserts
-- the new ones in a single Delta operation. With the earlier separate DELETE
-- and INSERT statements, a failed INSERT left the table without any rows for
-- this source until the cell was re-run.
--
-- Every inserted row satisfies the predicate: provenance is set to
-- 'forskningsportal_dk' when the awards table is built and priority is the
-- literal 30 below.
--
-- No target column list is given, so columns are matched by position; the
-- SELECT order must follow the column order of openalex_awards_raw.
INSERT INTO openalex.awards.openalex_awards_raw
REPLACE WHERE provenance = 'forskningsportal_dk' AND priority = 30
SELECT
    id,
    display_name,
    description,
    funder_id,
    funder_award_id,
    amount,
    currency,
    funder,
    funding_type,
    funder_scheme,
    provenance,
    start_date,
    end_date,
    start_year,
    end_year,
    lead_investigator,
    co_lead_investigator,
    investigators,
    landing_page_url,
    doi,
    works_api_url,
    created_date,
    updated_date,
    30 as priority
FROM openalex.awards.independent_research_fund_denmark_awards;

## Verification Queries

In [None]:
%sql
-- Sanity check: the awards table is expected to hold roughly 4K rows
SELECT
    COUNT(*) AS total_awards
FROM openalex.awards.independent_research_fund_denmark_awards;

In [None]:
%sql
-- Eyeball ten awards: identifiers, classification, amount, years and PI
-- (no ordering, so the rows shown may vary between runs)
SELECT
    awards.id,
    awards.display_name,
    awards.funder_award_id,
    awards.funder_scheme,
    awards.funding_type,
    awards.amount,
    awards.start_year,
    awards.end_year,
    awards.lead_investigator
FROM openalex.awards.independent_research_fund_denmark_awards AS awards
LIMIT 10;

In [None]:
%sql
-- Number of awards per funding_type, most common first
SELECT
    awards.funding_type,
    COUNT(*) AS cnt
FROM openalex.awards.independent_research_fund_denmark_awards AS awards
GROUP BY awards.funding_type
ORDER BY cnt DESC;

In [None]:
%sql
-- The 20 most common funder schemes (awards without a scheme are excluded)
SELECT
    awards.funder_scheme,
    COUNT(*) AS cnt
FROM openalex.awards.independent_research_fund_denmark_awards AS awards
WHERE awards.funder_scheme IS NOT NULL
GROUP BY awards.funder_scheme
ORDER BY cnt DESC
LIMIT 20;

In [None]:
%sql
-- Check data completeness: how many awards carry each optional field, the
-- share of awards with a PI / abstract / ORCID, and total funding in millions
-- of DKK.
--
-- COUNT(col) counts non-NULL values, so has_orcid is 0 (not NULL) when the
-- table is empty, like the other has_* columns. The percentages divide by
-- NULLIF(COUNT(*), 0) so an empty table yields NULL percentages instead of a
-- divide-by-zero error.
SELECT
    COUNT(*) as total,
    COUNT(display_name) as has_title,
    COUNT(description) as has_abstract,
    COUNT(amount) as has_amount,
    COUNT(start_date) as has_start_date,
    COUNT(end_date) as has_end_date,
    COUNT(lead_investigator) as has_pi,
    COUNT(lead_investigator.orcid) as has_orcid,
    ROUND(COUNT(lead_investigator) * 100.0 / NULLIF(COUNT(*), 0), 1) as pct_with_pi,
    ROUND(COUNT(description) * 100.0 / NULLIF(COUNT(*), 0), 1) as pct_with_abstract,
    ROUND(COUNT(lead_investigator.orcid) * 100.0 / NULLIF(COUNT(*), 0), 1) as pct_with_orcid,
    ROUND(SUM(amount)/1000000, 0) as total_funding_millions_dkk
FROM openalex.awards.independent_research_fund_denmark_awards;

In [None]:
%sql
-- Awards and funding (millions of DKK) per start year, for the 15 most recent
-- start years present in the table
SELECT
    awards.start_year,
    COUNT(*) AS cnt,
    ROUND(SUM(awards.amount)/1000000, 1) AS funding_millions_dkk
FROM openalex.awards.independent_research_fund_denmark_awards AS awards
WHERE awards.start_year IS NOT NULL
GROUP BY awards.start_year
ORDER BY awards.start_year DESC
LIMIT 15;

In [None]:
%sql
-- The 20 institutions that appear most often as the lead investigator's
-- affiliation (awards without an affiliation name are excluded)
SELECT
    awards.lead_investigator.affiliation.name AS institution,
    COUNT(*) AS cnt
FROM openalex.awards.independent_research_fund_denmark_awards AS awards
WHERE awards.lead_investigator.affiliation.name IS NOT NULL
GROUP BY awards.lead_investigator.affiliation.name
ORDER BY cnt DESC
LIMIT 20;