# Create ERC Awards

Creates awards from the European Research Council (ERC) grants data from CORDIS.

**Prerequisites:**
- Run `scripts/local/erc_to_s3.py` to download and upload the data first.

**Data sources:**
- Horizon Europe (2021-2027): CORDIS
- Horizon 2020 (2014-2020): CORDIS
- FP7 (2007-2013): CORDIS (optional; retry the download if it fails)

**S3 location:** `s3a://openalex-ingest/awards/erc/erc_projects.parquet`

**ERC funder (main):**
- funder_id: 4320334678
- display_name: "European Research Council"
- ROR: https://ror.org/0472cxd90
- DOI: 10.13039/501100000781

**Sub-funders (by framework programme):**
- H2020 ERC: 4320338335 (10.13039/100010663)
- FP7 Ideas ERC: 4320338352 (10.13039/100011199)
- Horizon Europe ERC: 4320338453 (10.13039/100019180)

**ERC Grant Types:**
- ERC-STG / HORIZON-ERC: Starting Grants (early-career researchers)
- ERC-COG: Consolidator Grants (mid-career researchers)
- ERC-ADG: Advanced Grants (established researchers)
- ERC-SyG / HORIZON-ERC-SYG: Synergy Grants (collaborative teams)
- ERC-POC / HORIZON-ERC-POC: Proof of Concept (commercialization)

**Notes:**
- Currency is EUR (Euros)
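- The award amount is the EC contribution (the actual ERC funding), not the project's total cost
- The host institution is the coordinator organization; CORDIS provides no PI name, so the host is recorded as the lead investigator's affiliation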

## Step 1: Create Staging Table from S3

In [None]:
%sql
-- Stage the ERC projects parquet file from S3 as a Delta table.
-- Every source column is carried over unchanged; one extra column records
-- when this notebook loaded the data.
CREATE OR REPLACE TABLE openalex.awards.erc_raw
USING DELTA
AS
SELECT
    src.*,
    current_timestamp() AS databricks_ingested_at
FROM parquet.`s3a://openalex-ingest/awards/erc/erc_projects.parquet` AS src;

In [None]:
%sql
-- Sanity check: number of rows staged from the parquet file
SELECT
    COUNT(*) AS total_grants
FROM openalex.awards.erc_raw;

In [None]:
%sql
-- Eyeball a handful of staged rows (all columns)
SELECT
    *
FROM openalex.awards.erc_raw
LIMIT 5;

In [None]:
%sql
-- List the staged table's columns and their types
DESCRIBE TABLE openalex.awards.erc_raw;

## Step 2: Create ERC Awards Table

In [None]:
%sql
-- Build openalex.awards.erc_awards: one award row per staged ERC grant that has
-- a non-blank project_id, shaped for insertion into openalex_awards_raw.
CREATE OR REPLACE TABLE openalex.awards.erc_awards
USING delta
AS
WITH
-- Lookup: framework programme -> programme-specific ERC sub-funder.
-- Grants whose programme is not listed here fall back to the main ERC funder
-- (4320334678, 'European Research Council', DOI 10.13039/501100000781).
erc_funders AS (
    SELECT * FROM (
        VALUES
        ('H2020', 4320338335, 'H2020 European Research Council', '10.13039/100010663'),
        ('FP7', 4320338352, 'FP7 Ideas: European Research Council', '10.13039/100011199'),
        ('HORIZON', 4320338453, 'HORIZON EUROPE European Research Council', '10.13039/100019180')
    ) AS t(framework_programme, funder_id, display_name, doi)
),

-- Usable raw grants with the funder resolved and the dates parsed exactly once.
-- Only the raw columns the transformation needs are carried forward.
grants_resolved AS (
    SELECT
        g.project_id,
        g.title,
        g.abstract,
        g.ec_contribution,
        g.funding_scheme,
        g.host_institution,
        g.host_country,
        g.grant_doi,
        -- Unparseable dates become NULL rather than failing the build
        TRY_TO_DATE(g.start_date, 'yyyy-MM-dd') AS parsed_start_date,
        TRY_TO_DATE(g.end_date, 'yyyy-MM-dd') AS parsed_end_date,
        -- Sub-funder when the programme matched, main ERC funder otherwise
        COALESCE(f.funder_id, 4320334678) AS resolved_funder_id,
        COALESCE(f.display_name, 'European Research Council') AS resolved_funder_name,
        COALESCE(f.doi, '10.13039/501100000781') AS resolved_funder_doi
    FROM openalex.awards.erc_raw g
    -- NOTE(review): exact, case-sensitive match; a programme value spelled
    -- differently from the lookup keys silently falls back to the main funder.
    LEFT JOIN erc_funders f ON g.framework_programme = f.framework_programme
    WHERE g.project_id IS NOT NULL
      AND TRIM(g.project_id) != ''
),

-- Award id = abs(xxhash64('<funder_id>:<lowercased project_id>')) mod 9e9.
-- Computed once here so `id` and `works_api_url` can never disagree.
-- NOTE(review): project_id is filtered on its trimmed value above but hashed
-- untrimmed, which keeps previously generated ids unchanged; confirm the raw
-- values carry no surrounding whitespace.
grants_keyed AS (
    SELECT
        r.*,
        abs(xxhash64(CONCAT(r.resolved_funder_id, ':', LOWER(r.project_id)))) % 9000000000 AS award_id
    FROM grants_resolved r
),

awards_transformed AS (
    SELECT
        k.award_id AS id,

        -- Display name = project title
        k.title AS display_name,

        -- Description from abstract field
        k.abstract AS description,

        -- Funder info - specific sub-funder based on framework programme
        k.resolved_funder_id AS funder_id,
        k.project_id AS funder_award_id,

        -- Amount in EUR - EC contribution (not total cost).
        -- A comma is rewritten to a dot before the numeric cast; values that
        -- still do not parse become NULL.
        TRY_CAST(REPLACE(k.ec_contribution, ',', '.') AS DOUBLE) AS amount,
        'EUR' AS currency,

        -- Funder struct (the ROR is the main ERC ROR for every sub-funder)
        struct(
            CONCAT('https://openalex.org/F', k.resolved_funder_id) AS id,
            k.resolved_funder_name AS display_name,
            'https://ror.org/0472cxd90' AS ror_id,
            k.resolved_funder_doi AS doi
        ) AS funder,

        -- Funding type - substring match on the upper-cased funding scheme;
        -- the first matching branch wins, anything unmatched is 'research'.
        CASE
            WHEN UPPER(k.funding_scheme) LIKE '%STG%' THEN 'fellowship'
            WHEN UPPER(k.funding_scheme) LIKE '%COG%' THEN 'fellowship'
            WHEN UPPER(k.funding_scheme) LIKE '%ADG%' THEN 'fellowship'
            WHEN UPPER(k.funding_scheme) LIKE '%SYG%' THEN 'research'
            WHEN UPPER(k.funding_scheme) LIKE '%POC%' THEN 'commercialization'
            -- NOTE(review): 'LVG' is not among the grant types documented in
            -- this notebook; confirm that 'travel' is the intended mapping.
            WHEN UPPER(k.funding_scheme) LIKE '%LVG%' THEN 'travel'
            ELSE 'research'
        END AS funding_type,

        -- Funder scheme = funding scheme (e.g., ERC-STG, ERC-COG, HORIZON-ERC)
        k.funding_scheme AS funder_scheme,

        -- Provenance
        'erc' AS provenance,

        -- Dates and their years (NULL when the raw value did not parse)
        k.parsed_start_date AS start_date,
        k.parsed_end_date AS end_date,
        YEAR(k.parsed_start_date) AS start_year,
        YEAR(k.parsed_end_date) AS end_year,

        -- No PI name in CORDIS data, but we have host institution: emit a lead
        -- investigator whose only populated part is the affiliation.
        CASE
            WHEN k.host_institution IS NOT NULL THEN
                struct(
                    CAST(NULL AS STRING) AS given_name,
                    CAST(NULL AS STRING) AS family_name,
                    CAST(NULL AS STRING) AS orcid,
                    CAST(NULL AS DATE) AS role_start,
                    struct(
                        k.host_institution AS name,
                        k.host_country AS country,
                        CAST(NULL AS ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>) AS ids
                    ) AS affiliation
                )
            ELSE NULL
        END AS lead_investigator,

        -- No co-lead or other investigators from CORDIS: typed NULLs keep the
        -- column types explicit.
        CAST(NULL AS STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >) AS co_lead_investigator,

        CAST(NULL AS ARRAY<STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >>) AS investigators,

        -- Landing page URL - construct CORDIS project page
        CONCAT('https://cordis.europa.eu/project/id/', k.project_id) AS landing_page_url,

        -- Grant DOI: NULL when missing or blank, otherwise the trimmed value
        -- (previously a whitespace-padded DOI was stored with its padding).
        NULLIF(TRIM(k.grant_doi), '') AS doi,

        -- Works API URL, keyed by the same award id as `id`
        concat('https://api.openalex.org/works?filter=awards.id:G', k.award_id) AS works_api_url,

        -- Timestamps (both reset to the build time whenever this table is rebuilt)
        current_timestamp() AS created_date,
        current_timestamp() AS updated_date

    FROM grants_keyed k
)

SELECT * FROM awards_transformed;

In [None]:
%sql
-- Refresh the ERC slice of openalex_awards_raw in a single atomic commit.
-- REPLACE WHERE deletes the rows matching the predicate and inserts the query
-- result in one Delta transaction, so a failed load can no longer leave the
-- table with the old ERC rows removed and nothing written in their place
-- (which a separate DELETE followed by INSERT could do).
-- Every inserted row satisfies the predicate: provenance is 'erc' in
-- erc_awards and priority is the literal 21 below.
-- Requires Databricks SQL or Databricks Runtime 12.2 LTS and above.
-- Columns are matched to the target table by position, so the order of this
-- select list must follow the target table's column order.
INSERT INTO openalex.awards.openalex_awards_raw
REPLACE WHERE provenance = 'erc' AND priority = 21
SELECT
    id,
    display_name,
    description,
    funder_id,
    funder_award_id,
    amount,
    currency,
    funder,
    funding_type,
    funder_scheme,
    provenance,
    start_date,
    end_date,
    start_year,
    end_year,
    lead_investigator,
    co_lead_investigator,
    investigators,
    landing_page_url,
    doi,
    works_api_url,
    created_date,
    updated_date,
    21 AS priority
FROM openalex.awards.erc_awards;

## Verification Queries

In [None]:
%sql
-- Number of awards produced by the transformation
SELECT
    COUNT(*) AS total_erc_awards
FROM openalex.awards.erc_awards;

In [None]:
%sql
-- Preview ten awards, flattening the host institution out of the
-- lead_investigator struct for readability
SELECT
    a.id,
    a.display_name,
    a.funder_award_id,
    a.funder_scheme,
    a.funding_type,
    a.amount,
    a.currency,
    a.start_date,
    a.end_date,
    a.lead_investigator.affiliation.name AS host_institution,
    a.lead_investigator.affiliation.country AS host_country,
    a.landing_page_url
FROM openalex.awards.erc_awards AS a
LIMIT 10;

In [None]:
%sql
-- Awards and total funding (billions of EUR) per resolved funder name
WITH funder_amounts AS (
    SELECT
        funder.display_name AS display_name,
        amount
    FROM openalex.awards.erc_awards
)
SELECT
    display_name,
    COUNT(*) AS cnt,
    ROUND(SUM(amount) / 1e9, 2) AS total_billions_eur
FROM funder_amounts
GROUP BY display_name
ORDER BY cnt DESC;

In [None]:
%sql
-- How many awards landed in each mapped funding_type
SELECT
    a.funding_type,
    COUNT(*) AS cnt
FROM openalex.awards.erc_awards AS a
GROUP BY a.funding_type
ORDER BY cnt DESC;

In [None]:
%sql
-- The 20 most common funding schemes, with total (billions of EUR) and
-- average (millions of EUR) award size; awards without a scheme are excluded
SELECT
    a.funder_scheme,
    COUNT(*) AS cnt,
    ROUND(SUM(a.amount) / 1e9, 2) AS total_billions_eur,
    ROUND(AVG(a.amount) / 1e6, 2) AS avg_millions_eur
FROM openalex.awards.erc_awards AS a
WHERE a.funder_scheme IS NOT NULL
GROUP BY a.funder_scheme
ORDER BY cnt DESC
LIMIT 20;

In [None]:
%sql
-- Field completeness: non-NULL counts per key field, the same as a percentage
-- of all awards, and the overall funding total in billions of EUR
WITH field_counts AS (
    SELECT
        COUNT(*) AS total,
        COUNT(display_name) AS has_title,
        COUNT(description) AS has_description,
        COUNT(amount) AS has_amount,
        COUNT(start_date) AS has_start_date,
        COUNT(end_date) AS has_end_date,
        COUNT(lead_investigator) AS has_host_institution,
        SUM(amount) AS amount_sum
    FROM openalex.awards.erc_awards
)
SELECT
    total,
    has_title,
    has_description,
    has_amount,
    has_start_date,
    has_end_date,
    has_host_institution,
    ROUND(has_title * 100.0 / total, 1) AS pct_title,
    ROUND(has_description * 100.0 / total, 1) AS pct_description,
    ROUND(has_amount * 100.0 / total, 1) AS pct_amount,
    ROUND(has_host_institution * 100.0 / total, 1) AS pct_host_institution,
    ROUND(amount_sum / 1e9, 2) AS total_billions_eur
FROM field_counts;

In [None]:
%sql
-- Awards and funding (billions of EUR) for the 20 most recent start years;
-- awards with no parsed start year are left out
SELECT
    a.start_year,
    COUNT(*) AS cnt,
    ROUND(SUM(a.amount) / 1e9, 2) AS total_billions_eur
FROM openalex.awards.erc_awards AS a
WHERE a.start_year IS NOT NULL
GROUP BY a.start_year
ORDER BY a.start_year DESC
LIMIT 20;

In [None]:
%sql
-- The 20 host countries with the most funding (billions of EUR)
WITH country_amounts AS (
    SELECT
        lead_investigator.affiliation.country AS country,
        amount
    FROM openalex.awards.erc_awards
)
SELECT
    country,
    COUNT(*) AS grant_count,
    ROUND(SUM(amount) / 1e9, 2) AS total_billions_eur
FROM country_amounts
WHERE country IS NOT NULL
GROUP BY country
ORDER BY total_billions_eur DESC
LIMIT 20;

In [None]:
%sql
-- The 25 host institutions with the most funding (millions of EUR)
WITH institution_amounts AS (
    SELECT
        lead_investigator.affiliation.name AS institution,
        amount
    FROM openalex.awards.erc_awards
)
SELECT
    institution,
    COUNT(*) AS grant_count,
    ROUND(SUM(amount) / 1e6, 1) AS total_millions_eur
FROM institution_amounts
WHERE institution IS NOT NULL
GROUP BY institution
ORDER BY total_millions_eur DESC
LIMIT 25;