# Create GACR (Grantová Agentura České Republiky) Awards

Creates Czech Science Foundation (GACR) grant awards from the IS VaVaI (Czech Research Information System) open data. The source contains ~21.6K GACR projects.

**Prerequisites:**
- Run `scripts/local/gacr_to_s3.py` to download and upload the data first.

**Data source:** https://www.isvavai.cz/opendata  
**CEP-projekty.csv:** All Czech research projects (filtered for GA0 = GACR)  
**S3 location:** `s3a://openalex-ingest/awards/gacr/gacr_projects.parquet`

**GACR funder:**
- funder_id: 4320321006
- ROR: https://ror.org/01pv73b02
- DOI: 10.13039/501100001824
- display_name: "Grantová Agentura České Republiky"

**Data notes:**
- Years 1993-2025
- Currency: CZK (Czech Koruna)
- ~62% have ROR IDs for institutions
- Older projects (pre-2010) often lack funding amounts

## Step 1: Create Staging Table from S3

In [0]:
%sql
-- Stage the GACR parquet export from S3 as a Delta table.
-- Every source column is kept as-is; databricks_ingested_at records when the
-- row was loaded.
CREATE OR REPLACE TABLE openalex.awards.gacr_raw
    USING DELTA
AS
SELECT
    *,
    CURRENT_TIMESTAMP() AS databricks_ingested_at
FROM
    parquet.`s3a://openalex-ingest/awards/gacr/gacr_projects.parquet`;

In [0]:
%sql
-- Sanity check: the staging table should hold roughly 21.6K projects
SELECT
    COUNT(*) AS total_projects
FROM openalex.awards.gacr_raw;

In [0]:
%sql
-- List the staging table's columns and their data types
DESCRIBE TABLE openalex.awards.gacr_raw;

In [0]:
%sql
-- Eyeball a handful of staged rows (no ORDER BY, so the rows shown are arbitrary)
SELECT *
FROM openalex.awards.gacr_raw
LIMIT 5;

## Step 2: Create GACR Awards Table

In [0]:
%sql
-- Build the GACR awards table: one output row per staged project that has a
-- project_code, enriched with the GACR funder record.
CREATE OR REPLACE TABLE openalex.awards.gacr_awards
USING delta
AS
WITH
-- GACR funder record from OpenAlex.
-- NOTE(review): assumes funder_id is unique in openalex.common.funder. With zero
-- matches the CROSS JOIN below yields an empty table; with several matches it
-- multiplies every project row.
gacr_funder AS (
    SELECT
        funder_id,
        display_name,
        ror_id,
        doi
    FROM openalex.common.funder
    WHERE funder_id = 4320321006  -- Grantová Agentura České Republiky
),

awards_transformed AS (
    SELECT
        -- Award ID: xxhash64 of "funder_id:lower(project_code)", folded into [0, 9e9).
        -- The same expression is repeated in works_api_url below; keep them in sync.
        abs(xxhash64(CONCAT(f.funder_id, ':', LOWER(g.project_code)))) % 9000000000 AS id,

        -- Display name = project title (prefer English, then Czech, then generic).
        -- Every candidate is blank-guarded so an empty string falls through to the
        -- next candidate instead of becoming the title.
        COALESCE(
            NULLIF(TRIM(g.title_en), ''),
            NULLIF(TRIM(g.title_cs), ''),
            NULLIF(TRIM(g.title), '')
        ) AS display_name,

        -- Description (same preference order and blank handling as the title)
        COALESCE(
            NULLIF(TRIM(g.description_en), ''),
            NULLIF(TRIM(g.description_cs), ''),
            NULLIF(TRIM(g.description), '')
        ) AS description,

        -- Funder info
        f.funder_id,
        g.project_code AS funder_award_id,

        -- Amount in CZK (use state support if available, else total cost)
        COALESCE(g.funding_amount, g.total_cost) AS amount,
        'CZK' AS currency,

        -- Funder struct
        struct(
            CONCAT('https://openalex.org/F', f.funder_id) as id,
            f.display_name,
            f.ror_id,
            f.doi
        ) AS funder,

        -- Funding type: junior (GJ) and postdoc (GP) programs are fellowships;
        -- every other program code, including NULL or unknown ones, is a grant.
        CASE
            WHEN UPPER(g.program_code) IN ('GJ', 'GP') THEN 'fellowship'
            ELSE 'grant'
        END AS funding_type,

        -- Funder scheme = human-readable program name. Matched case-insensitively,
        -- consistent with funding_type above; unknown codes pass through unchanged.
        -- NOTE(review): confirm these labels against GACR's official program list.
        CASE UPPER(g.program_code)
            WHEN 'GA' THEN 'Standard Projects'
            WHEN 'GJ' THEN 'Junior Grants'
            WHEN 'GP' THEN 'Postdoc Grants'
            WHEN 'GX' THEN 'EXPRO Excellence Projects'
            WHEN 'GM' THEN 'International Projects'
            WHEN 'GC' THEN 'International Collaboration'
            WHEN 'GB' THEN 'Center of Excellence'
            WHEN 'GD' THEN 'Bilateral Projects'
            WHEN 'GF' THEN 'LA Projects'
            ELSE g.program_code
        END AS funder_scheme,

        -- Provenance
        'isvavai_cep' AS provenance,

        -- Dates: use the full date when it parses, otherwise fall back to
        -- Jan 1 / Dec 31 of the year. COALESCE (rather than an IS NOT NULL test)
        -- makes the fallback also apply when the date is present but unparseable.
        -- The year is normalised through TRY_CAST so the concatenated string is
        -- always "<int>-MM-dd" (or NULL when the year is not an integer).
        COALESCE(
            TRY_TO_DATE(g.start_date, 'yyyy-MM-dd'),
            TRY_TO_DATE(CONCAT(CAST(TRY_CAST(g.start_year AS INT) AS STRING), '-01-01'), 'yyyy-MM-dd')
        ) AS start_date,
        COALESCE(
            TRY_TO_DATE(g.end_date, 'yyyy-MM-dd'),
            TRY_TO_DATE(CONCAT(CAST(TRY_CAST(g.end_year AS INT) AS STRING), '-12-31'), 'yyyy-MM-dd')
        ) AS end_date,
        TRY_CAST(g.start_year AS INT) AS start_year,
        TRY_CAST(g.end_year AS INT) AS end_year,

        -- Lead investigator: this dataset has no PI names, only the institution,
        -- so the person fields are NULL and only the affiliation is populated.
        CASE
            WHEN g.institution_name IS NOT NULL THEN
                struct(
                    CAST(NULL AS STRING) as given_name,
                    CAST(NULL AS STRING) as family_name,
                    CAST(NULL AS STRING) as orcid,
                    CAST(NULL AS DATE) as role_start,
                    struct(
                        g.institution_name as name,
                        CASE g.institution_country
                            WHEN 'CZ' THEN 'Czech Republic'
                            ELSE g.institution_country
                        END as country,
                        CASE
                            WHEN g.ror_id IS NOT NULL THEN
                                array(struct(g.ror_id as id, 'ror' as type, 'isvavai' as asserted_by))
                            ELSE CAST(NULL AS ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>)
                        END as ids
                    ) as affiliation
                )
            ELSE NULL
        END AS lead_investigator,

        -- Co-lead and other investigators are not available; typed NULLs keep the
        -- column types explicit.
        CAST(NULL AS STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >) AS co_lead_investigator,

        CAST(NULL AS ARRAY<STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >>) AS investigators,

        -- Landing page URL
        g.landing_page_url,

        -- No DOI for GACR grants
        CAST(NULL AS STRING) AS doi,

        -- Works API URL (embeds the same hash-derived award ID as the id column)
        concat('https://api.openalex.org/works?filter=awards.id:G', abs(xxhash64(CONCAT(f.funder_id, ':', LOWER(g.project_code)))) % 9000000000) AS works_api_url,

        -- Timestamps
        current_timestamp() AS created_date,
        current_timestamp() AS updated_date

    FROM openalex.awards.gacr_raw g
    CROSS JOIN gacr_funder f
    -- project_code feeds both the award ID and funder_award_id, so rows without
    -- one are dropped.
    WHERE g.project_code IS NOT NULL
)

SELECT * FROM awards_transformed;

## Step 3: Insert into openalex_awards_raw

In [0]:
%sql
-- Atomically swap in the fresh GACR rows. REPLACE WHERE deletes every existing
-- row matching the predicate and inserts the query result in a single Delta
-- transaction, so a failed load cannot leave openalex_awards_raw without GACR
-- awards (which a separate DELETE followed by INSERT could).
-- Every inserted row must satisfy the predicate: gacr_awards.provenance is
-- always 'isvavai_cep' and priority is the literal 34 below.
-- Columns are mapped by position, so the SELECT order must match the target
-- table's column order.
INSERT INTO openalex.awards.openalex_awards_raw
REPLACE WHERE provenance = 'isvavai_cep' AND priority = 34
SELECT
    id,
    display_name,
    description,
    funder_id,
    funder_award_id,
    amount,
    currency,
    funder,
    funding_type,
    funder_scheme,
    provenance,
    start_date,
    end_date,
    start_year,
    end_year,
    lead_investigator,
    co_lead_investigator,
    investigators,
    landing_page_url,
    doi,
    works_api_url,
    created_date,
    updated_date,
    34 AS priority
FROM openalex.awards.gacr_awards;

## Verification Queries

In [0]:
%sql
-- Sanity check: the awards table should hold roughly 21.6K rows
SELECT
    COUNT(*) AS total_awards
FROM openalex.awards.gacr_awards;

In [0]:
%sql
-- Spot-check the key award fields on a few rows (arbitrary rows; no ordering)
SELECT
    a.id,
    a.display_name,
    a.funder_award_id,
    a.funder_scheme,
    a.funding_type,
    a.amount,
    a.currency,
    a.start_year,
    a.end_year
FROM openalex.awards.gacr_awards AS a
LIMIT 10;

In [0]:
%sql
-- Awards per funder name; a single GACR row is the expected result
SELECT
    funder.display_name,
    COUNT(*) AS cnt
FROM
    openalex.awards.gacr_awards
GROUP BY
    funder.display_name
ORDER BY
    cnt DESC;

In [0]:
%sql
-- Award count and total funding (billions of CZK) per funder scheme
SELECT
    funder_scheme,
    COUNT(*) AS cnt,
    ROUND(SUM(amount) / 1000000000, 2) AS funding_billion_czk
FROM
    openalex.awards.gacr_awards
GROUP BY
    funder_scheme
ORDER BY
    cnt DESC;

In [0]:
%sql
-- Award count and total funding (millions of CZK) for the 20 most recent start years
SELECT
    start_year,
    COUNT(*) AS cnt,
    ROUND(SUM(amount) / 1000000, 1) AS funding_millions_czk
FROM
    openalex.awards.gacr_awards
WHERE
    start_year IS NOT NULL
GROUP BY
    start_year
ORDER BY
    start_year DESC
LIMIT 20;

In [0]:
%sql
-- Data completeness: non-NULL counts per key field, plus percentages and total funding.
-- COUNT(col) counts only non-NULL values, so each has_* column is a fill count.
-- The percentage denominators are wrapped in NULLIF so an empty table yields
-- NULL percentages instead of a divide-by-zero error under ANSI mode.
SELECT
    COUNT(*) AS total,
    COUNT(display_name) AS has_title,
    COUNT(description) AS has_abstract,
    COUNT(amount) AS has_amount,
    COUNT(start_date) AS has_start_date,
    COUNT(lead_investigator) AS has_institution,
    COUNT(lead_investigator.affiliation.ids) AS has_ror,
    ROUND(COUNT(display_name) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_with_title,
    ROUND(COUNT(amount) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_with_amount,
    ROUND(SUM(amount)/1000000000, 2) AS total_funding_billion_czk
FROM openalex.awards.gacr_awards;

In [0]:
%sql
-- Top 20 lead institutions by award count, with total funding in billions of CZK
SELECT
    lead_investigator.affiliation.name AS institution,
    COUNT(*) AS cnt,
    ROUND(SUM(amount) / 1000000000, 2) AS funding_billion_czk
FROM
    openalex.awards.gacr_awards
WHERE
    lead_investigator.affiliation.name IS NOT NULL
GROUP BY
    lead_investigator.affiliation.name
ORDER BY
    cnt DESC
LIMIT 20;

In [0]:
%sql
-- Confirm the GACR rows landed in the shared raw awards table
SELECT
    COUNT(*) AS gacr_in_raw
FROM
    openalex.awards.openalex_awards_raw
WHERE
    provenance = 'isvavai_cep'
    AND priority = 34;