# Create ANID/FONDECYT (Chile) Awards

Creates Chilean research grant awards from ANID's historical projects database. ~47K projects from 1982-2025.

**Prerequisites:**
- Run `scripts/local/anid_to_s3.py` first to download the source data and upload it to the S3 location below.

**Data source:** https://github.com/ANID-GITHUB/Historico-de-Proyectos-Adjudicados  
**S3 location:** `s3a://openalex-ingest/awards/anid/anid_projects.parquet`

**ANID funder:**
- funder_id: 4320331146
- DOI: 10.13039/501100020884
- display_name: "Agencia Nacional de Investigación y Desarrollo"

**FONDECYT funder (main program):**
- funder_id: 4320338073
- ROR: https://ror.org/02ap3w078
- DOI: 10.13039/501100002850
- display_name: "Fondo Nacional de Desarrollo Científico y Tecnológico"

**Data notes:**
- Years 1982-2025
- Currency: CLP (Chilean Pesos)
- 19 distinct programs, 1,659 institutions
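- FONDECYT is the main research grant program within ANID; this notebook attributes every award to the ANID parent funder (funder_id 4320331146), and the program name is kept in `funder_scheme`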

## Step 1: Create Staging Table from S3

In [None]:
%sql
-- Stage the ANID projects parquet file from S3 as a Delta table.
-- Every source column is kept as-is; an ingestion timestamp is appended last.
CREATE OR REPLACE TABLE openalex.awards.anid_raw
USING DELTA AS
SELECT
    src.*,
    CURRENT_TIMESTAMP() AS databricks_ingested_at
FROM parquet.`s3a://openalex-ingest/awards/anid/anid_projects.parquet` AS src;

In [None]:
%sql
-- Sanity check: the staging table is expected to hold roughly 47K projects
SELECT
    COUNT(*) AS total_projects
FROM openalex.awards.anid_raw;

In [None]:
%sql
-- List the staging table's columns and their types
DESCRIBE TABLE openalex.awards.anid_raw;

In [None]:
%sql
-- Peek at a handful of staged rows (no ordering, so the rows shown are arbitrary)
SELECT
    staged.*
FROM openalex.awards.anid_raw AS staged
LIMIT 5;

## Step 2: Create ANID Awards Table

In [None]:
%sql
-- Build openalex.awards.anid_awards: one row per staged ANID project that has
-- a project_code, reshaped into the awards layout that Step 3 loads into
-- openalex_awards_raw. All awards are attributed to the ANID funder record.
CREATE OR REPLACE TABLE openalex.awards.anid_awards
USING delta
AS
WITH
-- ANID funder record from OpenAlex (the parent org is used for every award).
-- This CTE is expected to return exactly one row; if the funder is missing,
-- the CROSS JOIN below yields zero awards.
anid_funder AS (
    SELECT
        funder_id,
        display_name,
        ror_id,
        doi
    FROM openalex.common.funder
    WHERE funder_id = 4320331146  -- ANID
),

-- Attach the funder to every project and compute, exactly once, the values
-- that several output columns depend on (award id, integer years).
raw_keyed AS (
    SELECT
        -- Award id: xxhash64 of "funder_id:lower(project_code)", folded into
        -- the range [0, 9000000000). Computed once so that `id` and
        -- `works_api_url` can never drift apart.
        -- NOTE(review): duplicate project_code values in the source would
        -- produce duplicate ids - confirm project_code is unique.
        abs(xxhash64(CONCAT(f.funder_id, ':', LOWER(r.project_code)))) % 9000000000 AS award_id,

        -- Years parsed once; anything that is not a valid integer becomes NULL
        TRY_CAST(r.start_year AS INT) AS start_year_int,
        TRY_CAST(r.end_year AS INT) AS end_year_int,

        f.funder_id,
        f.display_name AS funder_display_name,
        f.ror_id AS funder_ror_id,
        f.doi AS funder_doi,

        r.project_code,
        r.title,
        r.amount,
        r.program,
        r.instrument,
        r.pi_name,
        r.pi_given_name,
        r.pi_family_name,
        r.institution,
        r.source_url
    FROM openalex.awards.anid_raw AS r
    CROSS JOIN anid_funder AS f
    WHERE r.project_code IS NOT NULL
),

awards_transformed AS (
    SELECT
        k.award_id AS id,

        -- Display name = project title (blank titles become NULL)
        NULLIF(TRIM(k.title), '') AS display_name,

        -- No description/abstract available in this dataset
        CAST(NULL AS STRING) AS description,

        -- Funder info
        k.funder_id,
        k.project_code AS funder_award_id,

        -- Amount in CLP
        k.amount,
        'CLP' AS currency,

        -- Funder struct
        struct(
            CONCAT('https://openalex.org/F', k.funder_id) AS id,
            k.funder_display_name AS display_name,
            k.funder_ror_id AS ror_id,
            k.funder_doi AS doi
        ) AS funder,

        -- Funding type - mapped from keywords in the program name;
        -- anything unmatched (including a NULL program) is 'research'
        CASE
            WHEN UPPER(k.program) LIKE '%POSTDOC%' THEN 'fellowship'
            WHEN UPPER(k.program) LIKE '%DOCTORADO%' THEN 'fellowship'
            WHEN UPPER(k.program) LIKE '%BECAS%' THEN 'fellowship'
            WHEN UPPER(k.program) LIKE '%EQUIPAMIENTO%' THEN 'equipment'
            WHEN UPPER(k.program) LIKE '%INFRAESTRUCTURA%' THEN 'infrastructure'
            ELSE 'research'
        END AS funding_type,

        -- Funder scheme = "program - instrument". When only one of the two is
        -- present, use it on its own so a missing program does not leave a
        -- dangling ' - ' prefix.
        CASE
            WHEN k.program IS NOT NULL AND k.instrument IS NOT NULL
                THEN CONCAT(k.program, ' - ', k.instrument)
            ELSE COALESCE(k.instrument, k.program)
        END AS funder_scheme,

        -- Provenance
        'anid_github' AS provenance,

        -- Dates - the source only has years, so use Jan 1 / Dec 31 of that year.
        -- Built from the parsed integer year (not the raw value) so the dates
        -- stay consistent with start_year/end_year even if the raw column is
        -- stored as a non-integer type or padded string.
        TRY_TO_DATE(CONCAT(CAST(k.start_year_int AS STRING), '-01-01'), 'yyyy-MM-dd') AS start_date,
        TRY_TO_DATE(CONCAT(CAST(k.end_year_int AS STRING), '-12-31'), 'yyyy-MM-dd') AS end_date,
        k.start_year_int AS start_year,
        k.end_year_int AS end_year,

        -- Lead investigator from PI name and institution.
        -- Three cases: PI present (institution optional), institution only,
        -- or neither (NULL struct).
        CASE
            WHEN k.pi_name IS NOT NULL THEN
                struct(
                    k.pi_given_name AS given_name,
                    k.pi_family_name AS family_name,
                    CAST(NULL AS STRING) AS orcid,
                    CAST(NULL AS DATE) AS role_start,
                    CASE
                        WHEN k.institution IS NOT NULL THEN
                            struct(
                                k.institution AS name,
                                'Chile' AS country,
                                CAST(NULL AS ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>) AS ids
                            )
                        ELSE CAST(NULL AS STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>)
                    END AS affiliation
                )
            WHEN k.institution IS NOT NULL THEN
                struct(
                    CAST(NULL AS STRING) AS given_name,
                    CAST(NULL AS STRING) AS family_name,
                    CAST(NULL AS STRING) AS orcid,
                    CAST(NULL AS DATE) AS role_start,
                    struct(
                        k.institution AS name,
                        'Chile' AS country,
                        CAST(NULL AS ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>) AS ids
                    ) AS affiliation
                )
            ELSE NULL
        END AS lead_investigator,

        -- Co-lead and other investigators (not available in this dataset);
        -- typed NULLs keep the column types aligned with lead_investigator
        CAST(NULL AS STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >) AS co_lead_investigator,

        CAST(NULL AS ARRAY<STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >>) AS investigators,

        -- Landing page URL (source repository)
        k.source_url AS landing_page_url,

        -- No DOI for ANID grants
        CAST(NULL AS STRING) AS doi,

        -- Works API URL, keyed by the same award id as the `id` column
        CONCAT('https://api.openalex.org/works?filter=awards.id:G', k.award_id) AS works_api_url,

        -- Timestamps
        current_timestamp() AS created_date,
        current_timestamp() AS updated_date

    FROM raw_keyed AS k
)

SELECT * FROM awards_transformed;

## Step 3: Insert into openalex_awards_raw

In [None]:
%sql
-- Refresh this source's rows in openalex_awards_raw in a single atomic step.
-- REPLACE WHERE deletes the existing rows that match the predicate and inserts
-- the new ones in one Delta transaction, so a failure cannot leave the table
-- with the ANID rows deleted but not re-inserted (which a separate
-- DELETE followed by INSERT could).
-- Every inserted row matches the predicate: provenance is always
-- 'anid_github' in anid_awards and priority is the literal 35 below.
-- Columns are matched to the target table by position, so the SELECT order
-- must follow the target table's column order.
INSERT INTO openalex.awards.openalex_awards_raw
REPLACE WHERE provenance = 'anid_github' AND priority = 35
SELECT
    id,
    display_name,
    description,
    funder_id,
    funder_award_id,
    amount,
    currency,
    funder,
    funding_type,
    funder_scheme,
    provenance,
    start_date,
    end_date,
    start_year,
    end_year,
    lead_investigator,
    co_lead_investigator,
    investigators,
    landing_page_url,
    doi,
    works_api_url,
    created_date,
    updated_date,
    35 AS priority
FROM openalex.awards.anid_awards;

## Verification Queries

In [None]:
%sql
-- Sanity check: the awards table is expected to hold roughly 47K rows
SELECT
    COUNT(*) AS total_awards
FROM openalex.awards.anid_awards;

In [None]:
%sql
-- Eyeball a few transformed awards, including the PI name fields pulled out
-- of the lead_investigator struct (no ordering, so the rows shown are arbitrary)
SELECT
    aw.id,
    aw.display_name,
    aw.funder_award_id,
    aw.funder_scheme,
    aw.funding_type,
    aw.amount,
    aw.currency,
    aw.start_year,
    aw.end_year,
    aw.lead_investigator.given_name,
    aw.lead_investigator.family_name
FROM openalex.awards.anid_awards AS aw
LIMIT 10;

In [None]:
%sql
-- Awards per funder name; every row is expected to belong to ANID
WITH funder_names AS (
    SELECT
        funder.display_name AS display_name
    FROM openalex.awards.anid_awards
)
SELECT
    display_name,
    COUNT(*) AS cnt
FROM funder_names
GROUP BY display_name
ORDER BY cnt DESC;

In [None]:
%sql
-- Top 20 funder schemes by award count, with total funding in billions of CLP
WITH scheme_totals AS (
    SELECT
        funder_scheme,
        COUNT(*) AS cnt,
        SUM(amount) AS total_clp
    FROM openalex.awards.anid_awards
    GROUP BY funder_scheme
)
SELECT
    funder_scheme,
    cnt,
    ROUND(total_clp / 1000000000, 2) AS funding_billion_clp
FROM scheme_totals
ORDER BY cnt DESC
LIMIT 20;

In [None]:
%sql
-- Award counts and funding (millions of CLP) for the 20 most recent start years
WITH year_totals AS (
    SELECT
        start_year,
        COUNT(*) AS cnt,
        SUM(amount) AS total_clp
    FROM openalex.awards.anid_awards
    WHERE start_year IS NOT NULL
    GROUP BY start_year
)
SELECT
    start_year,
    cnt,
    ROUND(total_clp / 1000000, 1) AS funding_millions_clp
FROM year_totals
ORDER BY start_year DESC
LIMIT 20;

In [None]:
%sql
-- Field completeness: non-NULL counts per key column, coverage percentages
-- for title and amount, and total funding in billions of CLP
WITH field_counts AS (
    SELECT
        COUNT(*) AS total,
        COUNT(display_name) AS has_title,
        COUNT(description) AS has_abstract,
        COUNT(amount) AS has_amount,
        COUNT(start_date) AS has_start_date,
        COUNT(lead_investigator) AS has_pi,
        COUNT(lead_investigator.affiliation) AS has_institution,
        SUM(amount) AS total_clp
    FROM openalex.awards.anid_awards
)
SELECT
    total,
    has_title,
    has_abstract,
    has_amount,
    has_start_date,
    has_pi,
    has_institution,
    ROUND(has_title * 100.0 / total, 1) AS pct_with_title,
    ROUND(has_amount * 100.0 / total, 1) AS pct_with_amount,
    ROUND(total_clp / 1000000000, 2) AS total_funding_billion_clp
FROM field_counts;

In [None]:
%sql
-- Top 20 lead-investigator institutions by award count,
-- with total funding in billions of CLP
WITH award_institutions AS (
    SELECT
        lead_investigator.affiliation.name AS institution,
        amount
    FROM openalex.awards.anid_awards
)
SELECT
    institution,
    COUNT(*) AS cnt,
    ROUND(SUM(amount) / 1000000000, 2) AS funding_billion_clp
FROM award_institutions
WHERE institution IS NOT NULL
GROUP BY institution
ORDER BY cnt DESC
LIMIT 20;

In [None]:
%sql
-- Confirm the ANID rows landed in openalex_awards_raw
-- (same provenance/priority pair that the load step writes)
SELECT
    COUNT(*) AS anid_in_raw
FROM openalex.awards.openalex_awards_raw AS oar
WHERE oar.priority = 35
  AND oar.provenance = 'anid_github';