# Create AEI Awards from BDNS API

Creates AEI (Agencia Estatal de Investigación) awards from a parquet export of the Spanish BDNS API data stored on S3. The export contains roughly 28.8K grants.

**Prerequisites:**
- Run `scripts/local/aei_to_s3.py` to download and upload the data first.

**Data source:** BDNS API (Base de Datos Nacional de Subvenciones)  
**S3 location:** `s3a://openalex-ingest/awards/aei/aei_grants.parquet`

**AEI funder:**
- funder_id: 4320335598
- ROR: NULL
- DOI: 10.13039/501100011033
- display_name: "Agencia Estatal de Investigación"

**Data Notes:**
- BDNS only has AEI data from 2022 onwards
- Data represents individual grant concessions to beneficiary organizations
- Instrument types: SUBVENCIÓN (grant) and PRÉSTAMO (loan)

## Step 1: Create Staging Table from S3

In [None]:
%sql
-- Stage the AEI parquet export from S3 as a Delta table.
-- Every source column is carried over unchanged; databricks_ingested_at
-- records when this load ran.
CREATE OR REPLACE TABLE openalex.awards.aei_grants_raw
USING DELTA
AS
WITH s3_export AS (
    SELECT *
    FROM parquet.`s3a://openalex-ingest/awards/aei/aei_grants.parquet`
)
SELECT
    s3_export.*,
    current_timestamp() AS databricks_ingested_at
FROM s3_export;

In [None]:
%sql
-- Sanity check: the staging table is expected to hold roughly 28.8K rows
SELECT
    COUNT(*) AS total_grants
FROM openalex.awards.aei_grants_raw;

In [None]:
%sql
-- List the staging table's columns and types so the column names used
-- by the transformation below can be confirmed
DESCRIBE TABLE openalex.awards.aei_grants_raw;

In [None]:
%sql
-- Peek at five arbitrary staging rows (no ordering is applied)
SELECT
    *
FROM openalex.awards.aei_grants_raw
LIMIT 5;

## Step 2: Create AEI Awards Table

In [None]:
%sql
-- Build the AEI awards table: one output row per staged grant that has a
-- non-blank grant_code, each attributed to the single AEI funder row.
CREATE OR REPLACE TABLE openalex.awards.aei_awards
USING DELTA
AS
WITH
-- The funder row looked up by its explicit funder_id
aei_funder AS (
    SELECT
        fnd.funder_id,
        fnd.display_name AS funder_display_name,
        fnd.ror_id AS funder_ror_id,
        fnd.doi AS funder_doi
    FROM openalex.common.funder AS fnd
    WHERE fnd.funder_id = 4320335598  -- Agencia Estatal de Investigación
),

-- Staged grants with a usable grant_code, plus the organization name
-- derived from the beneficiary field (format: NIF + name)
coded_grants AS (
    SELECT
        src.grant_code,
        src.call_title,
        src.amount,
        src.instrument_type,
        src.grant_date,
        src.grant_year,
        src.legal_basis_url,
        -- Strip the leading run of uppercase letters/digits and the
        -- whitespace after it (the NIF prefix) to leave the org name
        TRIM(REGEXP_REPLACE(src.beneficiary, '^[A-Z0-9]+\\s+', '')) AS org_name
    FROM openalex.awards.aei_grants_raw AS src
    WHERE src.grant_code IS NOT NULL
      AND TRIM(src.grant_code) != ''
),

-- Pair every grant with the funder and compute the award key once, so
-- that id and works_api_url are always built from the same value
keyed_grants AS (
    SELECT
        cg.grant_code,
        cg.call_title,
        cg.amount,
        cg.instrument_type,
        cg.grant_date,
        cg.grant_year,
        cg.legal_basis_url,
        cg.org_name,
        af.funder_id,
        af.funder_display_name,
        af.funder_ror_id,
        af.funder_doi,
        -- xxhash64 of "funder_id:lowercased grant_code", folded into
        -- the range [0, 9000000000)
        abs(xxhash64(CONCAT(af.funder_id, ':', LOWER(cg.grant_code)))) % 9000000000 AS award_id
    FROM coded_grants AS cg
    CROSS JOIN aei_funder AS af
)

SELECT
    -- Identifier derived from the funder_id:grant_code hash
    k.award_id AS id,

    -- The call title serves as both display name and description
    -- (title_coofficial is not reliably populated)
    k.call_title AS display_name,
    k.call_title AS description,

    -- Funder reference and the funder's own identifier for the award
    k.funder_id,
    k.grant_code AS funder_award_id,

    -- Amount in EUR; values that cannot be cast become NULL
    TRY_CAST(k.amount AS DOUBLE) AS amount,
    'EUR' AS currency,

    -- Nested funder record
    named_struct(
        'id', CONCAT('https://openalex.org/F', k.funder_id),
        'display_name', k.funder_display_name,
        'ror_id', k.funder_ror_id,
        'doi', k.funder_doi
    ) AS funder,

    -- Funding type from the instrument type: PRÉSTAMO maps to loan,
    -- everything else (including NULL) maps to grant
    CASE
        WHEN k.instrument_type LIKE '%PRÉSTAMO%' THEN 'loan'
        WHEN k.instrument_type LIKE '%SUBVENCI%' THEN 'grant'
        ELSE 'grant'
    END AS funding_type,

    -- The raw instrument type is kept as the funder scheme
    k.instrument_type AS funder_scheme,

    -- Source tag for rows produced by this notebook
    'bdns_aei' AS provenance,

    -- grant_date is the concession date; no end date is populated
    TRY_TO_DATE(k.grant_date, 'yyyy-MM-dd') AS start_date,
    CAST(NULL AS DATE) AS end_date,
    TRY_CAST(k.grant_year AS INT) AS start_year,
    CAST(NULL AS INT) AS end_year,

    -- Lead investigator: no person fields are populated; only the
    -- beneficiary organization is recorded, as the affiliation
    named_struct(
        'given_name', CAST(NULL AS STRING),
        'family_name', CAST(NULL AS STRING),
        'orcid', CAST(NULL AS STRING),
        'role_start', CAST(NULL AS DATE),
        'affiliation', named_struct(
            'name', k.org_name,
            'country', 'ES',
            'ids', CAST(NULL AS ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>)
        )
    ) AS lead_investigator,

    -- Co-lead and other investigators are not available in BDNS;
    -- typed NULLs keep the column types explicit
    CAST(NULL AS STRUCT<
        given_name:STRING,
        family_name:STRING,
        orcid:STRING,
        role_start:DATE,
        affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
    >) AS co_lead_investigator,

    CAST(NULL AS ARRAY<STRUCT<
        given_name:STRING,
        family_name:STRING,
        orcid:STRING,
        role_start:DATE,
        affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
    >>) AS investigators,

    -- Landing page is the legal basis URL
    k.legal_basis_url AS landing_page_url,

    -- BDNS grants carry no DOI
    CAST(NULL AS STRING) AS doi,

    -- Works API filter URL built from the same award key as id
    concat('https://api.openalex.org/works?filter=awards.id:G', k.award_id) AS works_api_url,

    -- Load timestamps
    current_timestamp() AS created_date,
    current_timestamp() AS updated_date

FROM keyed_grants AS k;

In [None]:
%sql
-- Refresh, step 1: drop the rows this source loaded previously
-- (matched on provenance and priority) so a re-run does not duplicate them
DELETE FROM openalex.awards.openalex_awards_raw
WHERE priority = 31
  AND provenance = 'bdns_aei';

-- Refresh, step 2: append the freshly built AEI awards with priority 31.
-- NOTE(review): this INSERT has no column list, so values are matched to
-- the target table by position; confirm the order below matches the
-- column order of openalex_awards_raw.
INSERT INTO openalex.awards.openalex_awards_raw
SELECT
    aw.id,
    aw.display_name,
    aw.description,
    aw.funder_id,
    aw.funder_award_id,
    aw.amount,
    aw.currency,
    aw.funder,
    aw.funding_type,
    aw.funder_scheme,
    aw.provenance,
    aw.start_date,
    aw.end_date,
    aw.start_year,
    aw.end_year,
    aw.lead_investigator,
    aw.co_lead_investigator,
    aw.investigators,
    aw.landing_page_url,
    aw.doi,
    aw.works_api_url,
    aw.created_date,
    aw.updated_date,
    31 AS priority
FROM openalex.awards.aei_awards AS aw;

## Verification Queries

In [None]:
%sql
-- Sanity check: the awards table is expected to hold roughly 28.8K rows
SELECT
    COUNT(*) AS total_aei_awards
FROM openalex.awards.aei_awards;

In [None]:
%sql
-- Peek at ten arbitrary awards, with the title truncated to 80 characters
-- and the beneficiary organization pulled out of the nested struct
SELECT
    aw.id,
    SUBSTRING(aw.display_name, 1, 80) AS display_name,
    aw.funder_award_id,
    aw.amount,
    aw.currency,
    aw.funding_type,
    aw.start_date,
    aw.lead_investigator.affiliation.name AS org_name
FROM openalex.awards.aei_awards AS aw
LIMIT 10;

In [None]:
%sql
-- Count awards per funder display name; only AEI is expected to appear
SELECT
    aw.funder.display_name AS display_name,
    COUNT(*) AS cnt
FROM openalex.awards.aei_awards AS aw
GROUP BY aw.funder.display_name
ORDER BY cnt DESC;

In [None]:
%sql
-- Award count and total amount (rounded to whole units) per funding type
SELECT
    aw.funding_type,
    COUNT(*) AS cnt,
    ROUND(SUM(aw.amount), 0) AS total_funding
FROM openalex.awards.aei_awards AS aw
GROUP BY aw.funding_type
ORDER BY cnt DESC;

In [None]:
%sql
-- Completeness overview: how many awards have a title, an amount and a
-- start date, plus total and average funding in EUR
SELECT
    COUNT(*) AS total,
    COUNT(aw.display_name) AS has_title,
    COUNT(aw.amount) AS has_amount,
    COUNT(aw.start_date) AS has_start_date,
    ROUND(SUM(aw.amount), 0) AS total_funding_eur,
    ROUND(AVG(aw.amount), 0) AS avg_funding_eur,
    -- Share of awards with a non-NULL title, as a percentage
    ROUND(COUNT(aw.display_name) * 100.0 / COUNT(*), 1) AS pct_with_title
FROM openalex.awards.aei_awards AS aw;

In [None]:
%sql
-- Award count and total amount per start year, newest year first;
-- awards without a start year are left out
SELECT
    aw.start_year,
    COUNT(*) AS cnt,
    ROUND(SUM(aw.amount), 0) AS total_funding
FROM openalex.awards.aei_awards AS aw
WHERE aw.start_year IS NOT NULL
GROUP BY aw.start_year
ORDER BY aw.start_year DESC;

In [None]:
%sql
-- The 20 beneficiary organizations with the highest total funding
WITH awards_by_org AS (
    -- Pull the organization name out of the nested struct once and
    -- discard awards that have none
    SELECT
        aw.lead_investigator.affiliation.name AS org_name,
        aw.amount
    FROM openalex.awards.aei_awards AS aw
    WHERE aw.lead_investigator.affiliation.name IS NOT NULL
)
SELECT
    org_name,
    COUNT(*) AS cnt,
    ROUND(SUM(amount), 0) AS total_funding
FROM awards_by_org
GROUP BY org_name
ORDER BY total_funding DESC
LIMIT 20;