# Create BMBF Awards from Förderkatalog

Creates BMBF awards from the German Förderkatalog (Federal Research Funding Catalog). ~46K projects.

**Prerequisites:**
- Run `scripts/local/bmbf_to_s3.py` to download and upload the data first.

**Data source:** https://foerderportal.bund.de/foekat/  
**S3 location:** `s3a://openalex-ingest/awards/bmbf/bmbf_projects.parquet`

**BMBF funder in OpenAlex:**
- funder_id: 4320321114
- display_name: "Bundesministerium für Bildung und Forschung"
- ror_id: "https://ror.org/04pz7b180"
- doi: "10.13039/501100002347"

**Note:** The Förderkatalog contains projects from multiple German federal ministries (BMFTR/BMBF, BMWK, BMDV, BMUV). All of them are included here and attributed to the single BMBF funder record above, since the Förderkatalog is maintained by BMBF. The `ressort` column in the raw table (`openalex.awards.bmbf_raw`) identifies the specific ministry for each project.

**Input columns from bmbf_to_s3.py:**
- fkz -> funder_award_id (Förderkennzeichen = grant reference number)
- title -> display_name
- amount (DOUBLE, in EUR)
- currency (STRING, always EUR)
- ressort (STRING, ministry name)
- referat (STRING, department)
- projekttraeger (STRING, project management agency)
- start_date (STRING, YYYY-MM-DD)
- end_date (STRING, YYYY-MM-DD)
- leistungsplan (STRING, subject area / performance plan)
- foerderart (STRING, funding type)
- foerderprofil (STRING, funding program/profile)
- verbund (STRING, consortium name if applicable)
- zuwendungsempfaenger (STRING, grant recipient institution)
- ort (STRING, institution city)
- bundesland (STRING, federal state)
- staat (STRING, country)
- ausfuehrende_stelle (STRING, executing organization)
- ausfuehrende_stelle_ort (STRING, executing org city)
- landing_page_url (STRING, link to Förderkatalog detail page)
- ingested_at (STRING, timestamp of download)

## Step 1: Create Staging Table from S3

In [None]:
%sql
-- Staging table: every column of the S3 parquet file as-is, plus the time
-- this notebook loaded it into Databricks
CREATE OR REPLACE TABLE openalex.awards.bmbf_raw
USING delta
AS
SELECT
    src.*,
    current_timestamp() AS databricks_ingested_at
FROM parquet.`s3a://openalex-ingest/awards/bmbf/bmbf_projects.parquet` AS src;

In [None]:
%sql
-- Sanity check on the staging load: expect roughly 46K projects
SELECT
    COUNT(1) AS total_projects
FROM openalex.awards.bmbf_raw;

In [None]:
%sql
-- List the staging table's columns and types to confirm the expected schema
DESCRIBE TABLE openalex.awards.bmbf_raw;

In [None]:
%sql
-- Eyeball a handful of staged rows (no ordering, so the rows shown are arbitrary)
SELECT
    r.*
FROM openalex.awards.bmbf_raw AS r
LIMIT 5;

In [None]:
%sql
-- Number of projects per ministry (ressort), largest first
SELECT
    ressort,
    COUNT(1) AS cnt
FROM openalex.awards.bmbf_raw
GROUP BY
    ressort
ORDER BY
    cnt DESC;

## Step 2: Create BMBF Awards Table

In [None]:
%sql
-- Builds openalex.awards.bmbf_awards: one award row per staged Förderkatalog
-- project that has a non-blank fkz, each attached to the BMBF funder record.
CREATE OR REPLACE TABLE openalex.awards.bmbf_awards
USING delta
AS
WITH
-- The OpenAlex funder row selected by its explicit funder_id
bmbf_funder AS (
    SELECT
        funder_id,
        display_name,
        ror_id,
        doi
    FROM openalex.common.funder
    WHERE funder_id = 4320321114  -- Bundesministerium für Bildung und Forschung
),

-- Staged projects with a usable fkz, paired with the funder row. Values that
-- feed more than one output column (award id, parsed dates) are computed here
-- once and reused below.
keyed_projects AS (
    SELECT
        -- Award id: xxhash64 of "<funder_id>:<lowercased fkz>", folded into [0, 9000000000)
        abs(xxhash64(CONCAT(f.funder_id, ':', LOWER(b.fkz)))) % 9000000000 AS award_id,
        f.funder_id AS funder_id,
        f.display_name AS funder_display_name,
        f.ror_id AS funder_ror_id,
        f.doi AS funder_doi,
        b.fkz AS fkz,
        b.title AS title,
        b.amount AS raw_amount,
        b.currency AS raw_currency,
        LOWER(b.foerderart) AS foerderart_lc,
        b.foerderprofil AS foerderprofil,
        -- Dates arrive as YYYY-MM-DD strings from bmbf_to_s3.py
        TRY_TO_DATE(b.start_date, 'yyyy-MM-dd') AS parsed_start,
        TRY_TO_DATE(b.end_date, 'yyyy-MM-dd') AS parsed_end,
        b.zuwendungsempfaenger AS recipient,
        b.staat AS staat,
        b.landing_page_url AS landing_page_url
    FROM openalex.awards.bmbf_raw AS b
    CROSS JOIN bmbf_funder AS f
    -- Keep only rows whose fkz is neither NULL nor blank
    WHERE NULLIF(TRIM(CAST(b.fkz AS STRING)), '') IS NOT NULL
)

SELECT
    p.award_id AS id,

    -- Project title doubles as the award's display name
    p.title AS display_name,

    -- The Förderkatalog data carries no description text
    CAST(NULL AS STRING) AS description,

    -- Funder reference and the funder's own grant number
    p.funder_id AS funder_id,
    p.fkz AS funder_award_id,

    -- Amount in EUR; currency defaults to EUR when missing
    TRY_CAST(p.raw_amount AS DOUBLE) AS amount,
    COALESCE(p.raw_currency, 'EUR') AS currency,

    -- Embedded funder object
    named_struct(
        'id', CONCAT('https://openalex.org/F', p.funder_id),
        'display_name', p.funder_display_name,
        'ror_id', p.funder_ror_id,
        'doi', p.funder_doi
    ) AS funder,

    -- Funding type derived from the German foerderart text; first match wins.
    -- zuwendung / zuschuss / darlehen are spelled out, but like every
    -- unmatched or NULL value they end up as 'grant'.
    CASE
        WHEN p.foerderart_lc LIKE '%stipend%' THEN 'fellowship'
        WHEN p.foerderart_lc LIKE '%ausbildung%' THEN 'training'
        WHEN p.foerderart_lc LIKE '%infrastruktur%' THEN 'infrastructure'
        WHEN p.foerderart_lc LIKE '%zuwendung%'
            OR p.foerderart_lc LIKE '%zuschuss%'
            OR p.foerderart_lc LIKE '%darlehen%' THEN 'grant'
        ELSE 'grant'
    END AS funding_type,

    -- Funding program/profile
    p.foerderprofil AS funder_scheme,

    -- Source tag for rows produced by this notebook
    'foerderkatalog' AS provenance,

    -- Parsed dates and their years
    p.parsed_start AS start_date,
    p.parsed_end AS end_date,
    YEAR(p.parsed_start) AS start_year,
    YEAR(p.parsed_end) AS end_year,

    -- Lead investigator: only the recipient institution is filled in, because
    -- the Förderkatalog has institution data but no individual PI names.
    -- NULL when the recipient is missing or blank.
    CASE
        WHEN NULLIF(TRIM(p.recipient), '') IS NOT NULL THEN
            named_struct(
                'given_name', CAST(NULL AS STRING),
                'family_name', CAST(NULL AS STRING),
                'orcid', CAST(NULL AS STRING),
                'role_start', CAST(NULL AS DATE),
                'affiliation', named_struct(
                    'name', p.recipient,
                    'country', COALESCE(p.staat, 'Germany'),
                    'ids', CAST(NULL AS ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>)
                )
            )
    END AS lead_investigator,

    -- Co-lead and further investigators are not available in the Förderkatalog;
    -- typed NULLs keep the column types explicit
    CAST(NULL AS STRUCT<
        given_name:STRING,
        family_name:STRING,
        orcid:STRING,
        role_start:DATE,
        affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
    >) AS co_lead_investigator,

    CAST(NULL AS ARRAY<STRUCT<
        given_name:STRING,
        family_name:STRING,
        orcid:STRING,
        role_start:DATE,
        affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
    >>) AS investigators,

    -- Link to the Förderkatalog detail page
    p.landing_page_url AS landing_page_url,

    -- Förderkatalog grants have no DOI
    CAST(NULL AS STRING) AS doi,

    -- Works API filter URL keyed by the same award id
    concat('https://api.openalex.org/works?filter=awards.id:G', p.award_id) AS works_api_url,

    -- Both timestamps are set to the time of this run
    current_timestamp() AS created_date,
    current_timestamp() AS updated_date

FROM keyed_projects AS p;

In [None]:
%sql
-- Refresh this source's slice of openalex_awards_raw (provenance
-- 'foerderkatalog', priority 36) in ONE statement.
-- REPLACE WHERE deletes the rows matching the predicate and inserts the new
-- rows as a single operation, so a failed insert can no longer leave the
-- table with the previous foerderkatalog rows already deleted (which the
-- separate DELETE + INSERT allowed).
-- Every inserted row satisfies the predicate: bmbf_awards.provenance is always
-- 'foerderkatalog' and priority is the literal 36 below.
-- NOTE(review): assumes openalex_awards_raw is a Delta table — confirm.
-- NOTE(review): columns are matched by position, so this select list must
-- mirror the target table's column order — verify against its schema.
INSERT INTO openalex.awards.openalex_awards_raw
REPLACE WHERE provenance = 'foerderkatalog' AND priority = 36
SELECT
    id,
    display_name,
    description,
    funder_id,
    funder_award_id,
    amount,
    currency,
    funder,
    funding_type,
    funder_scheme,
    provenance,
    start_date,
    end_date,
    start_year,
    end_year,
    lead_investigator,
    co_lead_investigator,
    investigators,
    landing_page_url,
    doi,
    works_api_url,
    created_date,
    updated_date,
    36 AS priority
FROM openalex.awards.bmbf_awards;

## Verification Queries

In [None]:
%sql
-- Sanity check on the awards table: expect roughly 46K rows
SELECT
    COUNT(1) AS total_bmbf_awards
FROM openalex.awards.bmbf_awards;

In [None]:
%sql
-- Preview the key fields of a few awards (no ordering, so the rows shown are arbitrary)
SELECT
    a.id,
    a.display_name,
    a.funder_award_id,
    a.funder_scheme,
    a.funding_type,
    a.amount,
    a.start_date,
    a.end_date,
    a.lead_investigator
FROM openalex.awards.bmbf_awards AS a
LIMIT 10;

In [None]:
%sql
-- Awards per embedded funder name; a single BMBF row is expected
SELECT
    funder.display_name AS display_name,
    COUNT(1) AS cnt
FROM openalex.awards.bmbf_awards
GROUP BY
    funder.display_name
ORDER BY
    cnt DESC;

In [None]:
%sql
-- Awards per mapped funding_type, most common first
SELECT
    funding_type,
    COUNT(1) AS cnt
FROM openalex.awards.bmbf_awards
GROUP BY
    funding_type
ORDER BY
    cnt DESC;

In [None]:
%sql
-- The 20 most frequent funding profiles (funder_scheme), ignoring rows without one
SELECT
    funder_scheme,
    COUNT(1) AS cnt
FROM openalex.awards.bmbf_awards
WHERE funder_scheme IS NOT NULL
GROUP BY
    funder_scheme
ORDER BY
    cnt DESC
LIMIT 20;

In [None]:
%sql
-- Data completeness: total rows, how many have each field populated, and the
-- share of rows with an institution / an amount.
-- COUNT(col) counts only non-NULL values, so each has_* figure is the number
-- of rows where that field is present.
-- NULLIF turns a zero row count into NULL, so for an empty table the
-- percentages come back NULL instead of failing with a divide-by-zero error
-- (which is raised when ANSI mode is enabled).
SELECT
    COUNT(*) AS total,
    COUNT(display_name) AS has_title,
    COUNT(description) AS has_description,
    COUNT(amount) AS has_amount,
    COUNT(start_date) AS has_start_date,
    COUNT(end_date) AS has_end_date,
    COUNT(lead_investigator) AS has_institution,
    ROUND(COUNT(lead_investigator) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_with_institution,
    ROUND(COUNT(amount) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_with_amount
FROM openalex.awards.bmbf_awards;

In [None]:
%sql
-- Awards per start year for the 20 most recent years that have a parsed start date
SELECT
    start_year,
    COUNT(1) AS cnt
FROM openalex.awards.bmbf_awards
WHERE start_year IS NOT NULL
GROUP BY
    start_year
ORDER BY
    start_year DESC
LIMIT 20;

In [None]:
%sql
-- The 20 recipient institutions with the most grants
WITH award_institutions AS (
    -- One row per award with the lead investigator's affiliation name
    SELECT
        lead_investigator.affiliation.name AS institution
    FROM openalex.awards.bmbf_awards
)
SELECT
    institution,
    COUNT(1) AS grant_count
FROM award_institutions
WHERE institution IS NOT NULL
GROUP BY
    institution
ORDER BY
    grant_count DESC
LIMIT 20;