# Create DOE Awards from USAspending

Creates Department of Energy awards from the USAspending.gov bulk download data. ~45K unique awards.

**Prerequisites:**
- Run `scripts/local/doe_to_s3.py` to download and upload the data first.

**Data source:** https://www.usaspending.gov/  
**API docs:** https://api.usaspending.gov/  
**S3 location:** `s3a://openalex-ingest/awards/doe/doe_awards.parquet`

**DOE funder:**
- funder_id: 4320306084
- ROR: https://ror.org/01bj3aw27
- DOI: 10.13039/100000015
- display_name: "U.S. Department of Energy"

**Award types (USAspending assistance type codes):**
- 02: Block Grant
- 03: Formula Grant
- 04: Project Grant
- 05: Cooperative Agreement

**Note:** FY2019-2025 data unavailable due to USAspending API issues.

## Step 1: Create Staging Table from S3

In [None]:
%sql
-- Stage the DOE parquet export from S3 as a Delta table, keeping every source
-- column and stamping each row with the time it was loaded into Databricks.
CREATE OR REPLACE TABLE openalex.awards.doe_awards_raw
USING delta
AS
SELECT
    src.*,
    current_timestamp() AS databricks_ingested_at
FROM parquet.`s3a://openalex-ingest/awards/doe/doe_awards.parquet` AS src;

In [None]:
%sql
-- Sanity check on the staging load: roughly 45K rows are expected.
SELECT COUNT(1) AS total_awards
FROM openalex.awards.doe_awards_raw;

In [None]:
%sql
-- Peek at a handful of staged rows (arbitrary rows: no ordering is applied).
SELECT
    src.award_id_fain,
    src.prime_award_base_transaction_description,
    src.total_obligated_amount,
    src.period_of_performance_start_date,
    src.period_of_performance_current_end_date,
    src.recipient_name,
    src.cfda_title,
    src.usaspending_permalink
FROM openalex.awards.doe_awards_raw AS src
LIMIT 5;

## Step 2: Create DOE Awards Table

In [None]:
%sql
-- Build the DOE awards table from the staged USAspending rows.
-- Rows without a usable FAIN (NULL or blank) are dropped; every remaining row
-- is paired with the DOE funder record and reshaped into award columns.
-- NOTE(review): assumes funder_id 4320306084 matches exactly one funder row and
-- that the staging table holds one row per award_id_fain - confirm.
CREATE OR REPLACE TABLE openalex.awards.doe_awards
USING delta
AS
WITH
-- DOE funder record, looked up in OpenAlex by funder_id
doe_funder AS (
    SELECT
        fn.funder_id,
        fn.display_name,
        fn.ror_id,
        fn.doi
    FROM openalex.common.funder AS fn
    WHERE fn.funder_id = 4320306084  -- U.S. Department of Energy
),

-- Staged rows with a non-blank FAIN, joined to the funder, with the values
-- that are needed more than once (award key, title, parsed dates) derived once
awards_keyed AS (
    SELECT
        -- Award key: xxhash64 of "<funder_id>:<lowercased FAIN>", made
        -- non-negative with abs() and reduced modulo 9,000,000,000
        abs(xxhash64(CONCAT(fn.funder_id, ':', LOWER(src.award_id_fain)))) % 9000000000 AS award_key,

        -- Title text: base transaction description, else transaction description
        COALESCE(src.prime_award_base_transaction_description, src.transaction_description) AS award_title,

        -- Period-of-performance dates; values not in yyyy-MM-dd form become NULL
        TRY_TO_DATE(src.period_of_performance_start_date, 'yyyy-MM-dd') AS performance_start,
        TRY_TO_DATE(src.period_of_performance_current_end_date, 'yyyy-MM-dd') AS performance_end,

        -- Funder fields, renamed so they cannot collide with award columns
        fn.funder_id AS doe_funder_id,
        fn.display_name AS doe_display_name,
        fn.ror_id AS doe_ror_id,
        fn.doi AS doe_doi,

        -- Staged fields carried through unchanged
        src.award_id_fain,
        src.total_obligated_amount,
        src.assistance_type_code,
        src.cfda_title,
        src.usaspending_permalink,
        src.recipient_name,
        src.recipient_state_name,
        src.recipient_country_name
    FROM openalex.awards.doe_awards_raw AS src
    CROSS JOIN doe_funder AS fn
    WHERE src.award_id_fain IS NOT NULL
      AND TRIM(src.award_id_fain) <> ''
)

SELECT
    k.award_key AS id,

    -- USAspending has no separate abstract, so title and description share text
    k.award_title AS display_name,
    k.award_title AS description,

    -- Funder info
    k.doe_funder_id AS funder_id,
    k.award_id_fain AS funder_award_id,

    -- Amount (in USD)
    CAST(k.total_obligated_amount AS DOUBLE) AS amount,
    'USD' AS currency,

    -- Funder struct
    struct(
        CONCAT('https://openalex.org/F', k.doe_funder_id) AS id,
        k.doe_display_name AS display_name,
        k.doe_ror_id AS ror_id,
        k.doe_doi AS doi
    ) AS funder,

    -- Funding type by assistance_type_code; every code currently maps to 'grant'
    CASE k.assistance_type_code
        WHEN '02' THEN 'grant'  -- Block Grant
        WHEN '03' THEN 'grant'  -- Formula Grant
        WHEN '04' THEN 'grant'  -- Project Grant
        WHEN '05' THEN 'grant'  -- Cooperative Agreement
        ELSE 'grant'
    END AS funding_type,

    -- Funder scheme = CFDA program title
    k.cfda_title AS funder_scheme,

    -- Provenance
    'usaspending' AS provenance,

    -- Dates
    k.performance_start AS start_date,
    k.performance_end AS end_date,
    YEAR(k.performance_start) AS start_year,
    YEAR(k.performance_end) AS end_year,

    -- Lead investigator: USAspending has no PI info, only the recipient org,
    -- so this is a typed NULL
    CAST(NULL AS STRUCT<
        given_name:STRING,
        family_name:STRING,
        orcid:STRING,
        role_start:DATE,
        affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
    >) AS lead_investigator,

    -- Co-lead and other investigators are likewise unavailable: typed NULLs
    CAST(NULL AS STRUCT<
        given_name:STRING,
        family_name:STRING,
        orcid:STRING,
        role_start:DATE,
        affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
    >) AS co_lead_investigator,

    CAST(NULL AS ARRAY<STRUCT<
        given_name:STRING,
        family_name:STRING,
        orcid:STRING,
        role_start:DATE,
        affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
    >>) AS investigators,

    -- Landing page URL (USAspending permalink)
    k.usaspending_permalink AS landing_page_url,

    -- No DOI for USAspending grants
    CAST(NULL AS STRING) AS doi,

    -- Works API URL, filtered on the same award key used for id
    concat('https://api.openalex.org/works?filter=awards.id:G', k.award_key) AS works_api_url,

    -- Timestamps (both set to the time of this rebuild)
    current_timestamp() AS created_date,
    current_timestamp() AS updated_date,

    -- Recipient info kept for potential future use
    k.recipient_name,
    k.recipient_state_name,
    k.recipient_country_name
FROM awards_keyed AS k;

In [None]:
%sql
-- Atomically swap this source's rows in the shared awards table.
-- REPLACE WHERE removes every existing row matching the predicate and inserts
-- the query result in a single Delta commit. The previous separate DELETE and
-- INSERT could leave the table without any DOE rows if the INSERT failed.
-- Every inserted row must itself satisfy the predicate; rows in doe_awards
-- carry provenance 'usaspending' and the DOE funder_id.
-- Columns are matched to the target table by position, so the order of the
-- select list below must follow the target table's column order.
INSERT INTO openalex.awards.openalex_awards_raw
REPLACE WHERE provenance = 'usaspending' AND funder_id = 4320306084
SELECT
    id,
    display_name,
    description,
    funder_id,
    funder_award_id,
    amount,
    currency,
    funder,
    funding_type,
    funder_scheme,
    provenance,
    start_date,
    end_date,
    start_year,
    end_year,
    lead_investigator,
    co_lead_investigator,
    investigators,
    landing_page_url,
    doi,
    works_api_url,
    created_date,
    updated_date,
    25 AS priority  -- DOE priority
FROM openalex.awards.doe_awards;

## Verification Queries

In [None]:
%sql
-- Row count of the transformed table; roughly 45K rows are expected.
SELECT COUNT(1) AS total_doe_awards
FROM openalex.awards.doe_awards;

In [None]:
%sql
-- Eyeball a few transformed awards (arbitrary rows: no ordering is applied).
SELECT
    a.id,
    a.display_name,
    a.funder_award_id,
    a.funder_scheme,
    a.funding_type,
    a.amount,
    a.start_date,
    a.end_date,
    a.recipient_name
FROM openalex.awards.doe_awards AS a
LIMIT 10;

In [None]:
%sql
-- Awards per funder display name; every row should belong to DOE, so a single
-- group is expected.
WITH funder_counts AS (
    SELECT
        a.funder.display_name AS display_name,
        COUNT(*) AS cnt
    FROM openalex.awards.doe_awards AS a
    GROUP BY a.funder.display_name
)
SELECT
    display_name,
    cnt
FROM funder_counts
ORDER BY cnt DESC;

In [None]:
%sql
-- Awards per funding_type, largest group first.
WITH type_counts AS (
    SELECT
        a.funding_type,
        COUNT(*) AS cnt
    FROM openalex.awards.doe_awards AS a
    GROUP BY a.funding_type
)
SELECT
    funding_type,
    cnt
FROM type_counts
ORDER BY cnt DESC;

In [None]:
%sql
-- The 20 most frequent funder schemes (CFDA program titles), with award count
-- and summed amount; awards without a scheme are left out.
SELECT
    a.funder_scheme,
    COUNT(*) AS cnt,
    SUM(a.amount) AS total_funding
FROM openalex.awards.doe_awards AS a
WHERE a.funder_scheme IS NOT NULL
GROUP BY a.funder_scheme
ORDER BY cnt DESC
LIMIT 20;

In [None]:
%sql
-- Data completeness: how many awards have each key field populated, plus the
-- summed amount and the share of awards with an amount / a start date.
-- COUNT(col) counts only non-NULL values, so each has_* figure is a fill count.
-- The percentage denominators go through NULLIF so that an empty table yields
-- NULL percentages instead of a divide-by-zero error under ANSI mode.
SELECT
    COUNT(*) AS total,
    COUNT(display_name) AS has_title,
    COUNT(amount) AS has_amount,
    COUNT(start_date) AS has_start_date,
    COUNT(end_date) AS has_end_date,
    COUNT(landing_page_url) AS has_url,
    SUM(amount) AS total_funding,
    ROUND(COUNT(amount) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_with_amount,
    ROUND(COUNT(start_date) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_with_start_date
FROM openalex.awards.doe_awards;

In [None]:
%sql
-- Award count and summed amount for the 25 most recent start years; awards
-- without a parsed start year are left out.
WITH yearly AS (
    SELECT
        a.start_year,
        COUNT(*) AS cnt,
        SUM(a.amount) AS total_funding
    FROM openalex.awards.doe_awards AS a
    WHERE a.start_year IS NOT NULL
    GROUP BY a.start_year
)
SELECT
    start_year,
    cnt,
    total_funding
FROM yearly
ORDER BY start_year DESC
LIMIT 25;

In [None]:
%sql
-- Top 20 recipients (name + state) ranked by summed amount; awards without a
-- recipient name are left out.
SELECT
    a.recipient_name,
    a.recipient_state_name,
    COUNT(*) AS cnt,
    SUM(a.amount) AS total_funding
FROM openalex.awards.doe_awards AS a
WHERE a.recipient_name IS NOT NULL
GROUP BY
    a.recipient_name,
    a.recipient_state_name
ORDER BY total_funding DESC
LIMIT 20;

In [None]:
%sql
-- Top 20 recipient states ranked by summed amount; awards without a recipient
-- state are left out.
WITH by_state AS (
    SELECT
        a.recipient_state_name,
        COUNT(*) AS cnt,
        SUM(a.amount) AS total_funding
    FROM openalex.awards.doe_awards AS a
    WHERE a.recipient_state_name IS NOT NULL
    GROUP BY a.recipient_state_name
)
SELECT
    recipient_state_name,
    cnt,
    total_funding
FROM by_state
ORDER BY total_funding DESC
LIMIT 20;