# Create SSHRC Awards

Builds the `openalex.awards.sshrc_awards` table from SSHRC (Social Sciences and Humanities Research Council of Canada) open data. The source contains roughly 114K unique awards.

**Prerequisites:**
- Run `scripts/local/sshrc_to_s3.py` to download and upload the data first.

**Data source:** https://open.canada.ca/data/en/dataset/b4e2b302-9bc6-4b33-b880-6496f8cef0f1  
**S3 location:** `s3a://openalex-ingest/awards/sshrc/sshrc_projects.parquet`

**SSHRC funder:**
- funder_id: 4320334617
- display_name: "Social Sciences and Humanities Research Council of Canada"
- ROR: https://ror.org/04j5jqy92
- DOI: 10.13039/501100000155

## Step 1: Create Staging Table from S3

In [None]:
%sql
-- Stage the SSHRC parquet export from S3 as a Delta table.
-- Every source column is carried over unchanged; databricks_ingested_at records the load time.
CREATE OR REPLACE TABLE openalex.awards.sshrc_raw
USING delta
AS
SELECT
    *,
    current_timestamp() AS databricks_ingested_at
FROM
    parquet.`s3a://openalex-ingest/awards/sshrc/sshrc_projects.parquet`;

In [None]:
%sql
-- Sanity check: number of staged rows (expected to be around 114K)
SELECT
    COUNT(*) AS total_projects
FROM openalex.awards.sshrc_raw;

In [None]:
%sql
-- Peek at a handful of staged rows (no ORDER BY, so the rows returned are arbitrary)
SELECT *
FROM openalex.awards.sshrc_raw
LIMIT 5;

## Step 2: Create SSHRC Awards Table

In [None]:
%sql
-- Build openalex.awards.sshrc_awards from the staged rows in openalex.awards.sshrc_raw,
-- attaching funder metadata looked up in openalex.common.funder.
-- One output row is produced per staged row that has a non-blank file_number.
CREATE OR REPLACE TABLE openalex.awards.sshrc_awards
USING delta
AS
WITH
-- Get SSHRC funder from OpenAlex by explicit funder_id.
-- This CTE is CROSS JOINed below: if the id matches no funder row the result is empty,
-- and if it matched more than one row every award would be duplicated.
sshrc_funder AS (
    SELECT
        funder_id,
        display_name,
        ror_id,
        doi
    FROM openalex.common.funder
    WHERE funder_id = 4320334617  -- Social Sciences and Humanities Research Council of Canada
),

awards_transformed AS (
    SELECT
        -- Generate unique ID: funder_id:file_number (file number trimmed and lower-cased)
        -- NOTE(review): uniqueness relies on the staged data having one row per file_number;
        -- no de-duplication happens here - confirm against the loader script.
        CONCAT(f.funder_id, ':', LOWER(TRIM(g.file_number))) AS id,

        -- Display name = project title
        g.title AS display_name,

        -- No abstract in SSHRC open data
        CAST(NULL AS STRING) AS description,

        -- Funder info (funder_award_id keeps the original casing, only trimmed)
        f.funder_id,
        TRIM(g.file_number) AS funder_award_id,

        -- Amount (total across all fiscal years)
        CAST(g.amount AS DOUBLE) AS amount,
        'CAD' AS currency,

        -- Funder struct
        struct(
            CONCAT('https://openalex.org/F', f.funder_id) AS id,
            f.display_name,
            f.ror_id,
            f.doi
        ) AS funder,

        -- Funding type - map common SSHRC programs to types.
        -- Branches are evaluated top to bottom and the first match wins, so order matters.
        CASE
            -- '%doctoral%' also matches "postdoctoral", so no separate postdoctoral branch is needed
            WHEN LOWER(g.program) LIKE '%doctoral%' THEN 'fellowship'
            WHEN LOWER(g.program) LIKE '%fellowship%' THEN 'fellowship'
            WHEN LOWER(g.program) LIKE '%scholarship%' THEN 'fellowship'
            WHEN LOWER(g.program) LIKE '%vanier%' THEN 'fellowship'
            WHEN LOWER(g.program) LIKE '%banting%' THEN 'fellowship'
            WHEN LOWER(g.program) LIKE '%talent%' THEN 'fellowship'
            WHEN LOWER(g.program) LIKE '%canada research chair%' THEN 'fellowship'
            WHEN LOWER(g.program) LIKE '%partnership%' THEN 'research'
            WHEN LOWER(g.program) LIKE '%insight%' THEN 'research'
            WHEN LOWER(g.program) LIKE '%connection%' THEN 'research'
            -- NOTE(review): '%aid%' matches any program name containing those three letters - confirm intended
            WHEN LOWER(g.program) LIKE '%aid%' THEN 'research'
            WHEN LOWER(g.program) LIKE '%savoir%' THEN 'research'
            -- Kept ahead of the remaining 'research' patterns so that "SSHRC ... grant" programs stay 'grant'
            WHEN LOWER(g.program) LIKE '%sshrc %grant%' THEN 'grant'
            WHEN LOWER(g.program) LIKE '%general research%' THEN 'research'
            WHEN LOWER(g.program) LIKE '%major collaborative%' THEN 'research'
            WHEN LOWER(g.program) LIKE '%standard research%' THEN 'research'
            WHEN LOWER(g.program) LIKE '%strategic%' THEN 'research'
            -- Also reached when program is NULL
            ELSE 'grant'
        END AS funding_type,

        -- Funder scheme = program name
        g.program AS funder_scheme,

        -- Provenance
        'sshrc_opendata' AS provenance,

        -- Dates - derived from the staged start_year column
        -- (presumably the SSHRC competition year; confirm against the loader script).
        -- The start date is approximated as April 1 of that year
        -- (SSHRC fiscal year ends March 31, so awards typically start April 1).
        -- TRY_CAST keeps a malformed year from failing the whole build under ANSI mode;
        -- a NULL year propagates through CONCAT and yields a NULL date.
        TRY_TO_DATE(CONCAT(CAST(TRY_CAST(g.start_year AS INT) AS STRING), '-04-01')) AS start_date,
        CAST(NULL AS DATE) AS end_date,  -- Not available in SSHRC open data
        TRY_CAST(g.start_year AS INT) AS start_year,
        CAST(NULL AS INT) AS end_year,

        -- Lead investigator - SSHRC provides full name only, not structured.
        -- NULL when the applicant name is missing or blank.
        CASE
            WHEN g.applicant_name IS NOT NULL AND TRIM(g.applicant_name) != '' THEN
                struct(
                    CAST(NULL AS STRING) AS given_name,  -- Not available separately
                    TRIM(g.applicant_name) AS family_name,  -- Store full name in family_name
                    CAST(NULL AS STRING) AS orcid,
                    CAST(NULL AS DATE) AS role_start,
                    struct(
                        TRIM(g.institution) AS name,
                        -- NOTE(review): country is hard-coded; confirm no awards are held at non-Canadian institutions
                        'Canada' AS country,
                        CAST(NULL AS ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>) AS ids
                    ) AS affiliation
                )
            ELSE NULL
        END AS lead_investigator,

        -- Co-lead and other investigators (not available in SSHRC open data);
        -- typed NULLs keep the column types explicit
        CAST(NULL AS STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >) AS co_lead_investigator,

        CAST(NULL AS ARRAY<STRUCT<
            given_name:STRING,
            family_name:STRING,
            orcid:STRING,
            role_start:DATE,
            affiliation:STRUCT<name:STRING, country:STRING, ids:ARRAY<STRUCT<id:STRING, type:STRING, asserted_by:STRING>>>
        >>) AS investigators,

        -- Landing page URL - SSHRC doesn't have individual award pages
        -- Link to the open data portal instead
        'https://open.canada.ca/data/en/dataset/b4e2b302-9bc6-4b33-b880-6496f8cef0f1' AS landing_page_url,

        -- No DOI for SSHRC grants
        CAST(NULL AS STRING) AS doi,

        -- Timestamps (both are reset to the build time whenever the table is re-created)
        current_timestamp() AS created_date,
        current_timestamp() AS updated_date

    FROM openalex.awards.sshrc_raw g
    CROSS JOIN sshrc_funder f
    WHERE g.file_number IS NOT NULL
      AND TRIM(g.file_number) != ''
)

SELECT * FROM awards_transformed;

## Verification Queries

In [None]:
%sql
-- Sanity check: number of award rows built (expected to be around 114K)
SELECT
    COUNT(*) AS total_sshrc_awards
FROM openalex.awards.sshrc_awards;

In [None]:
%sql
-- Eyeball a few award rows, showing only the key columns
-- (no ORDER BY, so the rows returned are arbitrary)
SELECT
    a.id,
    a.display_name,
    a.funder_award_id,
    a.funder_scheme,
    a.funding_type,
    a.amount,
    a.currency,
    a.start_date,
    a.lead_investigator
FROM openalex.awards.sshrc_awards AS a
LIMIT 10;

In [None]:
%sql
-- Awards per funder display name; every row is expected to belong to SSHRC
SELECT
    funder.display_name,
    COUNT(*) AS cnt
FROM openalex.awards.sshrc_awards
GROUP BY
    funder.display_name
ORDER BY
    cnt DESC;

In [None]:
%sql
-- Awards per derived funding_type, largest group first
SELECT
    funding_type,
    COUNT(*) AS cnt
FROM openalex.awards.sshrc_awards
GROUP BY
    funding_type
ORDER BY
    cnt DESC;

In [None]:
%sql
-- The 20 most common funding programmes (funder_scheme), ignoring rows without one
SELECT
    funder_scheme,
    COUNT(*) AS cnt
FROM openalex.awards.sshrc_awards
WHERE funder_scheme IS NOT NULL
GROUP BY
    funder_scheme
ORDER BY
    cnt DESC
LIMIT 20;

In [None]:
%sql
-- Check data completeness: non-NULL counts per key column plus percentage coverage.
-- COUNT(col) counts only non-NULL values, COUNT(*) counts all rows.
-- The percentages divide by NULLIF(COUNT(*), 0) so that an empty table reports NULL
-- instead of raising a divide-by-zero error when ANSI mode is enabled.
SELECT
    COUNT(*) AS total,
    COUNT(display_name) AS has_title,
    COUNT(description) AS has_abstract,
    COUNT(amount) AS has_amount,
    COUNT(start_date) AS has_start_date,
    COUNT(lead_investigator) AS has_pi,
    ROUND(COUNT(display_name) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_title,
    ROUND(COUNT(amount) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_amount,
    ROUND(COUNT(start_date) * 100.0 / NULLIF(COUNT(*), 0), 1) AS pct_dates  -- start_date coverage only
FROM openalex.awards.sshrc_awards;

In [None]:
%sql
-- Awards per start_year for the 20 most recent years present in the table
SELECT
    a.start_year,
    COUNT(*) AS cnt
FROM openalex.awards.sshrc_awards AS a
WHERE a.start_year IS NOT NULL
GROUP BY
    a.start_year
ORDER BY
    a.start_year DESC
LIMIT 20;

In [None]:
%sql
-- The 20 institutions that appear most often as the lead investigator's affiliation
SELECT
    a.lead_investigator.affiliation.name AS institution,
    COUNT(*) AS cnt
FROM openalex.awards.sshrc_awards AS a
WHERE a.lead_investigator.affiliation.name IS NOT NULL
GROUP BY
    a.lead_investigator.affiliation.name
ORDER BY
    cnt DESC
LIMIT 20;

In [None]:
%sql
-- Overall funding figures in CAD: total (in billions), mean award and approximate median award.
-- Only rows with a known amount are considered.
SELECT
    ROUND(SUM(a.amount) / 1000000000, 2) AS total_funding_billions_cad,
    ROUND(AVG(a.amount), 0) AS avg_award_cad,
    ROUND(PERCENTILE_APPROX(a.amount, 0.5), 0) AS median_award_cad
FROM openalex.awards.sshrc_awards AS a
WHERE a.amount IS NOT NULL;