# Guardrails

## Overview

This notebook performs data quality checks after CreateWorksEnriched. If any check fails and the override is not set, the pipeline will fail and block downstream tasks.

## Checks

1. **Author IDs Present in Recently Created Works**: All authorships with an author object must have an author ID
2. **Records Changed in Last Day**: No more than 4 million records should have an `updated_date` in the last day

## Parameters

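- `guardrails_override`: Set to `"true"` to report failed checks as warnings and allow the pipeline to continue instead of failing (default: `"false"`)
- `env_suffix`: Suffix appended to the `openalex` catalog name in every query (default: empty string)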

In [None]:
# Get widget parameters
import re

dbutils.widgets.text("guardrails_override", "false")
dbutils.widgets.text("env_suffix", "")

# Ignore surrounding whitespace and casing so values such as " True " still
# enable the override instead of silently being treated as "false".
guardrails_override = dbutils.widgets.get("guardrails_override").strip().lower() == "true"

# env_suffix is interpolated directly into the catalog name of the SQL queries in
# this notebook (openalex{env_suffix}.works.openalex_works). Restrict it to the
# characters that are valid in an unquoted identifier and fail fast otherwise,
# rather than producing a malformed (or unintended) SQL statement later on.
env_suffix = dbutils.widgets.get("env_suffix").strip()
if not re.fullmatch(r"[A-Za-z0-9_]*", env_suffix):
    raise ValueError(
        f"Invalid env_suffix {env_suffix!r}: only letters, digits and underscores are allowed"
    )

print(f"Guardrails override: {guardrails_override}")
print(f"Environment suffix: '{env_suffix}'")

In [None]:
from datetime import datetime, timedelta

# Accumulates one entry per guardrail check, in the order the checks were run
check_results = []

def record_check(name: str, passed: bool, details: str):
    """Store the outcome of a single guardrail check and echo it to the output.

    Args:
        name: Human-readable name of the check.
        passed: Whether the check succeeded.
        details: Free-text explanation accompanying the result.
    """
    outcome = {"name": name, "passed": passed, "details": details}
    check_results.append(outcome)

    if passed:
        status = "PASSED"
    else:
        status = "FAILED"
    print(f"[{status}] {name}: {details}")

## Check 1: Author IDs Present in Recently Created Works

Verify that all authorships with an author object have an author ID. This check runs on new works (id >= 7 billion) whose `created_date` falls within the last day (the most recent batch). Authorships with an empty raw author name are ignored because they cannot be matched. Legacy works that are re-surfaced with old IDs are excluded since they have a known limitation with author ID inheritance.

In [None]:
# Check for missing author IDs in recent batch
# A work fails this check if it has an authorship with an author object but no author.id
# Note: We exclude authorships with empty raw_author_name since those can't be matched

missing_author_ids_query = f"""
WITH recent_works AS (
    SELECT id, authorships, created_date
    FROM openalex{env_suffix}.works.openalex_works
    WHERE id >= 7000000000
      AND created_date >= current_date() - INTERVAL 1 DAY
      AND authorships IS NOT NULL
      AND SIZE(authorships) > 0
),
works_with_missing_author_ids AS (
    SELECT 
        id,
        SIZE(authorships) as total_authorships,
        SIZE(FILTER(authorships, a -> 
            a.author IS NOT NULL 
            AND a.raw_author_name IS NOT NULL 
            AND TRIM(a.raw_author_name) != ''
            AND (a.author.id IS NULL OR a.author.id = '')
        )) as missing_author_id_count
    FROM recent_works
)
SELECT 
    COUNT(*) as total_recent_works,
    SUM(CASE WHEN missing_author_id_count > 0 THEN 1 ELSE 0 END) as works_with_missing_author_ids,
    SUM(missing_author_id_count) as total_missing_author_ids,
    SUM(total_authorships) as total_authorships_checked
FROM works_with_missing_author_ids
"""

# The query aggregates without GROUP BY, so it always returns exactly one row.
result = spark.sql(missing_author_ids_query).collect()[0]

# When no recent works match, COUNT(*) is 0 but every SUM() is NULL, which arrives
# here as None. Normalise to 0 so the thousands-separator formatting below does not
# raise TypeError and an empty batch is reported as a passing check.
total_recent_works = result["total_recent_works"] or 0
works_with_missing = result["works_with_missing_author_ids"] or 0
total_missing = result["total_missing_author_ids"] or 0
total_authorships = result["total_authorships_checked"] or 0

print(f"Recent works checked: {total_recent_works:,}")
print(f"Total authorships checked: {total_authorships:,}")
print(f"Works with missing author IDs: {works_with_missing:,}")
print(f"Total missing author IDs: {total_missing:,}")

# Calculate percentage (guard against division by zero on an empty batch)
if total_authorships > 0:
    missing_pct = (total_missing / total_authorships) * 100
else:
    missing_pct = 0

# Check passes if no missing author IDs (0 tolerance)
check_passed = works_with_missing == 0

record_check(
    "Author IDs Present in Recently Created Works",
    check_passed,
    f"{works_with_missing:,} works have authorships with missing author IDs ({missing_pct:.4f}% of authorships)"
)

In [None]:
# Show sample of works with missing author IDs for debugging (if any)
# Guarded on truthiness first because works_with_missing may be None when the
# Check 1 aggregate ran over zero rows.
if works_with_missing and works_with_missing > 0:
    # Uses the same selection criteria as the Check 1 query. The "authorship is
    # missing an author ID" filter is evaluated once in the CTE and reused for the
    # count, the list of raw author names, and the row filter, so the three can
    # never drift apart.
    sample_query = f"""
    WITH recent_works AS (
        SELECT
            id,
            title,
            authorships,
            FILTER(authorships, a ->
                a.author IS NOT NULL
                AND a.raw_author_name IS NOT NULL
                AND TRIM(a.raw_author_name) != ''
                AND (a.author.id IS NULL OR a.author.id = '')
            ) AS authorships_missing_id
        FROM openalex{env_suffix}.works.openalex_works
        WHERE id >= 7000000000
          AND created_date >= current_date() - INTERVAL 1 DAY
          AND authorships IS NOT NULL
          AND SIZE(authorships) > 0
    )
    SELECT
        id,
        title,
        SIZE(authorships) AS total_authorships,
        SIZE(authorships_missing_id) AS missing_author_id_count,
        TRANSFORM(authorships_missing_id, a -> a.raw_author_name) AS authors_missing_ids
    FROM recent_works
    WHERE SIZE(authorships_missing_id) > 0
    LIMIT 10
    """
    print("\nSample works with missing author IDs:")
    display(spark.sql(sample_query))

## Check 2: Records Changed in Last Day

Verify that no more than 4 million records have an `updated_date` in the last day. A spike above this threshold may indicate an unintended bulk update.

In [None]:
# Upper bound on how many works may carry an updated_date from the last day
MAX_RECORDS_CHANGED = 4_000_000

records_changed_query = f"""
SELECT COUNT(*) as records_changed
FROM openalex{env_suffix}.works.openalex_works
WHERE updated_date >= current_date() - INTERVAL 1 DAY
"""

# COUNT(*) without GROUP BY yields a single row; read the count out of it
result = spark.sql(records_changed_query).collect()[0]
records_changed = result["records_changed"]

print(
    f"Records changed in last day: {records_changed:,}",
    f"Threshold: {MAX_RECORDS_CHANGED:,}",
    sep="\n",
)

# The check fails only when the count strictly exceeds the threshold
check_passed = not (records_changed > MAX_RECORDS_CHANGED)

record_check(
    name="Records Changed in Last Day",
    passed=check_passed,
    details=f"{records_changed:,} records changed (threshold: {MAX_RECORDS_CHANGED:,})",
)

## Summary and Final Decision

In [None]:
# Summarize all check results
divider = "="*60

print(divider)
print("GUARDRAILS SUMMARY")
print(divider)

# Split the recorded results by outcome, keeping the order in which they ran
failed_checks = [entry for entry in check_results if not entry["passed"]]
passed_checks = [entry for entry in check_results if entry["passed"]]

print(f"\nTotal checks: {len(check_results)}")
print(f"Passed: {len(passed_checks)}")
print(f"Failed: {len(failed_checks)}")

if failed_checks:
    print("\nFailed checks:")
    for check in failed_checks:
        print(f"  - {check['name']}: {check['details']}")

print(f"\nOverride enabled: {guardrails_override}")

# Determine final outcome:
#   - nothing failed            -> proceed
#   - failures, override set    -> warn but proceed
#   - failures, no override     -> raise so downstream tasks are blocked
if not failed_checks:
    print("\n" + divider)
    print("GUARDRAILS PASSED - Pipeline may proceed")
    print(divider)
elif guardrails_override:
    print("\n" + divider)
    print("WARNING: Guardrails failed but override is enabled")
    print("Pipeline will continue - review failed checks!")
    print(divider)
else:
    print("\n" + divider)
    print("GUARDRAILS FAILED - Pipeline will be blocked")
    print("Set guardrails_override=true to bypass (use with caution)")
    print(divider)
    raise Exception(f"Guardrails failed: {len(failed_checks)} check(s) failed. Set guardrails_override=true to bypass.")