In [0]:
# 03_gold_claims_by_hour.ipynb
# GOAL 1 : Hourly claim volume & spend  (count, sum, avg)
# GOAL 2 : Provider daily spend + 7‑day rolling totals  (window fn demo)
# GOAL 3 : QA view – hourly provider‑match rate  (no double computation)
#
# SOURCE : kardia_silver.silver_claims_enriched
# OUTPUT :
#          kardia_gold.gold_hourly_claim_metrics
#          kardia_gold.gold_provider_7d_spend
#          kardia_gold.gold_hourly_match_qc_vw   (view)
#
# TRIGGER: Batch job – full snapshot rebuild (CREATE OR REPLACE) each run

In [0]:
# 0 ▸ Make sure the Gold database exists (IF NOT EXISTS keeps re-runs harmless)
create_gold_db_sql = "CREATE DATABASE IF NOT EXISTS kardia_gold"
spark.sql(create_gold_db_sql)

In [0]:
# 1 ▸ Hourly finance metrics (SQL only)
#
# Rebuilds kardia_gold.gold_hourly_claim_metrics from the enriched Silver
# claims, one row per ingest hour:
#   hour_ts       – _ingest_ts truncated to the hour
#   claim_cnt     – number of claim rows in that hour
#   total_amount  – sum of ClaimAmount
#   avg_amount    – average ClaimAmount
#   unmatched_cnt – rows whose ProviderSpecialty is NULL
hourly_claim_metrics_ddl = """
    CREATE OR REPLACE TABLE kardia_gold.gold_hourly_claim_metrics AS
    SELECT
        DATE_TRUNC('hour', _ingest_ts)                             AS hour_ts,
        COUNT(*)                                                   AS claim_cnt,
        SUM(ClaimAmount)                                           AS total_amount,
        AVG(ClaimAmount)                                           AS avg_amount,
        SUM(CASE WHEN ProviderSpecialty IS NULL THEN 1 ELSE 0 END) AS unmatched_cnt
    FROM kardia_silver.silver_claims_enriched
    GROUP BY DATE_TRUNC('hour', _ingest_ts)
    """
spark.sql(hourly_claim_metrics_ddl)

In [0]:
# 2 ▸ Provider daily spend + 7‑day rolling totals (SQL window)
#
# Rebuilds kardia_gold.gold_provider_7d_spend:
#   * `daily` sums ClaimAmount per (claim_date, ProviderID). It only contains
#     rows for days on which a provider actually had claims.
#   * The rolling columns use a RANGE frame over claim_date, so the window is
#     the current day plus the 6 *calendar days* before it. A ROWS frame would
#     instead take the previous 6 *rows*, which for a provider with gaps in
#     activity silently stretches the "7‑day" window across weeks or months.
#   * With a DATE ordering column, the integer RANGE offset is a number of days.
#   * rolling_7d_avg is the mean of the daily totals present in the window
#     (days without claims contribute no row, so they are not averaged as 0).
spark.sql(
    """
    CREATE OR REPLACE TABLE kardia_gold.gold_provider_7d_spend AS
    WITH daily AS (
        SELECT
            TO_DATE(ClaimDate) AS claim_date,
            ProviderID,
            SUM(ClaimAmount)   AS daily_amount
        FROM kardia_silver.silver_claims_enriched
        GROUP BY TO_DATE(ClaimDate), ProviderID
    )
    SELECT
        claim_date,
        ProviderID,
        daily_amount,
        SUM(daily_amount) OVER (
            PARTITION BY ProviderID
            ORDER BY     claim_date
            RANGE BETWEEN 6 PRECEDING AND CURRENT ROW
        )                         AS rolling_7d_total,
        AVG(daily_amount) OVER (
            PARTITION BY ProviderID
            ORDER BY     claim_date
            RANGE BETWEEN 6 PRECEDING AND CURRENT ROW
        )                         AS rolling_7d_avg
    FROM daily
    """
)


In [0]:
# 3 ▸ QA view – hourly provider‑match rate
#
# The view reads the already-aggregated gold_hourly_claim_metrics table, so
# the Silver claims are not aggregated a second time. match_rate is
# 1 - unmatched_cnt / claim_cnt, rounded to 4 decimal places.
hourly_match_qc_view_ddl = """
    CREATE OR REPLACE VIEW kardia_gold.gold_hourly_match_qc_vw AS
    SELECT hour_ts,
           claim_cnt,
           unmatched_cnt,
           ROUND(1 - unmatched_cnt / claim_cnt, 4) AS match_rate
    FROM kardia_gold.gold_hourly_claim_metrics
    """
spark.sql(hourly_match_qc_view_ddl)


In [0]:
# 4 ▸ Preview
#
# Each Gold output gets a printed heading followed by a rendered sample.
# The queries are declared up front and then shown in order by one loop.
hourly_metrics_preview_sql = """
    SELECT hour_ts,
           claim_cnt,
           ROUND(total_amount,2) AS total_amount,
           ROUND(avg_amount,2)   AS avg_amount
    FROM kardia_gold.gold_hourly_claim_metrics
    ORDER BY hour_ts DESC
    LIMIT 20
    """

provider_spend_preview_sql = """
    SELECT *
    FROM kardia_gold.gold_provider_7d_spend
    ORDER BY claim_date DESC, ProviderID
    LIMIT 10
    """

match_qc_preview_sql = """
    SELECT hour_ts,
           claim_cnt,
           unmatched_cnt,
           ROUND(match_rate*100,2) AS match_rate_pct
    FROM kardia_gold.gold_hourly_match_qc_vw
    ORDER BY hour_ts DESC
    LIMIT 20
    """

preview_sections = (
    ("Hourly finance metrics:", hourly_metrics_preview_sql),
    ("Provider 7‑day rolling spend (latest 10 rows):", provider_spend_preview_sql),
    ("Provider‑match QA:", match_qc_preview_sql),
)

for heading, preview_sql in preview_sections:
    print(heading)
    display(spark.sql(preview_sql))