In [0]:
# 03_gold_claims_by_hour.ipynb
# GOAL 1 : Hourly claim volume & spend  (count, sum, avg)
# GOAL 2 : Provider daily spend + 7‑day rolling totals  (window fn demo)
# GOAL 3 : QA view – hourly provider‑match rate  (derived from the GOAL 1 counts; Silver is not re-aggregated)
#
# SOURCE : kardia_silver.silver_claims_enriched
# OUTPUT :
#          kardia_gold.gold_hourly_claim_metrics
#          kardia_gold.gold_provider_7d_spend
#          kardia_gold.gold_hourly_match_qc_vw   (view)
#
# TRIGGER: Batch job – full snapshot overwrite (CREATE OR REPLACE) each run

In [0]:
# Step 1 - the Gold schema has to exist before anything is written into it.
spark.sql("CREATE DATABASE IF NOT EXISTS kardia_gold")

# Step 2 - rebuild the hourly claim metrics table from Silver via CTAS.
# One output row per ingest hour (_ingest_ts truncated to the hour):
#   claim_cnt     - number of claims in that hour
#   total_amount  - sum of ClaimAmount
#   avg_amount    - mean of ClaimAmount
#   unmatched_cnt - claims whose ProviderSpecialty is NULL
hourly_metrics_ctas = """
CREATE OR REPLACE TABLE kardia_gold.gold_hourly_claim_metrics AS
SELECT
    DATE_TRUNC('hour', _ingest_ts)                             AS hour_ts,
    COUNT(*)                                                   AS claim_cnt,
    SUM(ClaimAmount)                                           AS total_amount,
    AVG(ClaimAmount)                                           AS avg_amount,
    SUM(IF(ProviderSpecialty IS NULL, 1, 0))                   AS unmatched_cnt
FROM kardia_silver.silver_claims_enriched
GROUP BY DATE_TRUNC('hour', _ingest_ts)
"""
spark.sql(hourly_metrics_ctas)


In [0]:
# Aggregate claim amounts by provider and calendar day, then compute rolling
# 7-calendar-day totals and averages with window functions.
#
# Why RANGE instead of ROWS: the inner query only emits a row for days on which
# a provider actually has claims. A ROWS frame of "6 PRECEDING" would therefore
# cover the previous 6 *rows*, which can reach back far more than a week when a
# provider has gaps. Ordering by an integer day number and using a RANGE frame
# of 6 keeps the window at exactly [claim_date - 6 days, claim_date].
#
# Notes:
#   * rolling_7d_avg is the mean of daily_amount over the days that have claims
#     inside the window (days with no claims are not counted as zero).
#   * GROUP BY repeats the TO_DATE expression rather than relying on alias
#     resolution, so the grouping key cannot be shadowed by a source column.
spark.sql("""
CREATE OR REPLACE TABLE kardia_gold.gold_provider_7d_spend AS
SELECT claim_date,
       ProviderID,
       daily_amount,
       SUM(daily_amount) OVER w7d AS rolling_7d_total,
       AVG(daily_amount) OVER w7d AS rolling_7d_avg
FROM (
    SELECT TO_DATE(ClaimDate) AS claim_date,
           ProviderID,
           SUM(ClaimAmount)   AS daily_amount
    FROM   kardia_silver.silver_claims_enriched
    GROUP  BY TO_DATE(ClaimDate), ProviderID
) d
WINDOW w7d AS (
    PARTITION BY ProviderID
    ORDER BY DATEDIFF(claim_date, DATE'1970-01-01')
    RANGE BETWEEN 6 PRECEDING AND CURRENT ROW
)
""")


In [0]:
# QA view: per-hour provider match rate, computed from the counts already
# stored in kardia_gold.gold_hourly_claim_metrics.
#   match_rate = 1 - unmatched_cnt / claim_cnt, rounded to 4 decimal places.
match_qc_view_ddl = """
CREATE OR REPLACE VIEW kardia_gold.gold_hourly_match_qc_vw AS
SELECT hour_ts,
       claim_cnt,
       unmatched_cnt,
       ROUND(1 - unmatched_cnt / claim_cnt, 4) AS match_rate
FROM   kardia_gold.gold_hourly_claim_metrics
"""
spark.sql(match_qc_view_ddl)


In [0]:
# Show every table / view registered in the Gold schema.
display(spark.sql("SHOW TABLES IN kardia_gold"))

# Preview the 10 most recent rows of each Gold asset, in the order they were
# built: hourly metrics, provider 7-day spend, hourly match-rate QA view.
gold_preview_queries = (
    """
  SELECT * FROM kardia_gold.gold_hourly_claim_metrics
  ORDER BY hour_ts DESC LIMIT 10
""",
    """
  SELECT * FROM kardia_gold.gold_provider_7d_spend
  ORDER BY claim_date DESC, ProviderID LIMIT 10
""",
    """
  SELECT * FROM kardia_gold.gold_hourly_match_qc_vw
  ORDER BY hour_ts DESC LIMIT 10
""",
)
for preview_sql in gold_preview_queries:
    display(spark.sql(preview_sql))
