In [None]:
# 03_gold_encounters_by_month.ipynb
# GOAL:   Aggregate monthly encounter volume into Gold and refresh QA views.
# SOURCE: kardia_silver.silver_encounters_demographics
# OUTPUT: kardia_gold.gold_encounters_by_month (Delta table)
# TRIGGER: Streaming incremental-batch job with `outputMode("complete")`
#          that fully replaces the output on each run.

# Additional QA tables (both written to kardia_gold):
# - gold_encounters_missing_patient: monthly count of encounters missing GENDER or BIRTH_YEAR
# - gold_patients_no_encounter: patients with no linked encounters

from pyspark.sql import functions as F, Window
from delta.tables import DeltaTable

# Fully qualified table names (database.table).
GOLD_DB = "kardia_gold"
SILVER_DEM = "kardia_silver.silver_encounters_demographics"

# Gold outputs all live in GOLD_DB: the monthly aggregate plus two QA tables.
GOLD_ENC_BY_MONTH = GOLD_DB + ".gold_encounters_by_month"
GOLD_QA_ENC_MISSING_DEM = GOLD_DB + ".gold_encounters_missing_patient"
GOLD_QA_PT_MISSING_ENC = GOLD_DB + ".gold_patients_no_encounter"

# Directory handed to the streaming writer as its checkpointLocation.
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/gold_encounters_by_month"

In [None]:
# 1. Bootstrap: make sure the Gold database and the (initially empty)
#    monthly-encounters Delta table exist before anything writes to them.
#    Both statements use IF NOT EXISTS, so re-running this cell is harmless.
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {GOLD_DB}"
create_gold_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {GOLD_ENC_BY_MONTH} (
        month         STRING,
        encounters_n  BIGINT
    ) USING DELTA
    """

# Order matters: the database has to exist before a table is created in it.
for ddl in (create_db_sql, create_gold_table_sql):
    spark.sql(ddl)

In [None]:
# 2. Aggregate monthly encounter counts into Gold.
#   - Keep only encounters whose START_DATE falls within the last LOOKBACK_DAYS days
#   - Drop incomplete records (missing gender or birth year)
#   - Group by month (yyyy-MM of START_TS) and count the remaining encounters
LOOKBACK_DAYS = 730  # roughly two years

# Earliest START_DATE that is still admitted into the aggregate.
window_start = F.date_sub(F.current_date(), LOOKBACK_DAYS)

agg_stream = (
    spark.readStream
         .table(SILVER_DEM)
         .filter(F.col("START_DATE") >= window_start)
         .filter("GENDER IS NOT NULL AND BIRTH_YEAR IS NOT NULL")
         .groupBy(F.date_format("START_TS", "yyyy-MM").alias("month"))
         .agg(F.count("*").alias("encounters_n"))
)

# Write results to Gold.
# toTable() is the documented DataStreamWriter method for streaming into a
# table; it starts the query and returns the StreamingQuery we then wait on.
# trigger(availableNow=True) processes the data that is currently available
# and then stops, so awaitTermination() returns once the run is finished.
(agg_stream.writeStream
           .outputMode("complete")
           .option("checkpointLocation", CHECKPOINT_PATH)
           .trigger(availableNow=True)
           .toTable(GOLD_ENC_BY_MONTH)
           .awaitTermination())

# NOTE:
# - outputMode("complete") rewrites the full result table on every run.
# - The START_DATE filter can only prune partition folders
#   (e.g., START_DATE=2013-01-01) if the Silver table is partitioned by
#   START_DATE -- TODO confirm the Silver table layout.
# - NOTE(review): the aggregation state lives in the checkpoint, and the
#   START_DATE filter is applied only to rows as they arrive. Months that were
#   counted on an earlier run presumably stay in the output after they age out
#   of the lookback window -- confirm whether a rolling 2-year table is
#   actually required; if so, a batch overwrite (or foreachBatch + MERGE with
#   an explicit delete of old months) would be needed.

In [0]:
# 3. Refresh QA tables. (Rebuilt on each run)

# 3a. Monthly count of encounters that lack GENDER or BIRTH_YEAR.
encounters_with_month = spark.table(SILVER_DEM).withColumn(
    "month", F.date_format("START_TS", "yyyy-MM")
)
incomplete_by_month = (
    encounters_with_month
    .filter("GENDER IS NULL OR BIRTH_YEAR IS NULL")
    .groupBy("month")
    .agg(F.count("*").alias("unmatched_encounters"))
)

# Overwrite both data and schema so the table always mirrors this run's result.
(
    incomplete_by_month.write
    .mode("overwrite")
    .format("delta")
    .option("overwriteSchema", "true")
    .saveAsTable(GOLD_QA_ENC_MISSING_DEM)
)

# 3b. Patients whose ID never appears as a PatientID in the encounters table.
patients_df = spark.table("kardia_silver.silver_patients")
encounter_ids = spark.table(SILVER_DEM).select("PatientID").distinct()

# left_anti keeps only the patient rows for which the condition finds no match.
has_encounter = F.col("p.ID") == F.col("e.PatientID")
patients_without_encounters = (
    patients_df.alias("p")
    .join(encounter_ids.alias("e"), has_encounter, "left_anti")
    .select("p.ID", "p.GENDER", "p.BIRTH_YEAR")
    # Stamp each row with the date this snapshot was produced.
    .withColumn("as_of_date", F.current_date())
)

# Overwrite both data and schema so the table always mirrors this run's result.
(
    patients_without_encounters.write
    .mode("overwrite")
    .format("delta")
    .option("overwriteSchema", "true")
    .saveAsTable(GOLD_QA_PT_MISSING_ENC)
)

In [None]:
# 4. Show up to 10 rows from the Gold table and from each QA table.
preview_tables = (GOLD_ENC_BY_MONTH, GOLD_QA_ENC_MISSING_DEM, GOLD_QA_PT_MISSING_ENC)

for tbl in preview_tables:
    # Refresh the table before querying it, then render the sample.
    spark.sql(f"REFRESH TABLE {tbl}")
    sample_rows = spark.sql(f"SELECT * FROM {tbl} LIMIT 10")
    display(sample_rows)

# NOTE: A production version could swap `outputMode("complete")` for a
# foreachBatch + MERGE pattern so that only the months that changed are updated.