In [0]:
# 03_gold_view_encounters_by_month
# Goal: Monthly encounter volume (Gold KPI)
# Source: kardia_silver.silver_encounters_demographics (append only)
# Output: vw_gold_encounters_by_month (view)

# Demo notes
# – The KPI itself is a view (nothing persisted); the two QA tables built further down are written as Delta tables.
# – Full-table scans are fine for small demo data (<10 M rows).

# Production upgrade
# – Write Silver incrementally (MERGE or streaming append with row tracking).
# – Convert this view to a MATERIALIZED VIEW (or STREAMING TABLE) so
#   only new rows are processed each day.

from pyspark.sql import functions as F

# Table paths
# Every Gold object is schema-qualified with GOLD_DB so that it is created in,
# and read from, that database regardless of which database the Spark session
# currently has selected.
GOLD_DB = "kardia_gold"
VIEW_GOLD = f"{GOLD_DB}.vw_gold_encounters_by_month"

# QA side tables that this notebook writes with saveAsTable
QA_ENC_MISSING_PT = f"{GOLD_DB}.gold_encounters_missing_patient"
QA_PAT_NO_ENC = f"{GOLD_DB}.gold_patients_no_encounter"

# Upstream Silver table read by the view and by the QA queries
SILVER = "kardia_silver.silver_encounters_demographics"

In [0]:
# Make sure the Gold database exists before any Gold object is created.
ensure_gold_db_sql = f"CREATE DATABASE IF NOT EXISTS {GOLD_DB}"
spark.sql(ensure_gold_db_sql)

# 1. Create the BI view
# One row per 'yyyy-MM' month with the number of encounters in that month.
# Only rows where both GENDER and BIRTH_YEAR are populated are counted.
create_view_sql = f"""
    CREATE OR REPLACE VIEW {VIEW_GOLD} AS
    SELECT date_format(START_TS,'yyyy-MM') AS month,
           COUNT(*)                        AS encounters_n
    FROM   {SILVER}
    WHERE  GENDER IS NOT NULL
      AND  BIRTH_YEAR IS NOT NULL
    GROUP  BY date_format(START_TS,'yyyy-MM')
    """
spark.sql(create_view_sql)

In [None]:
# 2. Build a DataFrame once for the two QA tables
# Silver encounters plus a 'yyyy-MM' month column derived from START_TS.
silver_encounters = spark.table(SILVER)
encounters_with_month = silver_encounters.withColumn(
    "month", F.date_format("START_TS", "yyyy-MM")
)

# Encounters missing patient dims
# Per-month count of rows where GENDER or BIRTH_YEAR is NULL.
missing_dims_by_month = (
    encounters_with_month
    .where("GENDER IS NULL OR BIRTH_YEAR IS NULL")
    .groupBy("month")
    .agg(F.count("*").alias("unmatched_encounters"))
)

# Overwrite the QA table, including its schema, on every run.
qa_missing_writer = (
    missing_dims_by_month.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
qa_missing_writer.saveAsTable(QA_ENC_MISSING_PT)

In [0]:
# 2b. Patients with no encounter
patients_df = spark.table("kardia_silver.silver_patients")

# Distinct PatientID values that appear on at least one encounter row.
encounter_patient_ids = encounters_with_month.select("PatientID").distinct()

# Join key: patient ID on the left ("p"), encounter PatientID on the right ("e").
same_patient = F.col("p.ID") == F.col("e.PatientID")

# The anti join keeps only patients with no matching encounter row;
# each surviving row is stamped with the run date.
patients_no_enc = (
    patients_df.alias("p")
    .join(encounter_patient_ids.alias("e"), same_patient, "left_anti")
    .select("p.ID", "p.GENDER", "p.BIRTH_YEAR")
    .withColumn("as_of_date", F.current_date())
)

# Overwrite the QA table, including its schema, on every run.
qa_no_enc_writer = (
    patients_no_enc.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
qa_no_enc_writer.saveAsTable(QA_PAT_NO_ENC)

In [0]:
# Refresh and preview
# Refresh each Gold object first so the previews below do not read cached data.
gold_objects = (VIEW_GOLD, QA_ENC_MISSING_PT, QA_PAT_NO_ENC)
for gold_object in gold_objects:
    spark.sql(f"REFRESH TABLE {gold_object}")

# Show up to 20 rows of each object, newest first.
preview_queries = [
    f"SELECT * FROM {VIEW_GOLD}        ORDER BY month DESC LIMIT 20",
    f"SELECT * FROM {QA_ENC_MISSING_PT} ORDER BY month DESC LIMIT 20",
    f"SELECT * FROM {QA_PAT_NO_ENC}     ORDER BY as_of_date DESC LIMIT 20",
]
for preview_query in preview_queries:
    display(spark.sql(preview_query))