In [0]:
# 03_gold_encounters_by_month.ipynb
# GOAL 1 :  Monthly encounter volume, counting only encounters with GENDER and BIRTH_YEAR present  (Gold fact table)
# GOAL 2 :  QA view – encounters with missing demo fields
# GOAL 3 :  QA view – patients with no encounters
#
# SOURCE  :  kardia_silver.silver_encounters_demographics
# OUTPUT  :  kardia_gold.gold_encounters_by_month         (table)
#            kardia_gold.gold_enc_missing_demo_vw         (view)
#            kardia_gold.gold_patients_no_enc_vw          (view)
#
# TRIGGER :  Simple batch – full snapshot overwrite each run
#            (For production you could switch to foreachBatch + MERGE.)

# 1. Make sure the Gold database exists before anything is written into it.
#    IF NOT EXISTS keeps this step safe to re-run.
create_gold_db_sql = "CREATE DATABASE IF NOT EXISTS kardia_gold"
spark.sql(create_gold_db_sql)

In [0]:
# 2. Gold fact table: encounter count per calendar month (yyyy-MM).
#    CREATE OR REPLACE rebuilds the whole table on every run.
#    Only rows with both GENDER and BIRTH_YEAR populated are counted; rows
#    that fail this filter are tallied by kardia_gold.gold_enc_missing_demo_vw.
monthly_counts_ctas = """
    CREATE OR REPLACE TABLE kardia_gold.gold_encounters_by_month AS
    SELECT date_format(START_TS, 'yyyy-MM') AS month,
           COUNT(*)                         AS encounters_n
    FROM kardia_silver.silver_encounters_demographics
    WHERE GENDER IS NOT NULL
    AND BIRTH_YEAR IS NOT NULL
    GROUP BY date_format(START_TS, 'yyyy-MM')
    """
spark.sql(monthly_counts_ctas)

In [0]:
# 3. QA view: per-month count of encounters where GENDER or BIRTH_YEAR is NULL.
#    This is the complement of the filter used for gold_encounters_by_month.
missing_demo_view_ddl = """
    CREATE OR REPLACE VIEW kardia_gold.gold_enc_missing_demo_vw AS
    SELECT date_format(START_TS, 'yyyy-MM') AS month,
           COUNT(*)                         AS unmatched_encounters
    FROM kardia_silver.silver_encounters_demographics
    WHERE GENDER IS NULL OR BIRTH_YEAR IS NULL
    GROUP BY date_format(START_TS, 'yyyy-MM')
    """
spark.sql(missing_demo_view_ddl)

In [0]:
# 4. QA view: patients that never appear in the encounter data.
#    The LEFT ANTI JOIN keeps only silver_patients rows whose ID matches no
#    PatientID in silver_encounters_demographics; as_of_date comes from
#    current_date() in the view's SELECT.
patients_without_enc_view_ddl = """
    CREATE OR REPLACE VIEW kardia_gold.gold_patients_no_enc_vw AS
    SELECT p.ID,
           p.GENDER,
           p.BIRTH_YEAR,
           current_date() AS as_of_date
    FROM kardia_silver.silver_patients AS p
    LEFT ANTI JOIN (
        SELECT DISTINCT PatientID
        FROM kardia_silver.silver_encounters_demographics
    ) AS e
    ON p.ID = e.PatientID
    """
spark.sql(patients_without_enc_view_ddl)

In [0]:
# 5. Preview
def _show_preview(title, query):
    """Print a heading, then render the result of a Spark SQL query.

    Args:
        title: Heading printed above the rendered result.
        query: Spark SQL text to execute. Each caller below supplies its own
            LIMIT so only a small sample is rendered.
    """
    print(title)
    display(spark.sql(query))


# Latest 12 months of the Gold fact table.
_show_preview(
    "Monthly encounter counts:",
    """
    SELECT *
    FROM kardia_gold.gold_encounters_by_month
    ORDER BY month DESC
    LIMIT 12
    """,
)

# Latest 12 months of the missing-demographics QA view.
_show_preview(
    "Monthly encounters missing demo fields (QA):",
    """
    SELECT *
    FROM kardia_gold.gold_enc_missing_demo_vw
    ORDER BY month DESC
    LIMIT 12
    """,
)

# ORDER BY ID pins which 10 patients are shown. With a bare LIMIT, Spark may
# return any 10 rows, so the preview could differ from one run to the next.
# (Fully stable only if ID is unique per patient -- TODO confirm.)
_show_preview(
    "Patients with no encounters (QA):",
    """
    SELECT *
    FROM kardia_gold.gold_patients_no_enc_vw
    ORDER BY ID
    LIMIT 10
    """,
)

# NOTE:
# In production, we could keep the Gold fact table incremental by switching
# to foreachBatch + MERGE (instead of overwrite) or by storing only changed
# months. For this demo dataset, a full overwrite is simple and fast.