In [0]:
# 03_gold_view_encounters_by_month
# Gold KPI – monthly encounter volume and claim cost
# Source: kardia_silver.silver_patient_encounters (append-only)
# Output: kardia_gold.gold_encounters_by_month + vw_encounters_by_month
# Filters out negative claim costs; materialized for performance

from pyspark.sql import SparkSession, functions as F

# --- Catalog objects used by this notebook ---------------------------------
# Input: Silver encounters table that the aggregates are built from.
SILVER_TBL = "kardia_silver.silver_patient_encounters"

# Output: Gold schema and the aggregate table stored inside it.
GOLD_DB = "kardia_gold"
GOLD_TBL = GOLD_DB + ".gold_encounters_by_month"

# Output: name of the view registered on top of the Gold table.
# NOTE(review): unlike GOLD_TBL this name carries no schema prefix, so the
# view is presumably created in the session's current schema rather than in
# GOLD_DB -- confirm that is intended.
GOLD_VIEW = "vw_encounters_by_month"

# --- Spark session tuning --------------------------------------------------
# Use a single shuffle partition: the test datasets are small, so the default
# partition count would only add scheduling overhead to the groupBy.
spark.conf.set("spark.sql.shuffle.partitions", "1")

In [0]:
# Ensure Gold schema exists
spark.sql(f"CREATE DATABASE IF NOT EXISTS {GOLD_DB}")

# Load Silver encounters and build monthly aggregates.
#
# Output columns:
#   month         - "yyyy-MM" string derived from START
#   GENDER        - grouping key, passed through from Silver
#   BIRTH_YEAR    - grouping key, passed through from Silver
#   encounters_n  - number of encounters in the group
#   claim_cost    - sum of TOTAL_CLAIM_COST over the group
#
# The "no negative costs" expectation is enforced per encounter, BEFORE the
# aggregation. Filtering on the summed cost instead would (a) let a negative
# encounter cost leak into the total whenever the rest of the group outweighs
# it, and (b) discard an entire group -- valid encounters included -- whenever
# the group's net cost happens to be negative.
df = (
    spark.table(SILVER_TBL)
         # Soft expectation: drop only encounters whose cost is negative.
         # Rows with a NULL cost are kept on purpose: a bare `>= 0` predicate
         # evaluates to NULL for them and would silently remove them from
         # encounters_n. SUM() ignores NULLs, so they do not affect claim_cost;
         # a group with no known cost at all ends up with a NULL claim_cost.
         .filter(
             F.col("TOTAL_CLAIM_COST").isNull()
             | (F.col("TOTAL_CLAIM_COST") >= 0)
         )
         .withColumn("month", F.date_format("START", "yyyy-MM"))
         .groupBy("month", "GENDER", "BIRTH_YEAR")
         .agg(
             F.count("*").alias("encounters_n"),
             F.sum("TOTAL_CLAIM_COST").alias("claim_cost")
         )
)

In [0]:
# Persist the monthly aggregates as the Gold Delta table. Every run replaces
# the previous contents, and overwriteSchema lets the stored schema follow
# the DataFrame if its columns change.
df.write.saveAsTable(
    GOLD_TBL,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

# Expose the table through a view for querying/BI
create_view_sql = f"""
CREATE OR REPLACE VIEW {GOLD_VIEW} AS
SELECT * FROM {GOLD_TBL}
"""
spark.sql(create_view_sql)

In [0]:
# Refresh the Gold table's cached metadata so the preview reflects the data
# that was just written.
spark.sql(f"REFRESH TABLE {GOLD_TBL}")

# Show the 20 most recent month/gender rows as seen through the view.
print("vw_encounters_by_month preview:")
preview_sql = f"SELECT * FROM {GOLD_VIEW} ORDER BY month DESC, GENDER LIMIT 20"
display(spark.sql(preview_sql))