In [0]:
# 03_gold_gender_breakdown
# GOAL:   Patient count by gender
# SOURCE: kardia_silver.silver_patients
# OUTPUT: kardia_gold.gold_gender_breakdown (table)
# TRIGGER: Single batch job that computes the latest gender counts
#          and MERGEs them into the Gold table (Type-1 overwrite).

from pyspark.sql import functions as F
from delta.tables import DeltaTable

# Database and fully qualified table names used by every cell below.
GOLD_DB = "kardia_gold"
SILVER_PATIENT = "kardia_silver.silver_patients"
# The Gold output table lives inside GOLD_DB, so derive its name from it.
GOLD_GENDER_BREAKDOWN = GOLD_DB + ".gold_gender_breakdown"

In [0]:
# 1. Bootstrap the Gold layer: create the database and the output table if
#    they are missing (both statements use IF NOT EXISTS).
create_gold_db_sql = f"CREATE DATABASE IF NOT EXISTS {GOLD_DB}"
create_gold_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {GOLD_GENDER_BREAKDOWN} (
      GENDER      STRING,
      patient_cnt BIGINT
    ) USING DELTA
    """

# Order matters: the database has to exist before the table is created in it.
for ddl_statement in (create_gold_db_sql, create_gold_table_sql):
    spark.sql(ddl_statement)

In [0]:
# 2. Aggregate Silver patients into one row per gender.
#    Rows whose GENDER is null are dropped before counting.
patients_with_gender = spark.table(SILVER_PATIENT).where(F.col("GENDER").isNotNull())
patient_total = F.count("*").alias("patient_cnt")
gender_counts = patients_with_gender.groupBy("GENDER").agg(patient_total)

In [0]:
# 3. Synchronise the Gold table with the freshly computed counts.
#    - gender present in both:      overwrite the stored count
#    - gender only in the source:   insert a new row
#    - gender only in the target:   delete it; without this clause a gender
#      that no longer appears in Silver would keep its old, stale count
#      and the table would not reflect the "latest" breakdown.
#    NOTE(review): whenNotMatchedBySourceDelete needs Delta Lake >= 2.3
#    (Databricks Runtime 12.1+) - confirm the cluster runtime.
(DeltaTable.forName(spark, GOLD_GENDER_BREAKDOWN)
           .alias("target")
           .merge(gender_counts.alias("source"), "target.GENDER = source.GENDER")
           .whenMatchedUpdateAll()
           .whenNotMatchedInsertAll()
           .whenNotMatchedBySourceDelete()
           .execute())

In [0]:
# 4. Show the Gold table, largest gender groups first.
preview_query = f"SELECT * FROM {GOLD_GENDER_BREAKDOWN} ORDER BY patient_cnt DESC"

# Refresh the table's cached metadata before querying it.
spark.sql(f"REFRESH TABLE {GOLD_GENDER_BREAKDOWN}")
display(spark.sql(preview_query))