In [0]:
# 03_gold_view_gender_breakdown_kpi
# Gold KPI – patient count by gender
# Source: kardia_silver.silver_patients
# Output: kardia_gold.gold_gender_breakdown + vw_gender_breakdown
# Excludes null gender; materialized for performance

from pyspark.sql import SparkSession, functions as F

# Database, table, and view names
GOLD_DB     = "kardia_gold"
SILVER_TBL  = "kardia_silver.silver_patients"
GOLD_TBL    = f"{GOLD_DB}.gold_gender_breakdown"
# NOTE(review): the view name is not qualified with GOLD_DB, so the view is
# created in whatever database is current for the session — confirm intended.
GOLD_VIEW   = "vw_gender_breakdown"

# Bind the session explicitly instead of relying on a runtime-injected `spark`
# global. getOrCreate() returns the already-active session when one exists, so
# this is a no-op on Databricks and keeps the notebook runnable on a fresh
# kernel elsewhere.
spark = SparkSession.builder.getOrCreate()

# Minimize shuffle overhead for small test datasets
spark.conf.set("spark.sql.shuffle.partitions", "1")

In [0]:
# Make sure the Gold database exists before anything is written into it
spark.sql(f"CREATE DATABASE IF NOT EXISTS {GOLD_DB}")

# Count Silver patients per GENDER value. Rows with a null GENDER are removed
# before grouping so they never form a group of their own.
has_gender = F.col("GENDER").isNotNull()
patients_per_gender = F.count("*").alias("cnt")

df = (
    spark.table(SILVER_TBL)
    .where(has_gender)
    .groupBy("GENDER")
    .agg(patients_per_gender)
)

# Keep only groups with a positive count. Every group emitted by the groupBy
# above holds at least one row, so this guard is not expected to drop anything.
valid_df = df.where(F.col("cnt") > 0)

In [0]:
# Materialize the aggregate as a Delta table in Gold. The table is replaced on
# every run; overwriteSchema lets the stored schema be replaced along with the
# data.
gold_writer = valid_df.write.format("delta").mode("overwrite")
gold_writer = gold_writer.option("overwriteSchema", "true")
gold_writer.saveAsTable(GOLD_TBL)

# Expose the Gold table through a view for BI tools and ad-hoc queries.
# NOTE(review): GOLD_VIEW carries no database prefix, so the view is created in
# the session's current database rather than explicitly in GOLD_DB — confirm.
create_view_sql = f"""
CREATE OR REPLACE VIEW {GOLD_VIEW} AS
SELECT * FROM {GOLD_TBL}
"""
spark.sql(create_view_sql)

In [0]:
# Sanity check: refresh the Gold table's cached state, then preview the view
refresh_stmt = f"REFRESH TABLE {GOLD_TBL}"
spark.sql(refresh_stmt)

print("vw_gender_breakdown preview:")
# Reading the view by name returns the same rows as SELECT * over it
preview_df = spark.table(GOLD_VIEW)
display(preview_df)