In [0]:
# 02_silver_transform_patients_cdf_mask_pii.ipynb
# -------------------------------------------------------------------
#  Cell-1  = one-time bootstrap
#  Cell-2  = incremental – rerun whenever
#  PHI columns masked to NULL
# -------------------------------------------------------------------

from delta.tables import DeltaTable
from pyspark.sql import Window
from pyspark.sql import functions as F

In [0]:
# Shuffle-partition tweak.
# NOTE(review): spark.conf.set changes a runtime setting of the current Spark
# session rather than a cluster-wide default, so this cell has to be re-run in
# every new session (e.g. after a cluster restart or notebook re-attach).
# A single shuffle partition is presumably chosen because the patients table is
# small – confirm before pointing this notebook at larger data.
spark.conf.set("spark.sql.shuffle.partitions", "1")

In [0]:
%sql
-- One-time FULL snapshot bootstrap.
-- (Re)creates kardia_silver.silver_patients from the current Bronze snapshot:
-- demographic columns are copied, the birth date is reduced to its year, and
-- every PHI column is replaced by a NULL typed as STRING.
CREATE DATABASE IF NOT EXISTS kardia_silver;

CREATE OR REPLACE TABLE kardia_silver.silver_patients AS
SELECT
    ID                   AS ID,
    year(BIRTHDATE)      AS BIRTH_YEAR,
    -- PHI masked
    CAST(NULL AS STRING) AS DEATHDATE,
    CAST(NULL AS STRING) AS FIRST,
    CAST(NULL AS STRING) AS LAST,
    CAST(NULL AS STRING) AS MAIDEN,
    CAST(NULL AS STRING) AS SSN,
    CAST(NULL AS STRING) AS DRIVERS,
    CAST(NULL AS STRING) AS PASSPORT,
    CAST(NULL AS STRING) AS BIRTHPLACE,
    -- demographics kept as-is
    MARITAL              AS MARITAL,
    RACE                 AS RACE,
    ETHNICITY            AS ETHNICITY,
    GENDER               AS GENDER
FROM
    delta.`dbfs:/kardia/bronze/bronze_patients`;



In [0]:
# Incremental refresh via Delta CDF (rerun any time)
#
# Pulls the Bronze change feed for every commit not yet merged into Silver,
# keeps the newest row image per patient ID, masks PHI to NULL and upserts the
# result into Silver (SCD-1).  Bronze deletes are filtered out and therefore
# not propagated to Silver.
bronze_path = "dbfs:/kardia/bronze/bronze_patients"
silver_tbl  = "kardia_silver.silver_patients"

# Silver table property that records the last Bronze commit already merged.
# Bronze and Silver keep independent commit counters (an OPTIMIZE or a
# re-bootstrap bumps Silver without consuming any Bronze commit), so Silver's
# own version number cannot serve as the watermark: it would either replay
# commits on every run or silently skip new ones.
watermark_prop = "kardia.last_bronze_version"

# 1. figure out which Bronze commits still have to be processed
latest_bronze_ver = (
    spark.sql(f"DESCRIBE HISTORY delta.`{bronze_path}`")
         .selectExpr("max(version) AS v").first().v
)

watermark_row = (
    spark.sql(f"SHOW TBLPROPERTIES {silver_tbl}")
         .filter(F.col("key") == watermark_prop)
         .first()
)

# No watermark yet means this is the first run after the bootstrap snapshot:
# replay the feed from version 1.  Replaying is safe for inserts and updates
# because the merge below is an idempotent last-write-wins upsert.
# NOTE(review): if Bronze no longer retains change data back to version 1,
# set the property by hand to the Bronze version the bootstrap was taken at.
last_merged_ver = int(watermark_row["value"]) if watermark_row is not None else 0
start_ver = last_merged_ver + 1

if start_ver > latest_bronze_ver:
    print("Nothing new – Silver already current ✔")
    dbutils.notebook.exit("no_new_commits")

# 2. read just that CDF slice; the upper bound is pinned so the watermark
#    written at the end matches exactly what was read
changes_df = (spark.read.format("delta")
                .option("readChangeFeed", "true")
                .option("startingVersion", start_ver)
                .option("endingVersion", latest_bronze_ver)
                .load(bronze_path)
                .filter(F.col("_change_type").isin("insert", "update_postimage")))

# SCD-1 last-write-wins: rank each ID's changes newest-commit-first and keep
# rank 1.  (dropDuplicates would keep an arbitrary row per ID.)  Rows for the
# same ID inside one commit remain an arbitrary pick.
newest_first = Window.partitionBy("ID").orderBy(F.col("_commit_version").desc())

cdf_df = (changes_df
            .withColumn("_rn", F.row_number().over(newest_first))
            .filter(F.col("_rn") == 1)
            .select(
                "ID",
                F.year("BIRTHDATE").cast("int").alias("BIRTH_YEAR"),
                # PHI masked
                F.lit(None).cast("string").alias("DEATHDATE"),
                F.lit(None).cast("string").alias("FIRST"),
                F.lit(None).cast("string").alias("LAST"),
                F.lit(None).cast("string").alias("MAIDEN"),
                F.lit(None).cast("string").alias("SSN"),
                F.lit(None).cast("string").alias("DRIVERS"),
                F.lit(None).cast("string").alias("PASSPORT"),
                F.lit(None).cast("string").alias("BIRTHPLACE"),
                "MARITAL", "RACE", "ETHNICITY", "GENDER"
            ))

# 3. upsert into Silver
if cdf_df.isEmpty():
    # The range held only deletes / maintenance commits – nothing to merge.
    print("Nothing new – Silver already current")
else:
    DeltaTable.forName(spark, silver_tbl).alias("t") \
        .merge(cdf_df.alias("s"), "t.ID = s.ID") \
        .whenMatchedUpdateAll() \
        .whenNotMatchedInsertAll() \
        .execute()

    print("Silver updated → row count:",
          spark.table(silver_tbl).count())

# 4. advance the watermark (also when the slice was empty, so the same range
#    is not rescanned on the next run).  If this statement fails after a
#    successful merge, the next run simply replays the same range.
spark.sql(
    f"ALTER TABLE {silver_tbl} "
    f"SET TBLPROPERTIES ('{watermark_prop}' = '{latest_bronze_ver}')"
)


In [0]:
# 5. Final checks ────────────────────────────────────────────────────
# Uses silver_tbl and latest_bronze_ver from the incremental-refresh cell.
# This notebook defines no state-file location (STATE_PATH), latest_ver or
# SILVER_PATH, so referencing them failed with NameError on a fresh run;
# no state file is written here.
silver_df = spark.table(silver_tbl)

print(f"Silver upsert complete (processed through version {latest_bronze_ver})")
cnt = silver_df.count()
print(f"Silver row count: {cnt}")

# Preview final rows (PHI columns are NULL in Silver)
display(silver_df.limit(5))

In [0]:
## Silver complete  
## Proceed to gold view creation `03_gold/create_gold_views`

In [0]:
%sql
-- Final inspection: every column of the masked Silver patients table.
SELECT
    *
FROM
    kardia_silver.silver_patients