In [0]:
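# Silver Transform: bronze_patients (Change Data Feed) -> silver_patients  
# - Keeps only `insert` & `update_postimage` change rows  
# - Deduplicates by `ID` (last write wins)  
# - Reduces `BIRTHDATE` to `BIRTH_YEAR`  
# - Masks `DEATHDATE`, `FIRST`, `LAST`, `MAIDEN`, `SSN`, `DRIVERS`, `PASSPORT` & `BIRTHPLACE` (sets to NULL)  
# - Writes a Delta Silver table for downstream analytics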

from pyspark.sql import SparkSession, Window, functions as F

In [0]:
# Path config
# RAW_PATH: uploaded source CSV. It is not referenced by any cell visible in
# this notebook.
# NOTE(review): presumably consumed by the Bronze ingest step — confirm before removing.
RAW_PATH = "dbfs:/FileStore/shared_uploads/matthew.databrickslab2@outlook.com/patients_10.csv"
# BRONZE_PATH: Delta table whose Change Data Feed is read by the Silver transform.
BRONZE_PATH = "dbfs:/kardia/bronze/bronze_patients"
# SILVER_PATH: Delta output location; it is deleted and rewritten on every run.
SILVER_PATH = "dbfs:/kardia/silver/silver_patients"

In [0]:
# Build (or reuse) the Spark session for this job: a single shuffle partition
# plus the Delta Lake SQL extension and catalog, then quiet the logs to errors.
_SPARK_CONF = (
    ("spark.sql.shuffle.partitions", "1"),
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
)

session_builder = SparkSession.builder.appName("silver_patients")
for conf_key, conf_value in _SPARK_CONF:
    # Builder.config returns the builder, so settings are applied in order.
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

In [0]:
# 1. Remove any previous Silver table (recursively deletes everything under
#    SILVER_PATH, so no earlier table files are left behind).
dbutils.fs.rm(SILVER_PATH, recurse=True)

# 2. Read the Bronze Change-Data feed starting at version 1
# (version 0 is the metadata commit and never has CDF rows)
# and keep only the rows that carry the new state of a record.
# NOTE(review): `delete` events are filtered out here, so a patient deleted in
# Bronze is still written to Silver with its last inserted/updated values —
# confirm this is intended.
bronze_changes_df = (
    spark.read
         .format("delta")
         .option("readChangeFeed", "true")
         .option("startingVersion", 1)
         .load(BRONZE_PATH)
    .filter(F.col("_change_type").isin("insert", "update_postimage"))
)

# 3. Last write wins: rank each ID's changes by commit version, newest first,
#    and keep only the top-ranked row. dropDuplicates(["ID"]) is not used
#    because it keeps an arbitrary row per ID, not the most recent one.
latest_change_first = (
    Window.partitionBy("ID").orderBy(F.col("_commit_version").desc())
)

# 4. Project the Silver schema: BIRTHDATE is reduced to its year and the
#    identifying columns are replaced with typed NULLs. The select also drops
#    the helper rank column and the CDF metadata columns.
silver_df = (
    bronze_changes_df
    .withColumn("_change_rank", F.row_number().over(latest_change_first))
    .filter(F.col("_change_rank") == 1)
    .select(
        "ID",
        F.year("BIRTHDATE").cast("int").alias("BIRTH_YEAR"),
        F.lit(None).cast("string").alias("DEATHDATE"),
        F.lit(None).cast("string").alias("FIRST"),
        F.lit(None).cast("string").alias("LAST"),
        F.lit(None).cast("string").alias("MAIDEN"),
        F.lit(None).cast("string").alias("SSN"),
        F.lit(None).cast("string").alias("DRIVERS"),
        F.lit(None).cast("string").alias("PASSPORT"),
        F.lit(None).cast("string").alias("BIRTHPLACE"),
        "MARITAL", "RACE", "ETHNICITY", "GENDER"
    )
)

# 5. Overwrite the Silver table
(silver_df.write
    .format("delta")
    .mode("overwrite")
    .save(SILVER_PATH)
)

print("Silver transform complete")


In [0]:
# Sanity check: reload the Silver table from storage, report how many rows
# it holds, and render a small preview.
df_silver = spark.read.load(SILVER_PATH, format="delta")

cnt = df_silver.count()
print(f"Silver table row count: {cnt}")

sample_rows = df_silver.limit(5)
display(sample_rows)

In [0]:
## Silver complete  
## Proceed to gold view creation `03_gold/create_gold_views`