In [0]:
# 02_silver_encounters_transform.ipynb
# Stream new rows from Bronze (CDF) into Silver.

from pyspark.sql import functions as F

# Paths and table names
BRONZE_TABLE = "kardia_bronze.bronze_encounters"
SILVER_DB = "kardia_silver"
# Fully qualified name of the Silver encounters table this notebook writes to.
SILVER_ENCOUNTERS_TABLE = f"{SILVER_DB}.silver_encounters"
# Backward-compatible alias: the constant was originally named after patients
# even though it has always pointed at the encounters table.
SILVER_PATIENTS_TABLE = SILVER_ENCOUNTERS_TABLE
CHECKPOINT_PATH = "dbfs:/kardia/_checkpoints/silver_encounters"

# Minimize shuffle overhead for small test datasets
spark.conf.set("spark.sql.shuffle.partitions", "1")

In [0]:
# 1A. Create the Silver database if it is missing (no-op when it already exists)
create_db_sql = f"CREATE DATABASE IF NOT EXISTS {SILVER_DB}"
spark.sql(create_db_sql)

In [0]:
# 1B. Read from Bronze CDF (SCD Type 1)
# Only change rows of type "insert" and "update_postimage" are kept; every
# other change type in the feed is filtered out.
# NOTE(review): the Silver write later in this notebook uses append mode, so an
# "update_postimage" row is added as a new row instead of overwriting the
# earlier one - confirm this matches the intended SCD Type 1 behaviour.
bronze_changes = (
    spark.readStream
        .format("delta")
        .options(readChangeFeed="true")
        .table(BRONZE_TABLE)
)
bronze_cdf = bronze_changes.where(
    F.col("_change_type").isin(["insert", "update_postimage"])
)

In [0]:
# 2. Rename and enrich into the 11-column Silver schema
# Output columns, in the order they are selected for the Silver table.
SILVER_COLUMNS = [
    "EncounterID", "PatientID",
    "START_TS", "STOP", "START_DATE",
    "CODE", "DESCRIPTION",
    "BASE_ENCOUNTER_COST", "TOTAL_CLAIM_COST",
    "REASONCODE", "REASONDESCRIPTION",
]

# Step 1: rename the Bronze columns to their Silver names.
encounters_renamed = (
    bronze_cdf
        .withColumnRenamed("ID", "EncounterID")
        .withColumnRenamed("PATIENT", "PatientID")
        .withColumnRenamed("DATE", "START")
)

# Step 2: derive the event timestamp and the date used for partitioning.
# NOTE(review): no stateful operator (aggregation, dedup, join) is visible
# downstream in this notebook, so the watermark appears to have no effect -
# confirm before relying on it or removing it.
encounters_timed = (
    encounters_renamed
        .withColumn("START_TS", F.to_timestamp(F.col("START")))
        .withWatermark("START_TS", "1 day")
        .withColumn("START_DATE", F.to_date(F.col("START_TS")))
)

# Step 3: add constant placeholder columns (null STOP, zero costs).
encounters_enriched = (
    encounters_timed
        .withColumn("STOP", F.lit(None).cast("timestamp"))
        .withColumn("BASE_ENCOUNTER_COST", F.lit(0.0).cast("double"))
        .withColumn("TOTAL_CLAIM_COST", F.lit(0.0).cast("double"))
)

# Step 4: project to the Silver columns; everything else (including the
# renamed START column) is dropped here.
silver_ready = encounters_enriched.select(*SILVER_COLUMNS)

In [0]:
# 3. Write to Silver table (append mode)
# Configure the Delta sink first: partitioned by START_DATE, with the
# checkpoint location and schema merging passed as writer options.
silver_writer = (
    silver_ready.writeStream
        .format("delta")
        .outputMode("append")
        .partitionBy("START_DATE")
        .options(checkpointLocation=CHECKPOINT_PATH, mergeSchema="true")
        .trigger(processingTime="30 seconds")
)

# toTable() starts the streaming query and returns its handle; the query keeps
# running with a 30-second processing-time trigger until it is stopped.
query = silver_writer.toTable(SILVER_PATIENTS_TABLE)

print("Silver-encounters continuous stream started.")
print(query.status)

In [0]:
# Print the streaming query's status as of the moment this cell is run;
# re-run the cell to take a fresh reading.
print(query.status)

In [0]:
# Uncomment and run the line below to stop the stream when you are done.
# query.stop()