In [0]:
%run ./01_config


In [0]:
from pyspark.sql import functions as F


In [0]:
def rm_dbfs(path):
    """Best-effort recursive delete of a DBFS path.

    Args:
        path: Location passed to ``dbutils.fs.rm`` with ``recurse=True``.

    Returns:
        None. A failed removal is reported on stdout instead of being raised,
        so it never aborts the notebook run.
    """
    try:
        dbutils.fs.rm(path, recurse=True)
    except Exception as exc:
        # Still best-effort, but say what went wrong instead of hiding it.
        # Catching Exception (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit propagate so the cell can be cancelled.
        print(f"rm_dbfs: could not remove {path!r}: {type(exc).__name__}: {exc}")

# Optional clean slate: when RESET_CHECKPOINTS is truthy, remove the streaming
# checkpoint root first and the schema root second.
if RESET_CHECKPOINTS:
    for stale_root in (PATH_CHECKPOINTS, PATH_SCHEMAS):
        rm_dbfs(stale_root)


In [0]:
# ----------------------------
# 1) Smart Meter — file stream
# ----------------------------
# Delta target and streaming checkpoint; both are reused by the write cell below.
smart_meter_bronze_path = f"{PATH_BRONZE}/smart_meter_events"
smart_meter_ckpt = f"{PATH_CHECKPOINTS}/smart_meter"

# "cloudFiles" JSON reader with an explicit schema (SMART_METER_SCHEMA is
# defined outside this notebook section).
sm_reader = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .schema(SMART_METER_SCHEMA)
)
sm_source_path = f"{SRC_SMART_METER}/smart_meter_stream.json"
df_sm_stream = sm_reader.load(sm_source_path)

# add_ingestion_cols is defined outside this section; it is called with the
# stream and the source label "smart_meter".
df_sm_bronze = add_ingestion_cols(df_sm_stream, "smart_meter")

In [0]:
# Append the smart-meter stream to the bronze Delta path. The availableNow
# trigger processes the data that is currently available and then stops, so
# awaitTermination() returns once that backlog has been written.
sm_writer = (
    df_sm_bronze.writeStream
    .format("delta")
    .option("checkpointLocation", smart_meter_ckpt)
    .outputMode("append")
    .trigger(availableNow=True)
)
sm_write = sm_writer.start(smart_meter_bronze_path)
sm_write.awaitTermination()

In [0]:

# Register the bronze Delta path as a table (no-op when it already exists),
# then record a "status" = "loaded" audit entry for it.
sm_table = f"{DB_BRONZE}.smart_meter_events"
spark.sql(f"CREATE TABLE IF NOT EXISTS {sm_table} USING DELTA LOCATION '{smart_meter_bronze_path}'")
audit_table_metrics(sm_table, "BRONZE", "status", "loaded")


In [0]:
# ----------------------------
# 2) Substation Telemetry — file stream
# ----------------------------
# Delta target and streaming checkpoint for the telemetry feed.
telemetry_bronze_path = f"{PATH_BRONZE}/substation_telemetry_events"
telemetry_ckpt = f"{PATH_CHECKPOINTS}/substation_telemetry"

# "cloudFiles" JSON reader with an explicit schema.
tel_reader = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .schema(SUBSTATION_SCHEMA)
)
df_tel_stream = tel_reader.load(f"{SRC_SUBSTATION}/substation_telemetry_stream.json")
df_tel_bronze = add_ingestion_cols(df_tel_stream, "substation_telemetry")

# Append to bronze; availableNow processes the currently available data and
# then stops, so awaitTermination() returns once that is done.
tel_writer = (
    df_tel_bronze.writeStream
    .format("delta")
    .option("checkpointLocation", telemetry_ckpt)
    .outputMode("append")
    .trigger(availableNow=True)
)
tel_write = tel_writer.start(telemetry_bronze_path)
tel_write.awaitTermination()

# Register the Delta path as a table and record the load in the audit log.
tel_table = f"{DB_BRONZE}.substation_telemetry_events"
spark.sql(f"CREATE TABLE IF NOT EXISTS {tel_table} USING DELTA LOCATION '{telemetry_bronze_path}'")
audit_table_metrics(tel_table, "BRONZE", "status", "loaded")


In [0]:
# ----------------------------
# 3) Billing CDC — batch to bronze (raw events)
# CDC feed is line-delimited JSON with __$operation
# ----------------------------
billing_bronze_path = f"{PATH_BRONZE}/billing_cdc_events"

# Batch read of the raw CDC feed (schema is inferred by spark.read.json).
billing_source_path = f"{SRC_BILLING_CDC}/billing_cdc_feed.json"
df_billing_raw = spark.read.json(billing_source_path)
df_billing_bronze = add_ingestion_cols(df_billing_raw, "billing_cdc")

# Incremental runs (RUN_MODE == "INCR") append; any other mode replaces the table.
if RUN_MODE == "INCR":
    billing_write_mode = "append"
else:
    billing_write_mode = "overwrite"
df_billing_bronze.write.format("delta").mode(billing_write_mode).save(billing_bronze_path)

# Register the Delta path as a table and record the load in the audit log.
billing_table = f"{DB_BRONZE}.billing_cdc_events"
spark.sql(f"CREATE TABLE IF NOT EXISTS {billing_table} USING DELTA LOCATION '{billing_bronze_path}'")
audit_table_metrics(billing_table, "BRONZE", "status", "loaded")


In [0]:
# ----------------------------
# 4) Maintenance CSV — batch (3 monthly files)
# ----------------------------
maintenance_bronze_path = f"{PATH_BRONZE}/maintenance_logs"

# Read every CSV under the maintenance source folder; the first row is the
# header and column types are inferred from the data.
maint_reader = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
)
df_maint_raw = maint_reader.load(f"{SRC_MAINTENANCE}/*.csv")
df_maint_bronze = add_ingestion_cols(df_maint_raw, "maintenance_csv")

# Incremental runs (RUN_MODE == "INCR") append; any other mode replaces the table.
if RUN_MODE == "INCR":
    maint_write_mode = "append"
else:
    maint_write_mode = "overwrite"
df_maint_bronze.write.format("delta").mode(maint_write_mode).save(maintenance_bronze_path)

# Register the Delta path as a table and record the load in the audit log.
maint_table = f"{DB_BRONZE}.maintenance_logs"
spark.sql(f"CREATE TABLE IF NOT EXISTS {maint_table} USING DELTA LOCATION '{maintenance_bronze_path}'")
audit_table_metrics(maint_table, "BRONZE", "status", "loaded")


In [0]:
# ----------------------------
# 5) Transformer + Customer Parquet (batch snapshots + delta)
# Transformer is SCD2 snapshot + delta
# Customer is SCD2 snapshot
# ----------------------------
trf_snap_path = f"{PATH_BRONZE}/transformer_master_snapshot"
trf_delta_path = f"{PATH_BRONZE}/transformer_master_delta"
cust_snap_path = f"{PATH_BRONZE}/customer_master_snapshot"

df_trf = spark.read.parquet(f"{SRC_TRANSFORMERS}/transformer_master.parquet")
df_trf_delta = spark.read.parquet(f"{SRC_TRANSFORMERS}/transformer_master_delta_20250601.parquet")
# NOTE(review): the customer file is read from SRC_TRANSFORMERS, not a
# customer-specific source folder — confirm this is intentional.
df_cust = spark.read.parquet(f"{SRC_TRANSFORMERS}/customer_master.parquet")

# One row per master dataset: (frame, source label, Delta path, bronze table name).
master_loads = (
    (df_trf, "transformer_master", trf_snap_path, "transformer_master_snapshot"),
    (df_trf_delta, "transformer_master_delta", trf_delta_path, "transformer_master_delta"),
    (df_cust, "customer_master", cust_snap_path, "customer_master_snapshot"),
)

# Phase 1: overwrite every Delta path with the freshly read data.
for master_df, source_label, target_path, table_name in master_loads:
    add_ingestion_cols(master_df, source_label).write.format("delta").mode("overwrite").save(target_path)

# Phase 2: register each path as a table (no-op when it already exists).
for master_df, source_label, target_path, table_name in master_loads:
    spark.sql(f"CREATE TABLE IF NOT EXISTS {DB_BRONZE}.{table_name} USING DELTA LOCATION '{target_path}'")

# Phase 3: record a "status" = "loaded" audit entry per table.
for master_df, source_label, target_path, table_name in master_loads:
    audit_table_metrics(f"{DB_BRONZE}.{table_name}", "BRONZE", "status", "loaded")


In [0]:
# ----------------------------
# 6) Renewable Production — Auto Loader schema evolution
# ----------------------------
renew_bronze_path = f"{PATH_BRONZE}/renewable_production"
renew_ckpt = f"{PATH_CHECKPOINTS}/renewable"
renew_schema_loc = f"{PATH_SCHEMAS}/renewable"

# Auto Loader infers the JSON schema and tracks it under renew_schema_loc.
# The schema hints pin the three measurement columns to double.
# Per the Databricks Auto Loader docs, "addNewColumns" stops the stream when a
# previously unseen column shows up (after recording it in the schema
# location); rerunning this cell then continues with the evolved schema.
df_renew = (spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.inferColumnTypes", True)
    .option("cloudFiles.schemaEvolutionMode", "addNewColumns")
    .option("cloudFiles.schemaLocation", renew_schema_loc)
    .option("cloudFiles.schemaHints", "production_mw double, curtailment_mw double, battery_storage_mwh double")
    .load(f"{SRC_RENEWABLE}/*.json")
)
df_renew_bronze = add_ingestion_cols(df_renew, "renewable_production")

renew_write = (df_renew_bronze.writeStream
    .format("delta")
    .option("checkpointLocation", renew_ckpt)
    # Without mergeSchema the Delta sink rejects batches that carry columns
    # added by schema evolution, so the evolved stream could never be written.
    .option("mergeSchema", "true")
    .outputMode("append")
    .trigger(availableNow=True)
    .start(renew_bronze_path)
)
renew_write.awaitTermination()

# Register the Delta path as a table and record the load in the audit log.
spark.sql(f"CREATE TABLE IF NOT EXISTS {DB_BRONZE}.renewable_production USING DELTA LOCATION '{renew_bronze_path}'")
audit_table_metrics(f"{DB_BRONZE}.renewable_production", "BRONZE", "status", "loaded")

# Basic rowcount metrics: one "rowcount" audit entry per bronze table, with
# the count passed as a string.
bronze_table_names = (
    "smart_meter_events",
    "substation_telemetry_events",
    "billing_cdc_events",
    "maintenance_logs",
    "transformer_master_snapshot",
    "transformer_master_delta",
    "customer_master_snapshot",
    "renewable_production",
)
for bronze_table in bronze_table_names:
    row_total = spark.table(f"{DB_BRONZE}.{bronze_table}").count()
    audit_table_metrics(f"{DB_BRONZE}.{bronze_table}", "BRONZE", "rowcount", str(row_total))