In [0]:
from pyspark.sql import functions as F, types as T
import uuid


In [0]:
# Notebook parameters. Each call declares a widget name and its default value.

# Root folder that holds one sub-folder per source feed.
dbutils.widgets.text(
    "source_root",
    "/Workspace/Users/lokeshmittal2019@outlook.com/GridFusion-DataHub/data_sources",
)

# Run mode selector: FULL (default) or INCR.
dbutils.widgets.dropdown(
    "run_mode",
    "FULL",
    ["FULL", "INCR"],
)

# String flag ("false"/"true"); converted to a bool when the widget is read.
dbutils.widgets.dropdown(
    "reset_checkpoints",
    "false",
    ["false", "true"],
)

# Prefix shared by the database names and the storage root derived from it.
dbutils.widgets.text(
    "db_prefix",
    "gridfusion",
)

In [0]:
# Free-text widgets: trim surrounding whitespace from the raw value.
SOURCE_ROOT = dbutils.widgets.get("source_root").strip()
DB_PREFIX = dbutils.widgets.get("db_prefix").strip()

# Dropdown widgets: run_mode is used as-is; reset_checkpoints becomes a bool
# that is True only when the selected value is "true" (case-insensitive).
RUN_MODE = dbutils.widgets.get("run_mode")
RESET_CHECKPOINTS = dbutils.widgets.get("reset_checkpoints").lower() == "true"

In [0]:
# Fail fast, with an actionable message, on widget values that cannot work.
# Explicit raises are used instead of `assert` so the checks always run and
# the failure names the offending widget.
if not SOURCE_ROOT:
    raise ValueError("Widget 'source_root' must not be empty.")

# DB_PREFIX is interpolated, unquoted, into SQL identifiers and storage paths
# later in the notebook, so it must be non-empty and limited to characters
# that are safe there. Dots are allowed so a catalog-qualified prefix works.
if not DB_PREFIX:
    raise ValueError("Widget 'db_prefix' must not be empty.")
if not all(ch.isalnum() or ch in "_." for ch in DB_PREFIX):
    raise ValueError(
        f"Widget 'db_prefix' may only contain letters, digits, '_' and '.': {DB_PREFIX!r}"
    )

In [0]:
# One database per layer, all named "<DB_PREFIX>_<layer>".
DB_BRONZE, DB_SILVER, DB_GOLD, DB_AUDIT = (
    f"{DB_PREFIX}_{layer}" for layer in ("bronze", "silver", "gold", "audit")
)

# Create each database if it does not exist yet, so this cell can be re-run.
for db in (DB_BRONZE, DB_SILVER, DB_GOLD, DB_AUDIT):
    create_stmt = f"CREATE DATABASE IF NOT EXISTS {db}"
    spark.sql(create_stmt)

In [0]:
# Storage root for this prefix; every location below is a child of it.
DATA_ROOT = "dbfs:/FileStore/sparkwars/" + DB_PREFIX

# Per-layer data locations.
PATH_BRONZE = DATA_ROOT + "/bronze"
PATH_SILVER = DATA_ROOT + "/silver"
PATH_GOLD = DATA_ROOT + "/gold"
PATH_AUDIT = DATA_ROOT + "/audit"

# Bookkeeping locations (underscore-prefixed folder names).
PATH_CHECKPOINTS = DATA_ROOT + "/_checkpoints"
PATH_SCHEMAS = DATA_ROOT + "/_schemas"

In [0]:
# Echo the resolved bronze location as a quick check of the derived paths.
print(PATH_BRONZE)

In [0]:
# Source feed locations: one sub-folder of SOURCE_ROOT per feed, addressed
# with the "file:" URI scheme.
SRC_SMART_METER = "file:{}/smart_meter_kafka".format(SOURCE_ROOT)
SRC_SUBSTATION = "file:{}/substation_iot".format(SOURCE_ROOT)
SRC_BILLING_CDC = "file:{}/billing_cdc".format(SOURCE_ROOT)
SRC_MAINTENANCE = "file:{}/maintenance_csv".format(SOURCE_ROOT)
SRC_TRANSFORMERS = "file:{}/transformer_parquet".format(SOURCE_ROOT)
SRC_RENEWABLE = "file:{}/renewable_autoloader".format(SOURCE_ROOT)


In [0]:
# Random UUID generated once when this cell runs; stamped on ingested rows
# and on audit records.
BATCH_ID = str(uuid.uuid4())


def add_ingestion_cols(df, source_system: str):
    """Return ``df`` with the ingestion-lineage columns attached.

    Columns are added in this order (an existing column of the same name is
    replaced, as with any ``withColumn`` call):

    * ``_ingestion_ts``  - ``current_timestamp()``
    * ``_source_system`` - the literal ``source_system`` argument
    * ``_batch_id``      - the notebook-level ``BATCH_ID``
    * ``_source_file``   - ``input_file_name()``

    Args:
        df: DataFrame to enrich.
        source_system: Label identifying where the rows came from.

    Returns:
        A new DataFrame with the four columns above.
    """
    lineage_columns = (
        ("_ingestion_ts", F.current_timestamp()),
        ("_source_system", F.lit(source_system)),
        ("_batch_id", F.lit(BATCH_ID)),
        ("_source_file", F.input_file_name()),
    )
    enriched = df
    for column_name, column_expr in lineage_columns:
        enriched = enriched.withColumn(column_name, column_expr)
    return enriched

In [0]:
# -----------------------------
# Spark tuning (safe defaults for CE)
# NOTE(review): "CE" presumably means Databricks Community Edition - confirm.
# -----------------------------
# Set the number of shuffle partitions for this Spark session to 64.
spark.conf.set("spark.sql.shuffle.partitions", "64")
# The two Delta write options below are intentionally left disabled;
# uncomment them to turn the corresponding setting on.
# spark.conf.set("spark.databricks.delta.optimizeWrite.enabled", "true")
# spark.conf.set("spark.databricks.delta.autoCompact.enabled", "true")

In [0]:
# -----------------------------
# Schemas (explicit contracts)
# -----------------------------

# Explicit schema for smart-meter events. Every field is nullable, and
# `timestamp` is deliberately kept as a string at this stage.
SMART_METER_SCHEMA = T.StructType([
    T.StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("event_id", T.StringType),
        ("meter_id", T.StringType),
        ("timestamp", T.StringType),  # parse later
        ("reading_type", T.StringType),
        ("kwh_consumed", T.DoubleType),
        ("voltage_v", T.DoubleType),
        ("current_a", T.DoubleType),
        ("power_factor", T.DoubleType),
        ("region", T.StringType),
        ("state", T.StringType),
    )
])

# Explicit schema for substation telemetry. Every field is nullable, and
# `timestamp` is kept as a string, as in the smart-meter schema.
SUBSTATION_SCHEMA = (
    T.StructType()
    # Identifiers and event time
    .add("telemetry_id", T.StringType(), True)
    .add("substation_id", T.StringType(), True)
    .add("transformer_id", T.StringType(), True)
    .add("timestamp", T.StringType(), True)
    # Numeric measurements
    .add("load_pct", T.DoubleType(), True)
    .add("temperature_c", T.DoubleType(), True)
    .add("oil_temperature_c", T.DoubleType(), True)
    .add("ambient_temperature_c", T.DoubleType(), True)
    .add("vibration_mm_s", T.DoubleType(), True)
    .add("dissolved_gas_ppm", T.DoubleType(), True)
    # Alarm code and location attributes
    .add("alarm_code", T.StringType(), True)
    .add("region", T.StringType(), True)
    .add("state", T.StringType(), True)
)

In [0]:
def audit_table_metrics(table_fqn: str, layer: str, metric_name: str, metric_value: str):
    """Append one metric record for the current batch to the audit metrics table.

    Args:
        table_fqn: Name of the table the metric describes.
        layer: Layer label stored with the record.
        metric_name: Name of the metric being recorded.
        metric_value: Metric value, stored in a string column.

    The record is tagged with the notebook-level ``BATCH_ID`` and appended, in
    Delta format, to ``{DB_AUDIT}.table_metrics``.
    """
    record_schema = "batch_id string, layer string, table_fqn string, metric_name string, metric_value string"
    record = (BATCH_ID, layer, table_fqn, metric_name, metric_value)
    target_table = f"{DB_AUDIT}.table_metrics"

    record_df = spark.createDataFrame([record], schema=record_schema)
    record_df.write.format("delta").mode("append").saveAsTable(target_table)

# Create the per-batch metrics table in the audit database if it is missing.
# All five columns are strings, including metric_value.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {DB_AUDIT}.table_metrics (
  batch_id STRING,
  layer STRING,
  table_fqn STRING,
  metric_name STRING,
  metric_value STRING
) USING DELTA
""")

# Create the data-quality violations table in the audit database if it is
# missing. Each row holds a rule name, severity, violation count, sample keys
# and a creation timestamp for a given batch, layer and dataset.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {DB_AUDIT}.dq_violations (
  batch_id STRING,
  layer STRING,
  dataset STRING,
  rule_name STRING,
  severity STRING,
  violation_count BIGINT,
  sample_keys STRING,
  created_ts TIMESTAMP
) USING DELTA
""")

In [0]:
%python
spark.sql(f"""select * from {DB_AUDIT}.dq_violations""")