In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, StringType
from functools import reduce

In [0]:
# ---- CONFIG ----
# Source location of the SMART-DS dataset (bucket + versioned prefix).
SMART_DS_BUCKET = "s3://oedi-data-lake"
SMART_DS_PREFIX = "SMART-DS/v1.0"

# Ingestion scope: the source is walked as <bucket>/<prefix>/<year>/<city>/<substation>/.
YEARS = ["2016"]
CITY = "SFO"
# Only this substation is ingested; the enumeration cell treats None as "all substations".
SUBSTATION = "P35U"

# Target table coordinates (catalog.schema.table).
CATALOG = "energy_data_platform_project"
SCHEMA = "bronze"
# NOTE(review): VOLUME is not referenced by any cell visible in this notebook — confirm it is still needed.
VOLUME = "smartds_raw"

# Plain string literal: there is nothing to interpolate, so no f-prefix.
# NOTE(review): BRONZE_PATH is not referenced by the visible write cell, which uses
# saveAsTable(BRONZE_TABLE) without an explicit path — confirm whether it should be.
BRONZE_PATH = "s3://energy-data-platform-project-bucket/etl/bronze/smartds_sfo_load"
BRONZE_TABLE = f"{CATALOG}.{SCHEMA}.smartds_sfo_load"

# Dynamic mode: an overwrite of a partitioned table replaces only the partitions
# present in the incoming data instead of truncating the whole table.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

In [0]:
# End-use categories; each one contributes a "<prefix>_kw" and a "<prefix>_kvar" column.
_END_USE_PREFIXES = [
    "heating",
    "cooling",
    "lighting",
    "fans",
    "pumps",
    "water_systems",
    "refrigeration",
    "motors",
    "plug_loads",
    "clothes_dryer",
    "clothes_washer",
    "stove",
    "dishwasher",
]

# Columns that must be DoubleType in every file: the site totals and "pf" first,
# then the kw/kvar pair of each end use, in the order listed above.
NUM_COLS = [
    "total_site_electricity_kw",
    "total_site_electricity_kvar",
    "pf",
] + [
    f"{prefix}_{suffix}"
    for prefix in _END_USE_PREFIXES
    for suffix in ("kw", "kvar")
]

def cast_numeric_columns(df):
    """
    Normalize the NUM_COLS columns of ``df`` to DoubleType.

    For each column in NUM_COLS:
      - if present, cast to DoubleType (the column keeps its position)
      - if missing in this file, add as NULL DoubleType (appended in NUM_COLS order)
    Columns that are not listed in NUM_COLS are left untouched.
    Ensures a consistent numeric schema across all files.

    All columns are applied in one ``withColumns`` call (Spark 3.3+) rather than
    one ``withColumn`` call per column: every ``withColumn`` adds a projection,
    and chaining one per NUM_COLS entry for every file needlessly inflates the plan.

    Args:
        df: Spark DataFrame to normalize.

    Returns:
        A new DataFrame containing every NUM_COLS column as DoubleType.
    """
    # Snapshot the column names once; membership is decided against the input schema.
    existing = set(df.columns)
    double_type = DoubleType()

    normalized = {
        name: (F.col(name) if name in existing else F.lit(None)).cast(double_type)
        for name in NUM_COLS
    }
    if not normalized:
        # Nothing to normalize (empty NUM_COLS): hand the frame back unchanged.
        return df
    return df.withColumns(normalized)


def load_single_parquet(path: str, year: str, city: str, substation_id: str):
    """
    Load one SMART-DS parquet file as a Bronze-ready DataFrame.

    The file is read with ``spark.read.parquet``, its numeric columns are
    normalized by ``cast_numeric_columns``, and five metadata columns are
    added: ``source_file``, ``city``, ``year``, ``substation_id`` (taken from
    the arguments) and ``ingest_ts`` (``current_timestamp()``).

    Args:
        path: Location of the parquet file; also stored verbatim in ``source_file``.
        year: Value stored in the ``year`` column (cast to string).
        city: Value stored in the ``city`` column (cast to string).
        substation_id: Value stored in the ``substation_id`` column (cast to string).

    Returns:
        The normalized DataFrame with the metadata columns added.
    """
    normalized = cast_numeric_columns(spark.read.parquet(path))

    # Metadata comes from the caller's arguments, not from parsing the path.
    # Order matters: it is the order in which the columns are added.
    string_type = StringType()
    metadata_columns = (
        ("source_file", F.lit(path)),
        ("city", F.lit(city).cast(string_type)),
        ("year", F.lit(year).cast(string_type)),
        ("substation_id", F.lit(substation_id).cast(string_type)),
        ("ingest_ts", F.current_timestamp()),
    )

    enriched = normalized
    for column_name, column_expr in metadata_columns:
        enriched = enriched.withColumn(column_name, column_expr)
    return enriched

In [0]:
# ---- 1. Enumerate only the target substation's load_data parquet files ----
# One DataFrame per parquet file is collected here and unioned in the next step.
all_dfs = []

for year in YEARS:
    year_root = f"{SMART_DS_BUCKET}/{SMART_DS_PREFIX}/{year}/{CITY}"
    print(f"Listing substations under {year_root}")

    for entry in dbutils.fs.ls(year_root):
        # Drop any trailing slash from the listing name, e.g. "P10U/" -> "P10U".
        substation_id = entry.name.rstrip("/")

        # SUBSTATION acts as an optional filter: None keeps every substation,
        # otherwise only the exactly matching one is processed.
        is_selected = SUBSTATION is None or substation_id == SUBSTATION
        if not is_selected:
            continue

        # entry.path is concatenated as-is, so it is expected to end with "/".
        load_dir = f"{entry.path}load_data/"
        try:
            listing = dbutils.fs.ls(load_dir)
        except Exception as e:
            # Best effort: a substation whose load_data/ cannot be listed is reported and skipped.
            print(f"Skipping {load_dir}: {e}")
            continue

        parquet_files = [info for info in listing if info.name.endswith(".parquet")]
        print(f"  {year} {substation_id}: {len(parquet_files)} parquet files")

        all_dfs.extend(
            load_single_parquet(info.path, year, CITY, substation_id)
            for info in parquet_files
        )

print(f"Total files loaded: {len(all_dfs)}")

# Fail fast here rather than letting the union step fail on an empty list.
if len(all_dfs) == 0:
    raise Exception(
        f"No SMART-DS parquet files found for years={YEARS}, city={CITY}, substation={SUBSTATION}"
    )


In [0]:
# ---- 2. Union all per-file DataFrames into one Bronze DataFrame ----
def _union_by_name(left, right):
    """Union two DataFrames by column name, tolerating columns missing on either side."""
    return left.unionByName(right, allowMissingColumns=True)


# Fold the per-file frames pairwise, left to right, into a single DataFrame.
bronze_df = reduce(_union_by_name, all_dfs)

# count() is an action: it executes the unioned plan to produce the row count.
bronze_row_count = bronze_df.count()
print("Bronze row count:", bronze_row_count)

bronze_sample = bronze_df.limit(5)
display(bronze_sample)


In [0]:
# ---- 3. Write Bronze Delta & register table ----
# Configure the writer first: Delta format, overwrite mode, partitioned by "year".
bronze_writer = bronze_df.write.format("delta").mode("overwrite").partitionBy("year")

# saveAsTable both writes the data and registers it under BRONZE_TABLE.
bronze_writer.saveAsTable(BRONZE_TABLE)

# Read the table back by name and preview a few rows as a sanity check.
bronze_table_preview = spark.table(BRONZE_TABLE).limit(5)
display(bronze_table_preview)