In [0]:
from pyspark.sql import functions as F, Window

# Catalog that prefixes every table name used in this notebook.
catalog = "energy_usage_data_platform"

# Fully qualified layer schemas ("<catalog>.<layer>").
bronze_schema, silver_schema = f"{catalog}.bronze", f"{catalog}.silver"


In [0]:
# Bronze-layer source table that the cleaning steps start from.
bronze_table = f"{bronze_schema}.smartds_sfo_load"
bronze_df = spark.table(bronze_table)

In [0]:
# Derive the timestamp, date and identifier columns, then shorten the power
# column names.
# NOTE(review): the pattern expects a colon before the fractional seconds
# ("ss:SS") — confirm it matches the bronze `Time` strings.
time_pattern = "yyyy-MM-dd HH:mm:ss:SS"
local_tz = "America/Los_Angeles"  # `Time` is assumed to be local SFO time

# Insertion order matters: later expressions read columns added by earlier ones.
derived_columns = {
    "obs_time_local": F.to_timestamp(F.col("Time"), time_pattern),
    "obs_time_utc": F.to_utc_timestamp(F.col("obs_time_local"), local_tz),
    "obs_date": F.to_date(F.col("obs_time_local")),
    "year": F.year(F.col("obs_date")).cast("int"),
    "building_id": F.col("building").cast("string"),
}
renamed_columns = {
    "total_site_electricity_kw": "total_kw",
    "total_site_electricity_kvar": "total_kvar",
}

df = bronze_df
for col_name, col_expr in derived_columns.items():
    df = df.withColumn(col_name, col_expr)
for old_name, new_name in renamed_columns.items():
    df = df.withColumnRenamed(old_name, new_name)

# Project down to the columns a utility would realistically see.
utility_columns = [
    # time
    "obs_time_local",
    "obs_time_utc",
    "obs_date",
    "year",
    # location / identity
    "city",
    "substation_id",
    "building_id",
    # measurements
    "total_kw",
    "total_kvar",
    "pf",
    # lineage
    "source_file",
    "ingest_ts",
]
df = df.select(*utility_columns)

# Basic QC: keep only rows that pass every sanity check. A row whose checked
# column is NULL is dropped too, because a comparison against NULL is not true.
has_utc_time = F.col("obs_time_utc").isNotNull()
non_negative_load = F.col("total_kw") >= 0
# Bounds are inclusive; presumably the 1.1 leaves slack around the physical
# [-1, 1] power-factor range — TODO confirm.
plausible_pf = F.col("pf").between(-1.1, 1.1)

df = df.where(has_utc_time & non_negative_load & plausible_pf)

# Deduplicate: keep one row per (substation, building, UTC timestamp),
# preferring the most recently ingested copy.
# NOTE(review): obs_time_utc is derived from a local wall-clock string; if the
# source observes DST, the repeated fall-back hour produces identical keys and
# one of the two readings is discarded here — confirm against the source data.
dedup_keys = ["substation_id", "building_id", "obs_time_utc"]

# Ordering by ingest_ts alone is not unique (e.g. rows from one load may share
# the same value), and row_number() over tied rows picks an arbitrary winner
# that can differ between runs. Break ties by source_file and then by every
# remaining column so the surviving row is reproducible.
order_priority = ["ingest_ts", "source_file"]
other_columns = [c for c in df.columns if c not in dedup_keys + order_priority]

w = Window.partitionBy(*dedup_keys).orderBy(
    *[F.col(c).desc() for c in order_priority + other_columns]
)

df = (
    df
    .withColumn("rn", F.row_number().over(w))  # 1 = preferred row in its group
    .filter(F.col("rn") == 1)
    .drop("rn")  # helper column is not part of the output schema
)


In [0]:
# Save the cleaned frame as the silver table: overwrite mode with
# overwriteSchema enabled, partitioned by obs_date.
silver_table = f"{silver_schema}.smartds_usage_clean"

silver_writer = (
    df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("obs_date")
)
silver_writer.saveAsTable(silver_table)

# Read the saved table back and preview a few rows.
display(spark.table(silver_table).limit(5))