In [0]:
from pyspark.sql import functions as F

# Catalog name that prefixes every table this notebook reads or writes.
CATALOG = "energy_usage_data_platform"
# Schema prefix built from CATALOG; table names are appended as f"{GOLD_SCHEMA}.<table>".
GOLD_SCHEMA = f"{CATALOG}.gold"

In [0]:
def _gold_table(table_name):
    """Return the DataFrame for `table_name` inside the GOLD_SCHEMA schema."""
    return spark.table(f"{GOLD_SCHEMA}.{table_name}")


# Inputs: the hourly fact table plus the two dimensions joined onto it below.
hourly = _gold_table("hourly_region_usage")
dim_date = _gold_table("dim_date")
dim_month_season = _gold_table("dim_month_season")


In [0]:
# Attach date-dimension columns to each hourly row (h.obs_date -> d.date_key).
# Left join: every hourly row is kept; the "d" columns are NULL when no
# date_key matches.
date_match = F.col("h.obs_date") == F.col("d.date_key")

hourly_h = hourly.alias("h")
dates_d = dim_date.alias("d")
h_d = hourly_h.join(dates_d, on=date_match, how="left")

In [0]:
# Look up the season for each row through the date dimension's month
# (d.month -> m.month). Left join: rows whose month has no entry in
# dim_month_season are kept with NULL "m" columns.
month_match = F.col("d.month") == F.col("m.month")

seasons_m = dim_month_season.alias("m")
h_d_s = h_d.join(seasons_m, on=month_match, how="left")

In [0]:
# Derive hour_of_day from the fact table's obs_hour_local.
# This is unconditional: withColumn adds the column, or replaces an existing
# column of the same name, so hour_of_day always reflects obs_hour_local here.
hour_of_day_col = F.hour(F.col("h.obs_hour_local"))
h_d_s = h_d_s.withColumn("hour_of_day", hour_of_day_col)

In [0]:
# Segment label = "<season>_<weekday|weekend>": season comes from
# dim_month_season, the day type from dim_date.is_weekend.
#
# Both dimensions are attached with LEFT joins, so either value can be NULL
# for an hourly row that has no dimension match. concat_ws skips NULL inputs
# and a NULL `when` condition falls through to `otherwise`, so previously a
# row with no date match was labelled just "weekday" and a row with no season
# match lost its season prefix. Label the missing parts explicitly so that
# unmatched rows remain visible in the output instead of being mislabelled.
UNKNOWN_LABEL = "unknown"

is_weekend = F.col("d.is_weekend")

# Season, or the placeholder when the month had no dim_month_season entry.
season_label = F.coalesce(F.col("m.season"), F.lit(UNKNOWN_LABEL))

# The NULL check must come first: a NULL condition is not "true", so without
# it a missing is_weekend would be reported as "weekday".
day_type_label = (
    F.when(is_weekend.isNull(), F.lit(UNKNOWN_LABEL))
    .when(is_weekend, F.lit("weekend"))
    .otherwise(F.lit("weekday"))
)

h_d_s = h_d_s.withColumn(
    "segment",
    F.concat_ws("_", season_label, day_type_label),
)

In [0]:
# One output row per (region, substation, segment, hour of day).
profile_keys = [
    F.col("h.region_id").alias("region_id"),
    F.col("h.substation_id").alias("substation_id"),
    F.col("segment"),
    F.col("hour_of_day"),
]

# Per-group statistics computed from the hourly fact columns.
profile_metrics = [
    # Number of joined input rows that fell into the group.
    F.count("*").alias("num_hours"),
    F.avg("h.avg_kw").alias("avg_kw"),
    # Approximate 90th percentile of avg_kw within the group.
    F.expr("percentile_approx(h.avg_kw, 0.9)").alias("p90_kw"),
    F.avg("h.temp_c").alias("avg_temp_c"),
    F.avg("h.ghi_w_m2").alias("avg_ghi_w_m2"),
]

profiles = h_d_s.groupBy(*profile_keys).agg(*profile_metrics)

In [0]:
# Destination table for the aggregated load profiles.
target_table = f"{GOLD_SCHEMA}.hourly_load_profile_by_segment"

# Replace the table contents on every run; overwriteSchema is set so the
# stored schema is replaced along with the data.
profile_writer = (
    profiles.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
profile_writer.saveAsTable(target_table)

# Read the table back and preview a few rows of what was written.
display(spark.table(target_table).limit(10))