In [0]:
from pyspark.sql import functions as F

# Catalog that holds this notebook's tables.
CATALOG = "energy_usage_data_platform"
# Fully-qualified "<catalog>.gold" schema; used as the prefix for every table
# this notebook reads or writes.
GOLD_SCHEMA = ".".join((CATALOG, "gold"))

In [0]:
# Input table for the analysis. Later cells read the temp_c, avg_kw, region_id
# and substation_id columns from it.
hourly = spark.read.table(f"{GOLD_SCHEMA}.hourly_region_usage")

In [0]:
# Temperature bin edges, ascending, in the same units as the temp_c column.
# Tune this list to change the binning: N edges produce N + 1 bins labelled
# "<e0", "e0-e1", ..., ">=e_last".
# Defaults give: <5, 5-10, 10-15, 15-20, 20-25, 25-30, >=30
TEMP_BIN_EDGES = [5, 10, 15, 20, 25, 30]

temp_c_col = F.col("temp_c")

# when() branches are evaluated in order, so each branch only needs its upper
# bound: values below the previous edge were already claimed by an earlier
# branch. Every bin is therefore lower-inclusive / upper-exclusive.
temp_bin = F.when(temp_c_col < TEMP_BIN_EDGES[0], f"<{TEMP_BIN_EDGES[0]}")
for lower_edge, upper_edge in zip(TEMP_BIN_EDGES, TEMP_BIN_EDGES[1:]):
    temp_bin = temp_bin.when(temp_c_col < upper_edge, f"{lower_edge}-{upper_edge}")
temp_bin = temp_bin.otherwise(f">={TEMP_BIN_EDGES[-1]}")

binned = (
    hourly
    # Keep only rows with a usable temperature. NaN has to be excluded
    # explicitly: it is not NULL, and Spark orders NaN above every other value,
    # so it would fall through to the top (">=") bin and turn that bin's
    # average temperature into NaN.
    # NOTE(review): isnan assumes temp_c is a numeric column — confirm.
    .filter(temp_c_col.isNotNull() & ~F.isnan(temp_c_col))
    .withColumn("temp_bin", temp_bin)
)

In [0]:
# Load summary per (region_id, substation_id, temp_bin):
#   num_hours  - number of input rows in the group
#                (presumably one row per hour — verify against the source table)
#   avg_kw     - mean of avg_kw
#   median_kw  - approximate 50th percentile of avg_kw
#   p90_kw     - approximate 90th percentile of avg_kw
#   avg_temp_c - mean of temp_c within the bin
bin_stats = binned.groupBy("region_id", "substation_id", "temp_bin").agg(
    F.count(F.lit(1)).alias("num_hours"),
    F.mean("avg_kw").alias("avg_kw"),
    F.percentile_approx("avg_kw", 0.5).alias("median_kw"),
    F.percentile_approx("avg_kw", 0.9).alias("p90_kw"),
    F.mean("temp_c").alias("avg_temp_c"),
)

In [0]:
# Persist the per-bin statistics. The target table is replaced on every run
# (mode "overwrite"), and overwriteSchema lets its column set change with it.
bin_stats.write.saveAsTable(
    f"{GOLD_SCHEMA}.temp_bin_load_stats",
    mode="overwrite",
    overwriteSchema="true",
)

# Read the table back and preview a few rows as a check on what was written.
display(spark.table(f"{GOLD_SCHEMA}.temp_bin_load_stats").limit(10))