In [0]:
from pyspark.sql import functions as F

# Root catalog under which every schema used by this notebook lives.
CATALOG = "energy_usage_data_platform"

# Schema prefixes; table names are appended to them as "<schema>.<table>".
GOLD_SCHEMA = f"{CATALOG}.gold"      # layer this notebook writes to
SILVER_SCHEMA = f"{CATALOG}.silver"  # layer this notebook reads from

In [0]:
# Input for the daily roll-up; the aggregation cell below counts each row
# of this table as one hourly observation.
usage_weather_table = f"{SILVER_SCHEMA}.usage_weather_joined"
uw = spark.table(usage_weather_table)


In [0]:
# Number of hourly rows a fully covered day is expected to contribute.
EXPECTED_HOURS_PER_DAY = 24.0

# One output row per (date, year, region, city, substation).
daily_group_keys = ["obs_date", "year", "region_id", "city", "substation_id"]

# (aggregate function, source column, output alias).
# The order of this list fixes the order of the aggregated output columns.
daily_agg_specs = [
    # Energy (kWh / kVARh)
    (F.sum, "hourly_kwh", "daily_kwh_total"),
    (F.sum, "hourly_kvarh", "daily_kvarh_total"),
    # Demand (kW)
    (F.max, "max_kw", "daily_kw_peak"),
    (F.min, "min_kw", "daily_kw_min"),
    (F.avg, "avg_kw", "daily_kw_avg"),
    # Demand (kVAR)
    (F.max, "max_kvar", "daily_kvar_peak"),
    (F.min, "min_kvar", "daily_kvar_min"),
    (F.avg, "avg_kvar", "daily_kvar_avg"),
    # Power factor
    (F.avg, "avg_pf", "avg_daily_pf"),
    # Weather summaries
    (F.avg, "temp_c", "avg_daily_temp_c"),
    (F.max, "temp_c", "max_daily_temp_c"),
    (F.min, "temp_c", "min_daily_temp_c"),
    (F.avg, "dewpoint_c", "avg_daily_dewpoint_c"),
    (F.avg, "wind_speed_ms", "avg_daily_wind_speed_ms"),
    (F.sum, "precip_mm_total_mm", "daily_precip_mm"),
    # Solar summaries
    (F.avg, "ghi_w_m2", "avg_daily_ghi_w_m2"),
    (F.avg, "dni_w_m2", "avg_daily_dni_w_m2"),
    (F.avg, "dhi_w_m2", "avg_daily_dhi_w_m2"),
]

# Row count comes first, followed by every spec-driven aggregate.
daily_agg_exprs = [F.count("*").alias("num_hours")]
daily_agg_exprs += [
    agg_fn(source_col).alias(output_col)
    for agg_fn, source_col, output_col in daily_agg_specs
]

daily = uw.groupBy(*daily_group_keys).agg(*daily_agg_exprs)

In [0]:
# Share of the expected hourly rows that were actually present for the day.
usage_coverage_expr = F.col("num_hours") / F.lit(EXPECTED_HOURS_PER_DAY)

# Load factor = energy / (observed hours * peak demand). It is only defined
# when both the peak and the row count are strictly positive; otherwise the
# column is left null.
has_positive_peak_and_hours = (F.col("daily_kw_peak") > 0) & (F.col("num_hours") > 0)
load_factor_expr = F.when(
    has_positive_peak_and_hours,
    F.col("daily_kwh_total") / (F.col("num_hours") * F.col("daily_kw_peak")),
).otherwise(F.lit(None))

daily = daily.withColumn("usage_coverage", usage_coverage_expr)
daily = daily.withColumn("load_factor", load_factor_expr)

In [0]:
# Desired column order for the gold table.
final_cols = [
    # Date / calendar
    "obs_date",
    "year",

    # Location
    "region_id",
    "city",
    "substation_id",

    # Usage counts & coverage
    "num_hours",
    "usage_coverage",

    # Energy (kWh / kVARh)
    "daily_kwh_total",
    "daily_kvarh_total",

    # Demand (kW / kVAR)
    "daily_kw_peak",
    "daily_kw_min",
    "daily_kw_avg",
    "daily_kvar_peak",
    "daily_kvar_min",
    "daily_kvar_avg",

    # Power factor & load factor
    "avg_daily_pf",
    "load_factor",

    # Weather
    "avg_daily_temp_c",
    "max_daily_temp_c",
    "min_daily_temp_c",
    "avg_daily_dewpoint_c",
    "avg_daily_wind_speed_ms",
    "daily_precip_mm",

    # Solar
    "avg_daily_ghi_w_m2",
    "avg_daily_dni_w_m2",
    "avg_daily_dhi_w_m2",
]

# The projection stays best-effort (only columns that exist are selected),
# but any expected column that is absent is reported. The write cell below
# uses overwriteSchema, so a column dropped here would otherwise vanish from
# the gold table without any signal.
available_cols = set(daily.columns)
missing_cols = [c for c in final_cols if c not in available_cols]
if missing_cols:
    print(
        "WARNING: expected columns are missing from `daily` and will be "
        f"omitted from the output: {missing_cols}"
    )

existing_cols = [c for c in final_cols if c in available_cols]
daily_final = daily.select(*existing_cols)

In [0]:
gold_table_name = f"{GOLD_SCHEMA}.daily_region_usage"

# Full refresh of the gold table: existing data is replaced and the stored
# schema is allowed to change to match daily_final.
# NOTE(review): partitioning by obs_date creates one partition per calendar
# day -- confirm this granularity is intended for the table's size.
gold_writer = (
    daily_final.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("obs_date")
)
gold_writer.saveAsTable(gold_table_name)

In [0]:
# Read back what was just written and reuse the handle for both checks.
gold_daily = spark.table(f"{GOLD_SCHEMA}.daily_region_usage")

# Preview: the 100 earliest rows by observation date.
display(gold_daily.orderBy("obs_date").limit(100))

# Sanity check: total energy per year.
yearly_kwh_totals = gold_daily.groupBy("year").agg(F.sum("daily_kwh_total"))
yearly_kwh_totals.show()