In [0]:
from pyspark.sql import functions as F

# Catalog that holds every layer of this project; the layer schemas
# below are addressed as "<catalog>.<layer>".
CATALOG = "energy_data_platform_project"
SILVER_SCHEMA = ".".join((CATALOG, "silver"))  # source layer read by this notebook
GOLD_SCHEMA = ".".join((CATALOG, "gold"))      # target layer written by this notebook

In [0]:
# Input frame: the `usage_weather_joined` table from the silver schema.
usage_weather_table = f"{SILVER_SCHEMA}.usage_weather_joined"
uw = spark.table(usage_weather_table)


In [0]:
def _any_not_null(column_names):
    """Return a boolean Column that is true when at least one named column is non-null.

    The per-column `isNotNull()` checks are OR-ed together left to right.
    """
    remaining = iter(column_names)
    flag = F.col(next(remaining)).isNotNull()
    for column_name in remaining:
        flag = flag | F.col(column_name).isNotNull()
    return flag


# Basic coverage flags: each flag marks rows where any of the listed
# measurements is present.
hourly = (
    uw
    .withColumn("has_weather", _any_not_null(["temp_c", "dewpoint_c", "wind_speed_ms"]))
    .withColumn("has_solar", _any_not_null(["ghi_w_m2", "dni_w_m2", "dhi_w_m2"]))
)

In [0]:
# Columns to publish, in output order. Names that are not present on
# `hourly` are skipped (best effort) and reported below.
final_cols = [
    # Time / grain
    "obs_hour_local",
    "obs_hour_utc",
    "obs_date",

    # Region / location
    "region_id",
    "city",
    "substation_id",
    "station_id",
    "solar_site_id",

    # Usage metrics (power)
    "avg_kw",
    "max_kw",
    "min_kw",
    "avg_kvar",
    "max_kvar",
    "min_kvar",
    "avg_pf",
    "usage_sample_count",

    # Usage energy metrics
    "hourly_kwh",
    "hourly_kvarh",

    # Weather metrics
    "temp_c",
    "dewpoint_c",
    "wind_speed_ms",
    "visibility_m",
    # NOTE(review): this name repeats the "mm" unit suffix — confirm it
    # matches the upstream column; a mismatch means it is skipped.
    "precip_mm_total_mm",
    "weather_sample_count",

    # Solar metrics
    "ghi_w_m2",
    "dni_w_m2",
    "dhi_w_m2",
    "solar_sample_count",

    # Coverage flags
    "has_weather",
    "has_solar",
]

# Resolve the frame's column names once instead of once per candidate.
available_cols = set(hourly.columns)
existing_cols = [c for c in final_cols if c in available_cols]

# Surface skipped columns so an upstream rename or a typo in `final_cols`
# does not silently shrink the published schema.
missing_cols = [c for c in final_cols if c not in available_cols]
if missing_cols:
    print(f"WARNING: columns not found on `hourly` and skipped: {missing_cols}")

hourly_final = hourly.select(*existing_cols)


In [0]:
# Destination table in the gold schema; used for both the write and the preview.
gold_table_name = f"{GOLD_SCHEMA}.hourly_region_usage"

# Configure the writer: replace existing contents, partition by `obs_date`.
# NOTE(review): "overwriteSchema" presumably allows the stored schema to be
# replaced along with the data (Delta Lake option) — confirm the table format.
gold_writer = (
    hourly_final.write
    .partitionBy("obs_date")
    .option("overwriteSchema", "true")
    .mode("overwrite")
)
gold_writer.saveAsTable(gold_table_name)

# Read the table back and show a 10-row sample.
display(spark.table(gold_table_name).limit(10))