In [0]:
from pyspark.sql import functions as F

# Name of the catalog that holds this project's tables.
CATALOG: str = "energy_data_platform_project"
# Fully-qualified "<catalog>.gold" schema prefix; table names below are appended to it.
GOLD_SCHEMA: str = f"{CATALOG}.gold"

In [0]:
# Denominator used for every coverage ratio computed later in the notebook.
# NOTE(review): presumably one input row per hour, so a complete day has 24 — confirm
# how days with a DST shift (23 or 25 local hours) should be treated.
EXPECTED_HOURS_PER_DAY = 24.0

# Input for the coverage summary: the hourly region usage table in the gold schema.
hourly = spark.table(
    f"{GOLD_SCHEMA}.hourly_region_usage"
)

In [0]:
# Coverage counts per day / region / city / substation.
def _hours_where(condition, alias):
    """Return an aggregate counting the rows of a group that satisfy `condition`.

    Each matching row contributes 1 and every other row contributes 0; the
    resulting sum is exposed under the column name `alias`.
    """
    return F.sum(F.when(condition, 1).otherwise(0)).alias(alias)


# One output row is produced per distinct combination of these keys.
_coverage_keys = ["obs_date", "year", "region_id", "city", "substation_id"]

coverage = hourly.groupBy(*_coverage_keys).agg(
    # Total number of rows present for the group, regardless of content.
    F.count("*").alias("actual_hours"),
    # Rows flagged as having weather / solar data.
    _hours_where(F.col("has_weather"), "hours_with_weather"),
    _hours_where(F.col("has_solar"), "hours_with_solar"),
    # Rows that carry a non-null usage measurement.
    _hours_where(F.col("avg_kw").isNotNull(), "hours_with_usage"),
)

In [0]:
# Attach the expected hour count to every row so the ratios below can divide by it.
coverage = coverage.withColumn("expected_hours", F.lit(EXPECTED_HOURS_PER_DAY))

# Each ratio is "<hours_with_x> / expected_hours". Columns are added in this order
# (usage, weather, solar); withColumn replaces a same-named column, so re-running
# the cell does not create duplicates.
for _ratio_name, _hours_name in (
    ("usage_coverage_ratio", "hours_with_usage"),
    ("weather_coverage_ratio", "hours_with_weather"),
    ("solar_coverage_ratio", "hours_with_solar"),
):
    coverage = coverage.withColumn(
        _ratio_name,
        F.col(_hours_name) / F.col("expected_hours"),
    )

In [0]:

# Desired column order of the published summary table.
final_cols = [
    # grouping keys
    "obs_date", "year", "region_id", "city", "substation_id",
    # hour counts
    "expected_hours", "actual_hours",
    "hours_with_usage", "hours_with_weather", "hours_with_solar",
    # coverage ratios
    "usage_coverage_ratio", "weather_coverage_ratio", "solar_coverage_ratio",
]

# Keep only the requested columns that are actually present on `coverage`,
# preserving the order above.
# NOTE(review): a column missing upstream is dropped silently here rather than
# raising — confirm that this tolerance is intended.
existing_cols = [name for name in final_cols if name in coverage.columns]
coverage_final = coverage.select(*existing_cols)

# Replace the summary table's contents; the "overwriteSchema" option is set so the
# stored schema is allowed to change together with the data.
coverage_final.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(
    f"{GOLD_SCHEMA}.data_coverage_summary"
)