In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Catalog name plus the fully-qualified "<catalog>.<schema>" prefixes derived from it.
CATALOG = "energy_data_platform_project"
BRONZE_SCHEMA = CATALOG + ".bronze"
SILVER_SCHEMA = CATALOG + ".silver"

In [0]:
# Read the three silver-schema input tables (usage, weather, solar) in one pass;
# the tuple order below matches the order of the names being assigned.
usage_silver, weather_silver, solar_silver = (
    spark.table(f"{SILVER_SCHEMA}.{table_name}")
    for table_name in (
        "smartds_usage_clean",
        "noaa_isd_ksfo_clean",
        "ca_solar_irradiance_clean",
    )
)

In [0]:
# Roll the usage readings up to one row per (city, substation_id, local hour)
# and derive per-hour energy figures from the hourly mean power.
usage_hourly = (
    usage_silver
    .groupBy(
        "city",
        "substation_id",
        # Bucket every reading into the clock hour that contains it.
        F.date_trunc("hour", F.col("obs_time_local")).alias("obs_hour_local"),
    )
    .agg(
        # Real power: mean / peak / trough within the hour
        F.avg("total_kw").alias("avg_kw"),
        F.max("total_kw").alias("max_kw"),
        F.min("total_kw").alias("min_kw"),
        # Reactive power: mean / peak / trough within the hour
        F.avg("total_kvar").alias("avg_kvar"),
        F.max("total_kvar").alias("max_kvar"),
        F.min("total_kvar").alias("min_kvar"),
        # Mean power factor
        F.avg("pf").alias("avg_pf"),
        # Number of source rows that fell into this hour
        F.count("*").alias("usage_sample_count"),
    )
    # Energy over the bucket = mean power * 1 hour.
    # NOTE(review): this presumes the samples cover the hour evenly — confirm.
    .withColumn("hourly_kwh", F.col("avg_kw") * 1.0)      # kW * h
    .withColumn("hourly_kvarh", F.col("avg_kvar") * 1.0)  # kVAR * h
)

In [0]:
# Collapse the weather observations to one row per (station_id, local hour).
weather_hourly = (
    weather_silver
    .groupBy(
        "station_id",
        # Bucket every observation into the clock hour that contains it.
        F.date_trunc("hour", F.col("obs_time_local")).alias("obs_hour_local"),
    )
    .agg(
        # Point-in-time style readings are averaged across the hour.
        F.avg("temperature_c").alias("temp_c"),
        F.avg("dewpoint_c").alias("dewpoint_c"),
        F.avg("wind_speed_ms").alias("wind_speed_ms"),
        F.avg("visibility_m").alias("visibility_m"),
        F.avg("pressure_hpa").alias("pressure_hpa"),
        # Precipitation accumulates, so it is summed rather than averaged
        # (meaningful once precip_mm is populated from ISD variable sections).
        F.sum("precip_mm").alias("precip_mm_total_mm"),
        # Number of source rows that fell into this hour
        F.count("*").alias("weather_sample_count"),
    )
)


In [0]:
# Collapse the irradiance readings to one row per (solar_site_id, local hour),
# averaging each irradiance component and counting the contributing rows.
solar_hourly = (
    solar_silver
    .groupBy(
        "solar_site_id",
        # Bucket every reading into the clock hour that contains it.
        F.date_trunc("hour", F.col("obs_time_local")).alias("obs_hour_local"),
    )
    .agg(
        F.avg("ghi_w_m2").alias("ghi_w_m2"),
        F.avg("dni_w_m2").alias("dni_w_m2"),
        F.avg("dhi_w_m2").alias("dhi_w_m2"),
        F.count("*").alias("solar_sample_count"),
    )
)


In [0]:
# Attach the hourly weather and solar aggregates to every hourly usage row
# (left joins keep usage rows that have no match), then add derived time and
# region columns.
#
# NOTE(review): both joins match on the hour alone. weather_hourly is keyed by
# (station_id, hour) and solar_hourly by (solar_site_id, hour), so if either
# ever holds more than one station/site for the same hour, usage rows will be
# duplicated — confirm each input carries a single station/site.
joined = (
    usage_hourly.alias("u")
    .join(weather_hourly.alias("w"), on=["obs_hour_local"], how="left")
    .join(solar_hourly.alias("s"), on=["obs_hour_local"], how="left")
    # Interpret the local hour as America/Los_Angeles wall-clock time and shift to UTC.
    .withColumn(
        "obs_hour_utc",
        F.to_utc_timestamp(F.col("obs_hour_local"), "America/Los_Angeles"),
    )
    # Calendar date and year of the local hour.
    .withColumn("obs_date", F.to_date(F.col("obs_hour_local")))
    .withColumn("year", F.year(F.col("obs_date")).cast("int"))
    # Every row is tagged with the same constant region identifier.
    .withColumn("region_id", F.lit("SFO"))
)

In [0]:
# Desired output columns, in output order, grouped by theme.
final_cols = [
    # Time / grain
    *("obs_hour_local", "obs_hour_utc", "obs_date", "year"),
    # Region / location
    *("region_id", "city", "substation_id", "station_id", "solar_site_id"),
    # Usage metrics
    *("avg_kw", "max_kw", "min_kw"),
    *("avg_kvar", "max_kvar", "min_kvar"),
    *("hourly_kwh", "hourly_kvarh", "avg_pf", "usage_sample_count"),
    # Weather metrics
    *("temp_c", "dewpoint_c", "wind_speed_ms", "visibility_m", "pressure_hpa"),
    *("precip_mm_total_mm", "weather_sample_count"),
    # Solar metrics
    *("ghi_w_m2", "dni_w_m2", "dhi_w_m2", "solar_sample_count"),
]

# Project `joined` down to the curated column list, preserving the order of
# `final_cols`. Columns that are absent from `joined` are skipped (best-effort,
# in case the schema differs slightly), but they are reported so that upstream
# schema drift does not silently shrink the output table.
joined_column_names = set(joined.columns)  # looked up once, not once per candidate
existing_cols = [c for c in final_cols if c in joined_column_names]
missing_cols = [c for c in final_cols if c not in joined_column_names]
if missing_cols:
    print(f"WARNING: expected columns missing from joined and skipped: {missing_cols}")
joined_final = joined.select(*existing_cols)


In [0]:
# Persist the joined result as a managed table in the silver schema.
# The writer replaces any existing table contents ("overwrite"), passes
# overwriteSchema=true (presumably so a Delta table's schema may change too —
# confirm the table format), and partitions the output by obs_date.
usage_weather_writer = (
    joined_final.write
    .partitionBy("obs_date")
    .option("overwriteSchema", "true")
    .mode("overwrite")
)
usage_weather_writer.saveAsTable(f"{SILVER_SCHEMA}.usage_weather_joined")

In [0]:
# Read the table back and show a 10-row preview as a quick sanity check.
usage_weather_preview = spark.table(f"{SILVER_SCHEMA}.usage_weather_joined").limit(10)
display(usage_weather_preview)