In [0]:
from pyspark.sql import functions as F, Window

# Catalog that holds this project's tables, plus the fully qualified
# (catalog.schema) names of the bronze and silver layers.
catalog = "energy_data_platform_project"
bronze_schema, silver_schema = f"{catalog}.bronze", f"{catalog}.silver"

In [0]:
# Load the raw solar irradiance table from the bronze schema.
bronze_table = f"{bronze_schema}.ca_solar_irradiance_raw"
bronze_df = spark.table(bronze_table)

In [0]:
# Derive the time columns from the raw observation timestamp:
#   obs_time_utc   - obs_time cast to a timestamp
#   obs_time_local - obs_time_utc shifted from UTC to America/Los_Angeles
#   obs_date       - calendar date of the LOCAL timestamp
#   year           - year of obs_date, as an int
# NOTE(review): this treats obs_time as a UTC instant — confirm against the
# bronze loader.
df = bronze_df.withColumn("obs_time_utc", F.col("obs_time").cast("timestamp"))
df = df.withColumn(
    "obs_time_local",
    F.from_utc_timestamp("obs_time_utc", "America/Los_Angeles"),
)
df = df.withColumn("obs_date", F.to_date("obs_time_local"))
df = df.withColumn("year", F.year("obs_date").cast("int"))

# Rename the site identifier and the raw irradiance columns.
# withColumnRenamed is a no-op for a column that does not exist.
# NOTE(review): the raw_* columns produced here are not referenced again in
# this notebook; the QC and final select use ghi_w_m2 / dni_w_m2 / dhi_w_m2,
# which must therefore already exist in the bronze table — confirm the intent.
column_renames = {
    "site_id": "solar_site_id",
    "ghi": "raw_ghi",
    "dni": "raw_dni",
    "dhi": "raw_dhi",
}
for old_name, new_name in column_renames.items():
    df = df.withColumnRenamed(old_name, new_name)

# QC, applied as a single filter:
#   1. drop rows where GHI, DNI and DHI are all null;
#   2. drop rows whose GHI is outside [0, 1400]; rows with a null GHI are kept.
# Out-of-range rows are removed (not clipped), and only GHI is range-checked.
# NOTE(review): DNI/DHI have no range check — confirm that is intended.
ghi_col = F.col("ghi_w_m2")
dni_col = F.col("dni_w_m2")
dhi_col = F.col("dhi_w_m2")

has_any_irradiance = ~(ghi_col.isNull() & dni_col.isNull() & dhi_col.isNull())
ghi_missing_or_in_range = ghi_col.isNull() | ghi_col.between(0, 1400)

df = df.filter(has_any_irradiance & ghi_missing_or_in_range)

# Deduplicate per site & timestamp, keeping the most recently ingested row.
#
# Ordering by ingest_ts alone is not enough: when two rows in the same
# (site, timestamp) partition share an ingest_ts, row_number() numbers them in
# an arbitrary order, so reruns could keep different rows. The additional sort
# keys cover every remaining column that can differ within a partition, which
# makes the surviving row reproducible.
dedup_window = Window.partitionBy("solar_site_id", "obs_time_utc").orderBy(
    F.col("ingest_ts").desc(),
    F.col("source_file").desc(),
    F.col("ghi_w_m2").desc(),
    F.col("dni_w_m2").desc(),
    F.col("dhi_w_m2").desc(),
)

df = (
    df
    # rn == 1 marks the winning row of each (site, timestamp) partition.
    .withColumn("rn", F.row_number().over(dedup_window))
    .filter(F.col("rn") == 1)
    # rn is a helper column only; it is not part of the output.
    .drop("rn")
)

# Final column selection: keep only these columns, in this order.
silver_columns = [
    "obs_time_utc",
    "obs_time_local",
    "obs_date",
    "year",
    "solar_site_id",
    "ghi_w_m2",
    "dni_w_m2",
    "dhi_w_m2",
    "source_file",
    "ingest_ts",
]
df = df.select(*silver_columns)


In [0]:
# Replace the silver table (data and schema) with the cleaned frame,
# partitioned by observation date, then preview a few rows of the result.
silver_table = f"{silver_schema}.ca_solar_irradiance_clean"

silver_writer = (
    df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("obs_date")
)
silver_writer.saveAsTable(silver_table)

display(spark.table(silver_table).limit(5))
