In [0]:
from pyspark.sql import functions as F, Window

# Catalog name, plus the fully qualified bronze and silver schema names that
# are built from it and used by the table reads/writes in this notebook.
catalog = "energy_data_platform_project"
bronze_schema, silver_schema = (
    f"{catalog}.bronze",
    f"{catalog}.silver",
)

In [0]:
# Read the bronze-layer KSFO table; every later transformation starts from
# this DataFrame.
bronze_table_name = f"{bronze_schema}.noaa_isd_ksfo_bronze"
bronze_df = spark.table(bronze_table_name)

In [0]:
# Silver transform for the KSFO observations: derive time columns, apply
# range-based QC, keep one row per (station_id, obs_time_utc), and project the
# final column set. The result is left in `df`.

# --- Time columns -----------------------------------------------------------
obs_enriched = (
    bronze_df
    # NOTE(review): rows whose obs_time does not cast to a timestamp are not
    # filtered anywhere in this cell — confirm whether they should be rejected.
    .withColumn("obs_time_utc", F.col("obs_time").cast("timestamp"))
    # from_utc_timestamp interprets obs_time_utc as UTC and renders it as
    # America/Los_Angeles wall-clock time.
    .withColumn(
        "obs_time_local",
        F.from_utc_timestamp("obs_time_utc", "America/Los_Angeles"),
    )
    # obs_date (and therefore year) follows the local calendar, not UTC.
    .withColumn("obs_date", F.to_date("obs_time_local"))
    .withColumn("year", F.year("obs_date").cast("int"))
    .withColumnRenamed("value", "raw_record")
)

# --- QC: filter obviously invalid values ------------------------------------
# A missing reading is kept; a present reading must fall inside its bounds.
# A row that fails any single check is removed entirely.
temperature_ok = (
    F.col("temperature_c").isNull()
    | F.col("temperature_c").between(-40, 50)
)
wind_speed_ok = (
    F.col("wind_speed_ms").isNull()
    | F.col("wind_speed_ms").between(0, 80)
)
visibility_ok = (
    F.col("visibility_m").isNull()
    | (F.col("visibility_m") >= 0)
)

obs_qc = obs_enriched.filter(temperature_ok & wind_speed_ok & visibility_ok)

# --- Deduplicate per station & timestamp ------------------------------------
# Keep the most recently ingested row in each (station_id, obs_time_utc) group.
# Because QC runs first, a newer row rejected by QC lets an older valid row
# survive. Ordering by ingest_ts alone leaves row_number() free to pick any row
# when several share the same ingest_ts, so source_file and raw_record are
# added as tie-breakers to make the surviving row reproducible across runs.
# NOTE(review): assumes source_file and raw_record are orderable types
# (e.g. string) — confirm against the bronze schema.
dedup_window = Window.partitionBy("station_id", "obs_time_utc").orderBy(
    F.col("ingest_ts").desc(),
    F.col("source_file").desc(),
    F.col("raw_record").desc(),
)

obs_latest = (
    obs_qc
    .withColumn("rn", F.row_number().over(dedup_window))
    .filter(F.col("rn") == 1)
    .drop("rn")
)

# --- Final column selection -------------------------------------------------
# Fixes the column set and order of the output table.
df = obs_latest.select(
    # Time columns derived above
    "obs_time_utc",
    "obs_time_local",
    "obs_date",
    "year",
    # Station columns
    "station_id",
    "usaf",
    "wban",
    "latitude",
    "longitude",
    "elevation_m",
    # Measurement columns
    "temperature_c",
    "dewpoint_c",
    "pressure_hpa",
    "wind_speed_ms",
    "wind_dir_deg",
    "ceiling_m",
    "visibility_m",
    "precip_mm",
    # Lineage columns
    "raw_record",
    "source_file",
    "ingest_ts",
)


In [0]:
# Persist the cleaned DataFrame as the silver table, replacing whatever is
# already there, with obs_date as the partition column.
silver_table_name = f"{silver_schema}.noaa_isd_ksfo_clean"

silver_writer = (
    df.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("obs_date")
)
silver_writer.saveAsTable(silver_table_name)

# Read the table back and show a few rows as a quick check of the write.
display(spark.table(silver_table_name).limit(5))
