In [0]:
from pyspark.sql import Window
from pyspark.sql.functions import (
    col, avg, min, max, count, to_date, hour, desc, row_number
)

In [0]:
# Fully qualified name of the silver-layer table that feeds this notebook.
silver_table = "weather_catalog.processed.valid_readings"

# Load the silver table as a DataFrame; every aggregation below starts here.
df_silver = spark.read.table(silver_table)

In [0]:
# Per-city, per-day summary of the silver readings.
# NOTE: `min` and `max` below are the Spark aggregates imported from
# pyspark.sql.functions, which shadow the Python builtins in this notebook.

# Derive the grouping date from the `date_time` column.
readings_by_day = df_silver.withColumn("date", to_date(col("date_time")))

# One output column per metric; the list order is the output column order.
daily_metrics = [
    avg("tempC").alias("avg_tempC"),
    min("tempC").alias("min_tempC"),
    max("tempC").alias("max_tempC"),
    avg("humidity").alias("avg_humidity"),
    avg("windspeedKmph").alias("avg_windspeed"),
    count("*").alias("records_count"),  # number of rows in each (City, date) group
]

df_daily = readings_by_day.groupBy("City", "date").agg(*daily_metrics)

In [0]:
# Temperature statistics per (City, date).
# df_daily already computes these exact aggregates with the same grouping keys,
# so take the columns from it instead of repeating the groupBy/agg over
# df_silver. This keeps the two gold datasets from drifting apart if the
# aggregation logic changes. Column order matches the previous definition.
df_city_stats = df_daily.select(
    "City",
    "date",
    "avg_tempC",
    "max_tempC",
    "min_tempC",
)

In [0]:
# Rank cities *within each date* so the extremes are reported per day.
# A global orderBy(...).limit(1) would return one row for the whole dataset,
# which is not what "per day" means.
#
# - row_number() keeps exactly one row per date.
# - NULL temperatures are sorted last so a group without a reading can never
#   win (Spark's default ascending sort puts NULLs first).
# - City is the tie-breaker, so equal temperatures resolve deterministically.
hottest_first = Window.partitionBy("date").orderBy(
    col("max_tempC").desc_nulls_last(), col("City")
)
coolest_first = Window.partitionBy("date").orderBy(
    col("min_tempC").asc_nulls_last(), col("City")
)

# Hottest city per day
df_hottest = (
    df_city_stats.withColumn("_rank", row_number().over(hottest_first))
    .filter(col("_rank") == 1)
    .drop("_rank")  # helper column only; keep the original schema
    .orderBy("date")
)

# Coolest city per day
df_coolest = (
    df_city_stats.withColumn("_rank", row_number().over(coolest_first))
    .filter(col("_rank") == 1)
    .drop("_rank")  # helper column only; keep the original schema
    .orderBy("date")
)

In [0]:
# Fully qualified names of the gold-layer output tables.
# Receives df_daily (per-city, per-day aggregates).
gold_daily_table = "weather_catalog.analytics.weather_daily_stats"
# Receives df_city_stats (per-city, per-day temperature statistics).
gold_city_table = "weather_catalog.analytics.city_extremes"


In [0]:
# Daily Aggregations → Gold
# Overwrites the table's data on every run, with the Delta `mergeSchema`
# option enabled.
(
    df_daily.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(gold_daily_table)
)

In [0]:
# City temperature statistics → Gold
# Overwrites the table's data on every run, with the Delta `mergeSchema`
# option enabled.
(
    df_city_stats.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(gold_city_table)
)

In [0]:
# Report how many rows each gold DataFrame holds.
# Each count() is a Spark action that runs the aggregation for that DataFrame.
print("✅ Gold layer tables created:")

daily_records = df_daily.count()
print(f"- Daily Stats Table: {daily_records} records")

city_records = df_city_stats.count()
print(f"- City Stats Table: {city_records} records")


In [0]:
# Show the hottest and coolest results, each under its own heading.
extreme_reports = (
    ("🔥 Hottest City:", df_hottest),
    ("❄️ Coolest City:", df_coolest),
)

for heading, extreme_df in extreme_reports:
    print(heading)
    display(extreme_df)