In [0]:
%run /Users/sireeshabyreddy96@gmail.com/real-time-weather-pipeline/Medalian_notebooks/Slack_utils


In [0]:
from pyspark.sql.functions import (
    col, avg, min, max, count, to_date, hour, desc
)

In [0]:
from pyspark.sql.utils import AnalysisException

silver_table = "weather_catalog.raw.silver_table"


def _notify_slack(message, level="INFO"):
    """Send a Slack message on a best-effort basis.

    A failing notification is printed instead of raised so that a Slack
    outage is never mistaken for a table-read failure.
    """
    try:
        send_slack_message(message, level=level)
    except Exception as slack_err:
        print(f" Slack notification failed: {slack_err}")


# Fallback values: both stay None when the table cannot be loaded, so the
# rest of the cell can test for that instead of crashing on a None object.
df_silver = None
row_count = None

try:
    # Only the Spark work lives inside the try block; notifications are sent
    # from the except/else branches so their failures cannot be misreported
    # as read errors or discard a successfully loaded DataFrame.
    df_silver = spark.table(silver_table)
    row_count = df_silver.count()

except AnalysisException as ae:
    _notify_slack(f" Table {silver_table} not found or invalid: {ae}", level="ERROR")
    df_silver = None   # Fallback handling

except Exception as e:
    _notify_slack(f" Unexpected error while reading {silver_table}: {e}", level="ERROR")
    df_silver = None   # Fallback handling

else:
    _notify_slack(f" Successfully loaded table: {silver_table}", level="INFO")
    _notify_slack(f" Row count for {silver_table}: {row_count}", level="INFO")
    print(f"Row count in {silver_table}: {row_count}")

# Compare against None explicitly rather than relying on object truthiness.
if df_silver is not None:
    df_silver.printSchema()
    display(df_silver)

# Cell output: the row count computed above (None when the load failed).
# Reusing it avoids a second full count and an AttributeError on the fallback.
row_count

Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql import functions as F

silver_table = "weather_catalog.raw.silver_table"

def safe_slack(msg, level="INFO"):
    """Best-effort Slack notifier.

    Forwards ``msg`` and ``level`` to ``send_slack_message``. Any exception
    raised by the notification is printed and swallowed so that the calling
    ETL step keeps running. Always returns ``None``.
    """
    failure = None
    try:
        send_slack_message(msg, level)
    except Exception as notify_error:
        failure = notify_error

    # Report the problem only after leaving the handler; nothing is re-raised.
    if failure is not None:
        print(f"⚠️ Slack notification failed: {failure}")

def _aggregate_metrics(df, group_cols, metric_cols):
    """Return avg/max/min of every metric column, grouped and sorted by ``group_cols``.

    The result contains the grouping columns followed by all ``<metric>_avg``
    columns, then all ``<metric>_max`` columns, then all ``<metric>_min``
    columns, in the order given by ``metric_cols``.
    """
    agg_exprs = (
        [F.avg(name).alias(f"{name}_avg") for name in metric_cols]
        + [F.max(name).alias(f"{name}_max") for name in metric_cols]
        + [F.min(name).alias(f"{name}_min") for name in metric_cols]
    )
    return df.groupBy(*group_cols).agg(*agg_exprs).orderBy(*group_cols)


try:
    # -------------------------
    # Read Silver Layer
    # -------------------------
    df_silver = spark.read.table(silver_table)
    safe_slack(f"Successfully loaded table: {silver_table}", level="INFO")

    # -------------------------
    # Ensure date column is proper
    # -------------------------
    # NOTE(review): assumes date_time is a string shaped like "dd-MM-yyyy HH:mm";
    # confirm against the silver schema, since a later cell in this notebook
    # calls year() on date_time directly without a format.
    df_silver = df_silver.withColumn("date", F.to_date("date_time", "dd-MM-yyyy HH:mm"))

    # -------------------------
    # Extract time dimensions
    # -------------------------
    # NOTE(review): weekofyear() returns the ISO-8601 week number while year()
    # returns the calendar year, so dates around New Year can be grouped with
    # week 1 or week 52/53 of the "wrong" year. Confirm this is acceptable.
    df_silver = (
        df_silver.withColumn("year", F.year("date"))
        .withColumn("month", F.month("date"))
        .withColumn("week", F.weekofyear("date"))
    )

    # -------------------------
    # Select numeric columns for aggregation
    # -------------------------
    metrics = [
        "maxtempC", "mintempC", "tempC", "humidity", "pressure",
        "windspeedKmph", "WindGustKmph", "precipMM", "visibility",
        "FeelsLikeC", "HeatIndexC", "WindChillC", "DewPointC", "uvIndex", "sunHour"
    ]

    # Convert to numeric.
    # The loop variable is deliberately not called `col`: a top-level loop
    # variable stays in the notebook namespace and would shadow
    # pyspark.sql.functions.col for every cell that runs afterwards.
    for metric_name in metrics:
        df_silver = df_silver.withColumn(metric_name, F.col(metric_name).cast("double"))

    # -------------------------
    # WEEKLY / MONTHLY / YEARLY AGGREGATION
    # -------------------------
    weekly_df = _aggregate_metrics(df_silver, ["year", "week"], metrics)
    monthly_df = _aggregate_metrics(df_silver, ["year", "month"], metrics)
    yearly_df = _aggregate_metrics(df_silver, ["year"], metrics)

    # -------------------------
    # Save to Gold Layer
    # -------------------------
    weekly_df.write.format("delta").mode("overwrite").saveAsTable("weather_catalog.gold.weather_weekly")
    monthly_df.write.format("delta").mode("overwrite").saveAsTable("weather_catalog.gold.weather_monthly")
    yearly_df.write.format("delta").mode("overwrite").saveAsTable("weather_catalog.gold.weather_yearly")

    safe_slack("Weekly, Monthly, and Yearly aggregations computed and saved to Gold Layer.", level="INFO")

    # -------------------------
    # Show results in Databricks
    # -------------------------
    display(weekly_df)
    display(monthly_df)
    display(yearly_df)

except Exception as e:
    # Notify (best effort) and re-raise so the notebook run is marked as failed.
    safe_slack(f"Error during aggregation ETL: {str(e)}", level="ERROR")
    raise


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql import functions as F

try:
    # Load the silver layer that feeds the per-city statistics.
    df_silver = spark.read.format("delta").table("weather_catalog.raw.silver_table")

    # Columns that each receive a max, min and avg statistic.
    numeric_cols = [
        "maxtempC", "mintempC", "totalSnow_cm", "sunHour", "uvIndex", "moon_illumination",
        "DewPointC", "FeelsLikeC", "HeatIndexC", "WindChillC", "WindGustKmph",
        "cloudcover", "humidity", "precipMM", "pressure", "tempC", "visibility",
        "winddirDegree", "windspeedKmph"
    ]

    # One flat list of expressions, ordered max -> min -> avg for each column.
    aggregations = [
        stat_expr
        for c in numeric_cols
        for stat_expr in (
            F.max(c).alias(f"{c}_max"),
            F.min(c).alias(f"{c}_min"),
            F.avg(c).alias(f"{c}_avg"),
        )
    ]

    # One output row per city.
    per_city = df_silver.groupBy("City")
    df_citywise_stats = per_city.agg(*aggregations)

    # Persist to the gold layer, replacing any previous contents.
    gold_writer = df_citywise_stats.write.format("delta").mode("overwrite")
    gold_writer.saveAsTable("weather_catalog.gold.citywise_stats")

    # Success notification; a Slack failure is printed, never raised.
    try:
        send_slack_message(":white_check_mark: Citywise weather statistics computed and saved to `weather_catalog.gold.citywise_stats`.")
    except Exception as slack_err:
        print(f" Slack notification failed: {slack_err}")

    # Render the result in the notebook.
    display(df_citywise_stats)

except Exception as e:
    # Report the failure on a best-effort basis, then re-raise the original error.
    error_message = f":x: Error while computing citywise stats: {str(e)}"
    try:
        send_slack_message(error_message)
    except Exception as slack_err:
        print(f" Slack notification failed: {slack_err}")
    raise


In [0]:
from pyspark.sql import functions as F

# NOTE(review): this cell computes the same statistics and overwrites the same
# gold table (weather_catalog.gold.citywise_stats) as the preceding cell; one
# of the two can probably be removed.
try:
    # -------------------------
    # Read Silver Layer
    # -------------------------
    df_silver = spark.read.format("delta").table("weather_catalog.raw.silver_table")

    # -------------------------
    # Numeric columns to aggregate
    # -------------------------
    # NOTE(review): max/min are applied to the columns as stored. If any of
    # them is string-typed in the silver table the comparison is lexicographic;
    # confirm the silver schema.
    numeric_cols = [
        "maxtempC", "mintempC", "totalSnow_cm", "sunHour", "uvIndex", "moon_illumination",
        "DewPointC", "FeelsLikeC", "HeatIndexC", "WindChillC", "WindGustKmph",
        "cloudcover", "humidity", "precipMM", "pressure", "tempC", "visibility",
        "winddirDegree", "windspeedKmph"
    ]

    # -------------------------
    # Build aggregation expressions (max, min, avg per column)
    # -------------------------
    aggregations = []
    for c in numeric_cols:
        aggregations.extend([
            F.max(c).alias(f"{c}_max"),
            F.min(c).alias(f"{c}_min"),
            F.avg(c).alias(f"{c}_avg"),
        ])

    # -------------------------
    # Group by City
    # -------------------------
    df_citywise_stats = df_silver.groupBy("City").agg(*aggregations)

    # -------------------------
    # Save to Gold Layer
    # -------------------------
    df_citywise_stats.write.format("delta").mode("overwrite").saveAsTable("weather_catalog.gold.citywise_stats")

    # -------------------------
    # Slack notification (best effort)
    # -------------------------
    # Guarded so that a Slack outage after a successful write is not reported
    # as a failure of the computation itself.
    try:
        send_slack_message(" Citywise weather statistics computed and saved to `weather_catalog.gold.citywise_stats`.", level="INFO")
    except Exception as slack_err:
        print(f" Slack notification failed: {slack_err}")

    # -------------------------
    # Display results
    # -------------------------
    display(df_citywise_stats)

except Exception as e:
    # Guarded so that a failing notification cannot replace the original
    # exception; the bare `raise` below always re-raises the real error.
    try:
        send_slack_message(f" Error while computing citywise stats: {str(e)}", level="ERROR")
    except Exception as slack_err:
        print(f" Slack notification failed: {slack_err}")
    raise


In [0]:
from pyspark.sql import functions as F

try:
    # -------------------------
    # Group by city and calculate aggregate stats
    # -------------------------
    # NOTE(review): relies on `df_silver` left in the notebook namespace by an
    # earlier cell; presumably the silver-layer DataFrame. Verify on a fresh run.
    city_stats = df_silver.groupBy("City").agg(
        F.max("maxtempC").alias("max_temp"),
        F.min("mintempC").alias("min_temp"),
        F.sum("precipMM").alias("total_rain"),
        F.avg("humidity").alias("avg_humidity"),
        F.max("humidity").alias("max_humidity"),
        F.min("humidity").alias("min_humidity"),
        F.max("windspeedKmph").alias("max_wind"),
        F.min("windspeedKmph").alias("min_wind"),
        F.max("sunHour").alias("max_sun"),
        F.max("cloudcover").alias("max_cloud"),
        F.max("visibility").alias("max_visibility"),
        F.min("visibility").alias("min_visibility"),
        F.max("pressure").alias("max_pressure"),
        F.min("pressure").alias("min_pressure")
    )

    # -------------------------
    # Pick top cities for each category
    # -------------------------
    # (title, ranking column, True = highest value wins / False = lowest wins)
    ranking_specs = [
        ("Hottest City", "max_temp", True),
        ("Coolest City", "min_temp", False),
        ("Most Rainfall City", "total_rain", True),
        ("Most Humid City", "max_humidity", True),
        ("Driest City", "min_humidity", False),
        ("Windiest City", "max_wind", True),
        ("Calmest City", "min_wind", False),
        ("Sunniest City", "max_sun", True),
        ("Cloudiest City", "max_cloud", True),
        ("Best Visibility City", "max_visibility", True),
        ("Worst Visibility City", "min_visibility", False),
        ("Highest Pressure City", "max_pressure", True),
        ("Lowest Pressure City", "min_pressure", False),
    ]

    # Spark sorts NULLs first in ascending order, so a plain ascending sort
    # followed by limit(1) would pick a city whose statistic is NULL (no data).
    # asc_nulls_last keeps such cities out of the "lowest" categories;
    # descending order already places NULLs last.
    insights = {
        title: city_stats.orderBy(
            F.desc(stat_col) if highest_wins else F.asc_nulls_last(stat_col)
        ).limit(1)
        for title, stat_col, highest_wins in ranking_specs
    }

    # -------------------------
    # Show insights
    # -------------------------
    for title, top_city_df in insights.items():
        print(f"\n🔹 {title}:")
        display(top_city_df)

    # -------------------------
    # Slack success notification (best effort)
    # -------------------------
    # Guarded so that a Slack outage does not turn a successful run into an error.
    try:
        send_slack_message(" City-level weather insights computed successfully.", level="INFO")
    except Exception as slack_err:
        print(f" Slack notification failed: {slack_err}")

except Exception as e:
    # Report the failure on a best-effort basis, then re-raise the original error.
    try:
        send_slack_message(f" Error while computing city-level insights: {str(e)}", level="ERROR")
    except Exception as slack_err:
        print(f" Slack notification failed: {slack_err}")
    raise


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, year, avg, min, max, sum
from pyspark.sql.window import Window

gold_base_path = "weather_catalog.gold"


def _top_city_per_year(df_with_year, agg_expr, value_col, descending=True):
    """Return, for every year, the single city that ranks first on one statistic.

    Args:
        df_with_year: DataFrame with at least ``year`` and ``City`` columns.
        agg_expr: aggregate Column computed per (year, City), e.g. ``F.avg("humidity")``.
        value_col: name given to the aggregated value in the result.
        descending: True when the highest value wins, False when the lowest wins.

    Returns:
        DataFrame with columns ``year``, ``City`` and ``value_col``.
    """
    value = F.col(value_col)
    # NULL statistics always sort last so a city without data can never win,
    # and City is a tie-breaker so row_number() picks the same row on every run.
    value_order = value.desc_nulls_last() if descending else value.asc_nulls_last()
    yearly_window = Window.partitionBy("year").orderBy(value_order, F.col("City").asc())
    return (
        df_with_year.groupBy("year", "City")
        .agg(agg_expr.alias(value_col))
        .withColumn("rank", F.row_number().over(yearly_window))
        .filter(F.col("rank") == 1)
        .drop("rank")
    )


try:
    # -------------------------
    # Extract year from date_time
    # -------------------------
    # NOTE(review): relies on `df_silver` left in the notebook namespace by an
    # earlier cell. year() is applied to date_time as-is, whereas an earlier
    # cell in this notebook parses it with to_date(..., "dd-MM-yyyy HH:mm");
    # confirm the column type, otherwise `year` may come out NULL here.
    df_yearly = df_silver.withColumn("year", F.year(F.col("date_time")))

    # -------------------------
    # Year-wise winners per category
    # -------------------------
    # F.-qualified aggregates are used so this cell does not depend on the
    # names min/max/sum being rebound from the Python builtins to Spark functions.
    coolest_city = _top_city_per_year(df_yearly, F.min("mintempC"), "min_temp", descending=False)
    hottest_city = _top_city_per_year(df_yearly, F.max("maxtempC"), "max_temp")
    rainiest_city = _top_city_per_year(df_yearly, F.sum("precipMM"), "total_rain")
    most_humid_city = _top_city_per_year(df_yearly, F.avg("humidity"), "avg_humidity")
    sunniest_city = _top_city_per_year(df_yearly, F.avg("sunHour"), "avg_sunHour")
    windiest_city = _top_city_per_year(df_yearly, F.avg("windspeedKmph"), "avg_wind")

    # Gold table name -> result, in the order they are written and displayed.
    yearly_insights = {
        "coolest_city": coolest_city,
        "hottest_city": hottest_city,
        "rainiest_city": rainiest_city,
        "most_humid_city": most_humid_city,
        "sunniest_city": sunniest_city,
        "windiest_city": windiest_city,
    }

    # -------------------------
    # Save Delta tables
    # -------------------------
    # All six plans are built before the first write, so a missing column
    # fails the cell before any gold table has been overwritten.
    for table_name, insight_df in yearly_insights.items():
        insight_df.write.format("delta").mode("overwrite").saveAsTable(f"{gold_base_path}.{table_name}")

    # -------------------------
    # Display results
    # -------------------------
    for insight_df in yearly_insights.values():
        display(insight_df)

    # -------------------------
    # Slack success notification (best effort)
    # -------------------------
    # Guarded so that a Slack outage does not turn a successful run into an error.
    try:
        send_slack_message(" Year-wise city-level weather insights computed and saved to Gold layer successfully.", level="INFO")
    except Exception as slack_err:
        print(f" Slack notification failed: {slack_err}")

except Exception as e:
    # Report the failure on a best-effort basis, then re-raise the original error.
    try:
        send_slack_message(f" Error during year-wise city-level ETL: {str(e)}", level="ERROR")
    except Exception as slack_err:
        print(f" Slack notification failed: {slack_err}")
    raise
