In [0]:
%run /Workspace/Users/sireeshabyreddy96@gmail.com/real-time-weather-pipeline/Medalian_notebooks/Slack_utils

In [0]:
from pyspark.sql.functions import (
    col, avg, min, max, count, to_date, hour, desc
)

In [0]:
from pyspark.sql.functions import col, when
from pyspark.sql.types import DoubleType
from pyspark.sql.utils import AnalysisException

silver_table = "weather_catalog.raw.silver_table"
checkpoint_path = "s3://weather-streaming-proj-bucket/checkpoints"

# Preview cell: opens the Silver table as a stream, shows it, and prints a
# one-off batch row count. df_silver stays None whenever the table cannot be
# read, so the console preview further down is skipped instead of failing.
df_silver = None

try:
    # skipChangeCommits is a Delta *source* option, so it has to be set on
    # the reader; setting it on writeStream does not affect how the source
    # handles update/delete commits.
    df_silver = (
        spark.readStream.format("delta")
        .option("skipChangeCommits", "true")
        .table(silver_table)
    )
    display(df_silver)

    # Count rows (batch read for monitoring)
    row_count = spark.table(silver_table).count()
    print(f"Row count in {silver_table}: {row_count}")

except AnalysisException as ae:
    print(f"Table {silver_table} not found or invalid: {ae}")
    df_silver = None

except Exception as e:
    print(f"Unexpected error while reading {silver_table}: {e}")
    df_silver = None

# Explicit None check: the intent is "was the read successful", not the
# truthiness of a DataFrame object.
if df_silver is not None:
    df_silver.printSchema()

    # Console sink for debugging. It gets its own checkpoint sub-directory so
    # its offsets/commits do not land in the root folder that also holds the
    # gold_* checkpoints.
    query = (
        df_silver.writeStream
        .format("console")
        .option("checkpointLocation", f"{checkpoint_path}/silver_console_preview")
        .start()
    )
    # Deliberately no query.awaitTermination(): it would block this cell
    # indefinitely and prevent every later cell from running. Call
    # query.stop() to end the preview.

Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql import functions as F

silver_table = "weather_catalog.raw.silver_table"


def safe_slack(msg, level="INFO"):
    """Best-effort Slack notifier.

    Forwards ``msg`` and ``level`` to ``send_slack_message``. Any exception
    raised while notifying is printed and swallowed so that a Slack problem
    never interrupts the surrounding ETL code.

    Args:
        msg: Text of the notification.
        level: Severity label passed through unchanged (default ``"INFO"``).

    Returns:
        None.
    """
    try:
        send_slack_message(msg, level)
    except Exception as notify_error:
        # Notification is optional: report the failure and carry on.
        print(f"⚠️ Slack notification failed: {notify_error}")

# Streams weekly, monthly and yearly avg/max/min weather aggregates from the
# Silver table into three Gold Delta tables. Any failure is reported to Slack
# (best effort) and re-raised.
try:
    # -------------------------
    # Read Silver Layer as STREAM
    # -------------------------
    # skipChangeCommits is a Delta source option, so it is set on the reader
    # rather than on the writers.
    df_silver = (
        spark.readStream.format("delta")
        .option("skipChangeCommits", "true")
        .table(silver_table)
    )
    safe_slack(f"Successfully started streaming from table: {silver_table}", level="INFO")

    # -------------------------
    # Parse the date and derive the time dimensions used for grouping
    # -------------------------
    df_silver = df_silver.withColumn("date", F.to_date("date_time", "dd-MM-yyyy HH:mm"))
    df_silver = (
        df_silver
        .withColumn("year", F.year("date"))
        .withColumn("month", F.month("date"))
        .withColumn("week", F.weekofyear("date"))
    )

    # -------------------------
    # Numeric columns to aggregate
    # -------------------------
    metrics = [
        "maxtempC", "mintempC", "tempC", "humidity", "pressure",
        "windspeedKmph", "WindGustKmph", "precipMM", "visibility",
        "FeelsLikeC", "HeatIndexC", "WindChillC", "DewPointC", "uvIndex", "sunHour"
    ]

    # Cast every metric to double. The loop variable is intentionally not
    # called `col`: that would rebind the notebook-level `col` imported from
    # pyspark.sql.functions to a plain string.
    for metric in metrics:
        df_silver = df_silver.withColumn(metric, F.col(metric).cast("double"))

    # One shared expression list (all *_avg, then all *_max, then all *_min)
    # reused by the three groupings below.
    agg_exprs = [
        agg_fn(metric).alias(f"{metric}_{suffix}")
        for agg_fn, suffix in ((F.avg, "avg"), (F.max, "max"), (F.min, "min"))
        for metric in metrics
    ]

    # -------------------------
    # WEEKLY / MONTHLY / YEARLY AGGREGATION
    # -------------------------
    weekly_df = df_silver.groupBy("year", "week").agg(*agg_exprs).orderBy("year", "week")
    monthly_df = df_silver.groupBy("year", "month").agg(*agg_exprs).orderBy("year", "month")
    yearly_df = df_silver.groupBy("year").agg(*agg_exprs).orderBy("year")

    # -------------------------
    # Write streaming output to Gold Layer
    # -------------------------
    # Each grain gets its own checkpoint directory and target table:
    #   {checkpoint_path}/gold_<grain> -> weather_catalog.gold.weather_<grain>
    gold_streams = {"weekly": weekly_df, "monthly": monthly_df, "yearly": yearly_df}
    for grain, gold_df in gold_streams.items():
        (
            gold_df.writeStream.format("delta")
            .outputMode("complete")
            .option("checkpointLocation", f"{checkpoint_path}/gold_{grain}")
            .toTable(f"weather_catalog.gold.weather_{grain}")
        )

    safe_slack("Weekly, Monthly, and Yearly aggregations are streaming to Gold Layer.", level="INFO")

except Exception as e:
    safe_slack(f"Error during aggregation ETL: {str(e)}", level="ERROR")
    raise


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:
# Streams per-city max/min/avg of the numeric weather columns from Silver
# into weather_catalog.gold.citywise_stats. Errors are reported to Slack
# (best effort) and re-raised.
# NOTE(review): another cell in this notebook starts a query with the same
# checkpoint location and target table — confirm only one of them is meant
# to run.
try:
    # skipChangeCommits is a Delta source option, so it is set on the reader.
    df_silver = (
        spark.readStream.format("delta")
        .option("skipChangeCommits", "true")
        .table("weather_catalog.raw.silver_table")
    )

    # -------------------------
    # List of numeric columns to aggregate
    # -------------------------
    numeric_cols = [
        "maxtempC", "mintempC", "totalSnow_cm", "sunHour", "uvIndex", "moon_illumination",
        "DewPointC", "FeelsLikeC", "HeatIndexC", "WindChillC", "WindGustKmph",
        "cloudcover", "humidity", "precipMM", "pressure", "tempC", "visibility",
        "winddirDegree", "windspeedKmph"
    ]

    # -------------------------
    # Build aggregation expressions: <col>_max, <col>_min, <col>_avg per column
    # -------------------------
    aggregations = [
        expr
        for c in numeric_cols
        for expr in (
            F.max(c).alias(f"{c}_max"),
            F.min(c).alias(f"{c}_min"),
            F.avg(c).alias(f"{c}_avg"),
        )
    ]

    # -------------------------
    # Group by City
    # -------------------------
    df_citywise_stats = df_silver.groupBy("City").agg(*aggregations)

    # -------------------------
    # Write streaming output to Gold Layer
    # -------------------------
    # The chain is parenthesised so it cannot be broken by a missing
    # backslash line continuation.
    (
        df_citywise_stats.writeStream
        .format("delta")
        .outputMode("complete")
        .option("checkpointLocation", f"{checkpoint_path}/gold_citywise")
        .toTable("weather_catalog.gold.citywise_stats")
    )

    # -------------------------
    # Slack notification (safe)
    # -------------------------
    try:
        send_slack_message(":white_check_mark: Citywise weather statistics streaming to Gold Layer.")
    except Exception as slack_err:
        print(f" Slack notification failed: {slack_err}")

    # -------------------------
    # Display results (will update in streaming)
    # -------------------------
    display(df_citywise_stats)

except Exception as e:
    error_message = f":x: Error while computing citywise stats: {str(e)}"
    try:
        send_slack_message(error_message)
    except Exception as slack_err:
        print(f" Slack notification failed: {slack_err}")
    raise

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

In [0]:

# Streams per-city max/min/avg of the numeric weather columns from Silver
# into weather_catalog.gold.citywise_stats. Slack notifications go through
# safe_slack so that a Slack outage can neither fail the ETL nor hide the
# original exception.
try:
    # -------------------------
    # Read Silver Layer as STREAM
    # -------------------------
    # skipChangeCommits is a Delta source option, so it is set on the reader
    # rather than on the writer.
    df_silver = (
        spark.readStream.format("delta")
        .option("skipChangeCommits", "true")
        .table("weather_catalog.raw.silver_table")
    )

    # -------------------------
    # Numeric columns to aggregate
    # -------------------------
    numeric_cols = [
        "maxtempC", "mintempC", "totalSnow_cm", "sunHour", "uvIndex", "moon_illumination",
        "DewPointC", "FeelsLikeC", "HeatIndexC", "WindChillC", "WindGustKmph",
        "cloudcover", "humidity", "precipMM", "pressure", "tempC", "visibility",
        "winddirDegree", "windspeedKmph"
    ]

    # -------------------------
    # Build aggregation expressions: <col>_max, <col>_min, <col>_avg per column
    # -------------------------
    aggregations = [
        expr
        for c in numeric_cols
        for expr in (
            F.max(c).alias(f"{c}_max"),
            F.min(c).alias(f"{c}_min"),
            F.avg(c).alias(f"{c}_avg"),
        )
    ]

    # -------------------------
    # Group by City
    # -------------------------
    df_citywise_stats = df_silver.groupBy("City").agg(*aggregations)

    # -------------------------
    # Write streaming output to Gold Layer
    # -------------------------
    (
        df_citywise_stats.writeStream
        .format("delta")
        .outputMode("complete")
        .option("checkpointLocation", f"{checkpoint_path}/gold_citywise")
        .toTable("weather_catalog.gold.citywise_stats")
    )

    # -------------------------
    # Slack notification (best effort)
    # -------------------------
    safe_slack("Citywise weather statistics streaming to Gold Layer.", level="INFO")

    # -------------------------
    # Display results (updates as stream progresses)
    # -------------------------
    display(df_citywise_stats)

except Exception as e:
    # safe_slack never raises, so the original exception is always the one
    # that propagates.
    safe_slack(f"Error while computing citywise stats: {str(e)}", level="ERROR")
    raise

Databricks visualization. Run in Databricks to view.

In [0]:
from pyspark.sql import functions as F


# Streams per-city summary statistics from Silver into
# weather_catalog.gold.city_insights and displays the top city for a set of
# categories. Errors are reported to Slack (best effort) and re-raised.
try:
    # -------------------------
    # Read Silver Layer as STREAM
    # -------------------------
    # skipChangeCommits is a Delta source option, so it is set on the reader
    # rather than on the writer.
    df_silver = (
        spark.readStream.format("delta")
        .option("skipChangeCommits", "true")
        .table("weather_catalog.raw.silver_table")
    )

    # -------------------------
    # Group by city and calculate aggregate stats
    # -------------------------
    city_stats = df_silver.groupBy("City").agg(
        F.max("maxtempC").alias("max_temp"),
        F.min("mintempC").alias("min_temp"),
        F.sum("precipMM").alias("total_rain"),
        F.avg("humidity").alias("avg_humidity"),
        F.max("humidity").alias("max_humidity"),
        F.min("humidity").alias("min_humidity"),
        F.max("windspeedKmph").alias("max_wind"),
        F.min("windspeedKmph").alias("min_wind"),
        F.max("sunHour").alias("max_sun"),
        F.max("cloudcover").alias("max_cloud"),
        F.max("visibility").alias("max_visibility"),
        F.min("visibility").alias("min_visibility"),
        F.max("pressure").alias("max_pressure"),
        F.min("pressure").alias("min_pressure")
    )

    # -------------------------
    # Write streaming results to Gold Layer
    # -------------------------
    (
        city_stats.writeStream
        .format("delta")
        .outputMode("complete")
        .option("checkpointLocation", f"{checkpoint_path}/gold_city_insights")
        .toTable("weather_catalog.gold.city_insights")
    )

    # -------------------------
    # Pick top city for each category
    # -------------------------
    # (title, ranking column, descending?) — one row per insight.
    insight_specs = [
        ("Hottest City", "max_temp", True),
        ("Coolest City", "min_temp", False),
        ("Most Rainfall City", "total_rain", True),
        ("Most Humid City", "max_humidity", True),
        ("Driest City", "min_humidity", False),
        ("Windiest City", "max_wind", True),
        ("Calmest City", "min_wind", False),
        ("Sunniest City", "max_sun", True),
        ("Cloudiest City", "max_cloud", True),
        ("Best Visibility City", "max_visibility", True),
        ("Worst Visibility City", "min_visibility", False),
        ("Highest Pressure City", "max_pressure", True),
        ("Lowest Pressure City", "min_pressure", False),
    ]
    insights = {
        title: city_stats.orderBy(
            F.desc(rank_col) if descending else F.col(rank_col)
        ).limit(1)
        for title, rank_col, descending in insight_specs
    }

    # -------------------------
    # Show insights (streaming updates)
    # -------------------------
    for title, top_city_df in insights.items():
        print(f"\n🔹 {title}:")
        display(top_city_df)

    # -------------------------
    # Slack success notification (guarded: a Slack failure must not be
    # reported as an ETL failure once the streams are running)
    # -------------------------
    try:
        send_slack_message("City-level weather insights streaming successfully.", level="INFO")
    except Exception as slack_err:
        print(f" Slack notification failed: {slack_err}")

except Exception as e:
    try:
        send_slack_message(f"Error while computing city-level insights: {str(e)}", level="ERROR")
    except Exception as slack_err:
        print(f" Slack notification failed: {slack_err}")
    raise


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col, year, avg, min, max, sum
from pyspark.sql.window import Window

gold_base_path = "weather_catalog.gold"
checkpoint_base = "s3://weather-streaming-proj-bucket/gold_checkpoints"

# For each year, streams the single top-ranked city per metric (coolest,
# hottest, rainiest, ...) from Silver into one Gold table per metric.
# Errors are reported to Slack (best effort) and re-raised.
try:
    # -------------------------
    # Read Silver table as streaming
    # -------------------------
    df_silver_stream = spark.readStream.format("delta").table("weather_catalog.raw.silver_table")

    # -------------------------
    # Add year column
    # -------------------------
    # NOTE(review): another cell in this notebook parses date_time with the
    # "dd-MM-yyyy HH:mm" pattern before extracting the year. If date_time is
    # a string in that format, year() applied directly may not parse it —
    # confirm the column type in the Silver schema.
    df_yearly = df_silver_stream.withColumn("year", F.year(F.col("date_time")))

    # -------------------------
    # Function to compute metrics and write as streaming Delta
    # -------------------------
    def write_gold_stream(df, agg_col, agg_func, table_name, desc=False):
        """Start a stream that keeps the top city per year for one metric.

        Args:
            df: Streaming DataFrame containing "year", "City" and ``agg_col``.
            agg_col: Name of the column to aggregate.
            agg_func: Aggregate function applied to the column (e.g. F.min).
            table_name: Target table name under ``gold_base_path``; also used
                as the checkpoint sub-directory under ``checkpoint_base``.
            desc: Rank by the aggregate descending when True, ascending
                otherwise.

        Returns:
            The StreamingQuery started for this metric.
        """
        metric = f"{agg_col}_agg"
        target_table = f"{gold_base_path}.{table_name}"

        df_agg = df.groupBy("year", "City").agg(agg_func(F.col(agg_col)).alias(metric))

        ordering = F.col(metric).desc() if desc else F.col(metric)
        per_year = Window.partitionBy("year").orderBy(ordering)

        def _write_top_city(batch_df, batch_id):
            # Non-time-based windows are rejected on streaming DataFrames, so
            # the ranking is applied here, where batch_df is a regular batch
            # DataFrame. In complete mode every batch carries the full
            # aggregate, hence the overwrite of the target table.
            (
                batch_df
                .withColumn("rank", F.row_number().over(per_year))
                .filter(F.col("rank") == 1)
                .drop("rank")
                .write.format("delta")
                .mode("overwrite")
                .saveAsTable(target_table)
            )

        return (
            df_agg.writeStream
            .outputMode("complete")
            .option("checkpointLocation", f"{checkpoint_base}/{table_name}")
            .foreachBatch(_write_top_city)
            .start()
        )

    # -------------------------
    # Year-wise metrics streaming
    # -------------------------
    write_gold_stream(df_yearly, "mintempC", F.min, "coolest_city")
    write_gold_stream(df_yearly, "maxtempC", F.max, "hottest_city", desc=True)
    write_gold_stream(df_yearly, "precipMM", F.sum, "rainiest_city", desc=True)
    write_gold_stream(df_yearly, "humidity", F.avg, "most_humid_city", desc=True)
    write_gold_stream(df_yearly, "sunHour", F.avg, "sunniest_city", desc=True)
    write_gold_stream(df_yearly, "windspeedKmph", F.avg, "windiest_city", desc=True)

    # -------------------------
    # Slack success notification (guarded: a Slack failure must not be
    # reported as an ETL failure once the streams are running)
    # -------------------------
    try:
        send_slack_message(" Gold layer streaming started successfully.", level="INFO")
    except Exception as slack_err:
        print(f" Slack notification failed: {slack_err}")

except Exception as e:
    try:
        send_slack_message(f" Error during Gold layer streaming ETL: {str(e)}", level="ERROR")
    except Exception as slack_err:
        print(f" Slack notification failed: {slack_err}")
    raise


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.