In [0]:
%run /Users/sireeshabyreddy96@gmail.com/real-time-weather-pipeline/Medalian_notebooks/Slack_utils

In [0]:
from pyspark.sql.functions import col, regexp_replace, split, explode, from_json, when
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
import requests
import json


# -----------------------------
# Bronze ingest for the 04 hour partition.
# Reads the raw S3 text files, splits concatenated JSON objects into one row
# each, parses them with weather_schema and casts the numeric fields to double.
# Result: df_parsed (used by the merge cell further down).
# -----------------------------
path = "s3://weather-streaming-proj-bucket/2025/08/22/04/*"

# Every field is parsed as a string first; numeric fields are validated and
# cast in Step 7 so that a malformed value becomes NULL instead of an error.
weather_schema = StructType([
    StructField("date_time", StringType(), True),
    StructField("maxtempC", StringType(), True),
    StructField("mintempC", StringType(), True),
    StructField("totalSnow_cm", StringType(), True),
    StructField("sunHour", StringType(), True),
    StructField("uvIndex", StringType(), True),
    StructField("moon_illumination", StringType(), True),
    StructField("moonrise", StringType(), True),
    StructField("moonset", StringType(), True),
    StructField("sunrise", StringType(), True),
    StructField("sunset", StringType(), True),
    StructField("DewPointC", StringType(), True),
    StructField("FeelsLikeC", StringType(), True),
    StructField("HeatIndexC", StringType(), True),
    StructField("WindChillC", StringType(), True),
    StructField("WindGustKmph", StringType(), True),
    StructField("cloudcover", StringType(), True),
    StructField("humidity", StringType(), True),
    StructField("precipMM", StringType(), True),
    StructField("pressure", StringType(), True),
    StructField("tempC", StringType(), True),
    StructField("visibility", StringType(), True),
    StructField("winddirDegree", StringType(), True),
    StructField("windspeedKmph", StringType(), True),
    StructField("City", StringType(), True)
])

# Columns that are cast from string to double in Step 7.
numeric_cols = ["maxtempC","mintempC","totalSnow_cm","sunHour","uvIndex",
                "DewPointC","FeelsLikeC","HeatIndexC","WindChillC",
                "WindGustKmph","cloudcover","humidity","precipMM","pressure",
                "tempC","visibility","winddirDegree","windspeedKmph"]

# Optionally signed integer or decimal. The leading "-?" matters: without it
# every sub-zero reading (mintempC, WindChillC, DewPointC, ...) fails the
# check and is silently replaced by NULL.
numeric_pattern = r"^-?\d+(\.\d+)?$"

# Name of the step currently running ("" for steps without a dedicated label).
# The single handler at the bottom uses it, so each failure produces exactly
# one Slack alert instead of a step-specific alert followed by a generic one.
failed_step = ""

try:
    # -----------------------------
    # Step 1: Read S3 files
    # -----------------------------
    failed_step = "S3 Read "
    df_raw = spark.read.text(path)
    failed_step = ""

    # -----------------------------
    # Step 2: Clean newlines
    # -----------------------------
    df_clean = df_raw.withColumn("value", regexp_replace(col("value"), r"\n", ""))

    # -----------------------------
    # Step 3: Split concatenated JSON {...}{...} into array
    # A "|||" sentinel is inserted at every "}{" boundary and then split on.
    # -----------------------------
    df_split = df_clean.withColumn("json_str", split(regexp_replace(col("value"), r"}\{", "}|||{"), r"\|\|\|"))

    # -----------------------------
    # Step 4: Explode into rows (one JSON object per row)
    # -----------------------------
    df_exploded = df_split.withColumn("json_str", explode(col("json_str")))

    # -----------------------------
    # Step 5: Keep only valid JSON-looking rows
    # -----------------------------
    df_valid = df_exploded.filter(col("json_str").rlike(r"^\s*\{.*\}\s*$"))

    # -----------------------------
    # Step 6: Parse into structured columns
    # NOTE(review): transformations are lazy, so this step only fails here for
    # plan/analysis problems; data-dependent failures surface at Step 8.
    # -----------------------------
    failed_step = "JSON Parsing "
    df_json = df_valid.withColumn("parsed", from_json(col("json_str"), weather_schema)).select("parsed.*")

    # -----------------------------
    # Step 7: Cast numeric columns safely
    # One select() instead of a withColumn() loop keeps the query plan flat;
    # column order is unchanged and non-numeric columns pass through as-is.
    # -----------------------------
    failed_step = "Numeric Casting "
    df_parsed = df_json.select(*[
        when(col(name).rlike(numeric_pattern), col(name).cast(DoubleType())).otherwise(None).alias(name)
        if name in numeric_cols else col(name)
        for name in df_json.columns
    ])
    failed_step = ""

    # -----------------------------
    # Step 8: Display and log
    # -----------------------------
    display(df_parsed)
    df_parsed.printSchema()
    total_records = df_parsed.count()
    print("Total Bronze records:", total_records)

    # -----------------------------
    # Step 9: Success notification
    # -----------------------------
    send_slack_message(f" Bronze Layer Success: {total_records} records processed.")

except Exception as e:
    # Single notification point: step-specific label when one is set,
    # otherwise the generic " Bronze Layer Failed" message.
    send_slack_message(f" Bronze Layer {failed_step}Failed: {str(e)}")
    raise


In [0]:
from pyspark.sql.functions import col, regexp_replace, split, explode, from_json, when
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
import requests
import json

# -----------------------------
# Bronze ingest for the 01 hour partition.
# Same pipeline as the 04 hour cell: read raw S3 text, split concatenated JSON
# objects into rows, parse with weather_schema (defined by the earlier Bronze
# cell) and cast numeric fields to double.
# Result: df_parsed1 (used by the merge cell further down).
# -----------------------------
path = "s3://weather-streaming-proj-bucket/2025/08/22/01/*"

# Columns that are cast from string to double in Step 7.
numeric_cols = ["maxtempC","mintempC","totalSnow_cm","sunHour","uvIndex",
                "DewPointC","FeelsLikeC","HeatIndexC","WindChillC",
                "WindGustKmph","cloudcover","humidity","precipMM","pressure",
                "tempC","visibility","winddirDegree","windspeedKmph"]

# Optionally signed integer or decimal. The leading "-?" matters: without it
# every sub-zero reading (mintempC, WindChillC, DewPointC, ...) fails the
# check and is silently replaced by NULL.
numeric_pattern = r"^-?\d+(\.\d+)?$"

# Name of the step currently running ("" for steps without a dedicated label).
# The single handler at the bottom uses it, so each failure produces exactly
# one Slack alert instead of a step-specific alert followed by a generic one.
failed_step = ""

try:
    # -----------------------------
    # Step 1: Read raw files
    # -----------------------------
    failed_step = "S3 Read "
    df_raw = spark.read.text(path)
    failed_step = ""

    # -----------------------------
    # Step 2: Clean newlines
    # -----------------------------
    df_clean = df_raw.withColumn("value", regexp_replace(col("value"), r"\n", ""))

    # -----------------------------
    # Step 3: Split concatenated JSON
    # A "|||" sentinel is inserted at every "}{" boundary and then split on.
    # -----------------------------
    df_split = df_clean.withColumn(
        "json_str",
        split(regexp_replace(col("value"), r"}\{", "}|||{"), r"\|\|\|")
    )

    # -----------------------------
    # Step 4: Explode rows (one JSON object per row)
    # -----------------------------
    df_exploded = df_split.withColumn("json_str", explode(col("json_str")))

    # -----------------------------
    # Step 5: Keep valid JSON
    # -----------------------------
    df_valid = df_exploded.filter(col("json_str").rlike(r"^\s*\{.*\}\s*$"))

    # -----------------------------
    # Step 6: Parse JSON using schema
    # NOTE(review): transformations are lazy, so this step only fails here for
    # plan/analysis problems; data-dependent failures surface at Step 8.
    # -----------------------------
    failed_step = "JSON Parsing "
    df_json1 = df_valid.withColumn("parsed", from_json(col("json_str"), weather_schema)).select("parsed.*")

    # -----------------------------
    # Step 7: Cast numeric columns safely
    # One select() instead of a withColumn() loop keeps the query plan flat;
    # column order is unchanged and non-numeric columns pass through as-is.
    # -----------------------------
    failed_step = "Numeric Casting "
    df_parsed1 = df_json1.select(*[
        when(col(name).rlike(numeric_pattern), col(name).cast(DoubleType())).otherwise(None).alias(name)
        if name in numeric_cols else col(name)
        for name in df_json1.columns
    ])
    failed_step = ""

    # -----------------------------
    # Step 8: Show results
    # -----------------------------
    display(df_parsed1)
    df_parsed1.printSchema()
    total_records = df_parsed1.count()
    print("Total Bronze records:", total_records)

    # -----------------------------
    # Step 9: Slack success notification
    # -----------------------------
    send_slack_message(f" Bronze Layer Success: {total_records} records processed.")

except Exception as e:
    # Single notification point: step-specific label when one is set,
    # otherwise the generic " Bronze Layer Failed" message.
    send_slack_message(f" Bronze Layer {failed_step}Failed: {str(e)}")
    raise


In [0]:
try:
    # Stack the two hourly Bronze frames, matching columns by name; a column
    # present in only one frame is allowed (allowMissingColumns=True).
    df_merged = df_parsed.unionByName(df_parsed1, allowMissingColumns=True)

    # Show the combined data and its schema in the cell output.
    display(df_merged)
    df_merged.printSchema()

    # Row count of the merged frame, printed and reported to Slack.
    total_records = df_merged.count()
    print("Total records after merge:", total_records)

    merge_success_text = f" Bronze Layer Merge Success: {total_records} total records."
    send_slack_message(merge_success_text)

except Exception as merge_error:
    # Report the failure to Slack, then re-raise so the notebook run fails.
    merge_failure_text = f" Bronze Layer Merge Failed: {str(merge_error)}"
    send_slack_message(merge_failure_text)
    raise merge_error


In [0]:
# Fully qualified name of the Bronze Delta table written by this cell.
bronze_table = "weather_catalog.raw.weather_bronze"

try:
    # Write merged DataFrame to Bronze Delta table (replaces existing contents)
    df_merged.write.format("delta").mode("overwrite").saveAsTable(bronze_table)

    # Read the table back and report on what was actually persisted.
    # Displaying/counting df_merged here would re-run the lazy S3 wildcard
    # read, so the number could differ from the table if files landed in the
    # bucket after the write.
    df_written = spark.table(bronze_table)
    display(df_written)
    total_records = df_written.count()
    print("Total records written to Bronze table:", total_records)

    # Slack notification for successful write
    send_slack_message(f" Bronze Layer Written Successfully: {total_records} records in {bronze_table}.")

except Exception as write_error:
    # Slack notification for write failure, then re-raise so the run fails
    send_slack_message(f" Bronze Layer Write Failed: {str(write_error)}")
    raise


In [0]:
# Print the column names and types of the merged DataFrame as a final check.
df_merged.printSchema()