In [0]:
%run /Workspace/Users/sireeshabyreddy96@gmail.com/real-time-weather-pipeline/Medalian_notebooks/Gold_Layer


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.sql.utils import AnalysisException

# -------------------------------
# Define test data (sample)
# -------------------------------
# Two sample weather rows (one per city), positionally aligned with the 25
# fields of `schema` below. Every numeric value is written as a float literal
# because the schema declares those columns as DoubleType, and PySpark's schema
# verification does not accept Python ints for DoubleType fields.
test_data = [
    ("2025-08-23 06:00", 35.0, 25.0, 0.0, 10.0, 5.0, 45.0, "06:00 AM", "06:30 PM", "06:00 AM", "06:30 PM", 20.0, 35.0, 36.0, 19.0, 30.0, 50.0, 60.0, 0.0, 1010.0, 30.0, 10.0, 180.0, 15.0, "Hyderabad"),
    ("2025-08-23 06:00", 30.0, 22.0, 0.0, 9.0, 4.0, 40.0, "06:10 AM", "06:40 PM", "06:10 AM", "06:40 PM", 18.0, 30.0, 32.0, 17.0, 25.0, 45.0, 55.0, 0.0, 1012.0, 28.0, 12.0, 190.0, 10.0, "Bangalore")
]

# Schema for the sample rows: one nullable field per column, in positional
# order. Text-valued columns are strings; all measurements are doubles.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in [
        ("date_time", StringType),
        ("maxtempC", DoubleType),
        ("mintempC", DoubleType),
        ("totalSnow_cm", DoubleType),
        ("sunHour", DoubleType),
        ("uvIndex", DoubleType),
        ("moon_illumination", DoubleType),
        ("moonrise", StringType),
        ("moonset", StringType),
        ("sunrise", StringType),
        ("sunset", StringType),
        ("DewPointC", DoubleType),
        ("FeelsLikeC", DoubleType),
        ("HeatIndexC", DoubleType),
        ("WindChillC", DoubleType),
        ("WindGustKmph", DoubleType),
        ("cloudcover", DoubleType),
        ("humidity", DoubleType),
        ("precipMM", DoubleType),
        ("pressure", DoubleType),
        ("tempC", DoubleType),
        ("visibility", DoubleType),
        ("winddirDegree", DoubleType),
        ("windspeedKmph", DoubleType),
        ("City", StringType),
    ]
])

# Materialise the sample rows as a DataFrame and parse the textual
# `date_time` values ("yyyy-MM-dd HH:mm") into a timestamp column.
df_test = (
    spark.createDataFrame(test_data, schema)
    .withColumn("date_time", F.to_timestamp(F.col("date_time"), "yyyy-MM-dd HH:mm"))
)

# -------------------------------
# Bronze Layer Tests
# -------------------------------
# Schema check: every column named in the sample `schema` must exist in the
# Bronze table. The outcome is printed and forwarded via send_slack_message
# (presumably provided by the notebook pulled in with %run).
try:
    bronze_table = "weather_catalog.raw.weather_bronze"
    df_bronze = spark.table(bronze_table)
    assert {field.name for field in schema.fields}.issubset(df_bronze.columns)
    msg = " Bronze layer schema validation passed"
    print(msg)
    send_slack_message(msg, level="INFO")
except AnalysisException:
    # Spark could not resolve the table.
    msg = f" Bronze table {bronze_table} not found"
    print(msg)
    send_slack_message(msg, level="ERROR")
except AssertionError:
    # At least one expected column is absent from the Bronze table.
    msg = " Bronze layer schema mismatch"
    print(msg)
    send_slack_message(msg, level="ERROR")



In [0]:
# -------------------------------
# Silver Layer Tests
# -------------------------------
# Validates the Silver table: the checked columns exist, per-column null counts
# are displayed, and the temperature / City sanity checks pass. Outcomes are
# printed and forwarded via send_slack_message (presumably provided by the
# notebook pulled in with %run).
# NOTE(review): the Silver table is read from the `raw` schema, same as Bronze;
# confirm that location is intended.
try:
    silver_table = "weather_catalog.raw.silver_table"
    df_silver = spark.table(silver_table)

    # Verify the columns used below exist before querying them. Referencing an
    # unknown column raises AnalysisException as well, which the handler at the
    # bottom would otherwise misreport as the table not being found.
    checked_cols = ["maxtempC", "mintempC", "tempC", "City"]
    missing_cols = [c for c in checked_cols if c not in df_silver.columns]
    assert not missing_cols, f"Missing expected columns: {missing_cols}"

    # Null checks (informational only: counts are shown, not asserted)
    null_count = df_silver.select([
        F.count(F.when(F.col(c).isNull(), c)).alias(c)
        for c in checked_cols
    ])
    print("Null Check Results (Silver Layer):")
    null_count.show()
    send_slack_message(" Silver layer null check completed", level="INFO")

    # Invalid test: maxtempC must not be below -100 (implausibly extreme value)
    invalid_temp = df_silver.filter(F.col("maxtempC") < -100).count()
    assert invalid_temp == 0, "Invalid extreme temperatures found"
    print(" Silver layer extreme temp validation passed")

    # Invalid test: city names must not be null
    null_cities = df_silver.filter(F.col("City").isNull()).count()
    assert null_cities == 0, "City column contains nulls"
    print(" Silver layer city null validation passed")

except AssertionError as e:
    msg = f" Silver layer validation failed: {str(e)}"
    print(msg)
    send_slack_message(msg, level="ERROR")
except AnalysisException:
    # Leading space added to match every other layer's status message.
    msg = f" Silver table {silver_table} not found"
    print(msg)
    send_slack_message(msg, level="ERROR")



In [0]:
# -------------------------------
# Gold Layer Tests (Aggregations)
# -------------------------------
# Validates the weekly Gold aggregate: the maxtempC aggregate columns exist and
# the average stays inside the [-100, 70] range. Outcomes are printed and
# forwarded via send_slack_message (presumably provided by the %run notebook).
try:
    gold_weekly_table = "weather_catalog.gold.weather_weekly"
    df_gold_weekly = spark.table(gold_weekly_table)

    expected_cols = ["maxtempC_avg", "maxtempC_max", "maxtempC_min"]
    # Name the absent columns so the failure message carries a usable detail
    # (a bare assert would produce an empty reason in the Slack alert).
    missing_cols = [name for name in expected_cols if name not in df_gold_weekly.columns]
    assert not missing_cols, f"Missing expected columns: {missing_cols}"

    # Invalid test: average temperature must be within realistic range
    invalid_avg = df_gold_weekly.filter(
        (F.col("maxtempC_avg") < -100) | (F.col("maxtempC_avg") > 70)
    ).count()
    assert invalid_avg == 0, "Unrealistic average temperatures found"

    msg = " Gold layer weekly aggregation validation passed"
    print(msg)
    send_slack_message(msg, level="INFO")

except AssertionError as e:
    msg = f" Gold weekly aggregation validation failed: {str(e)}"
    print(msg)
    send_slack_message(msg, level="ERROR")
except AnalysisException:
    msg = f" Gold table {gold_weekly_table} not found"
    print(msg)
    send_slack_message(msg, level="ERROR")

# -------------------------------
# Citywise Insights Tests
# -------------------------------
# Validates the city-level Gold table: the City and maxtempC_max columns exist
# and no row has a null City. Outcomes are printed and forwarded via
# send_slack_message (presumably provided by the %run notebook).
try:
    citywise_table = "weather_catalog.gold.citywise_stats"
    df_citywise = spark.table(citywise_table)

    # Name the absent columns so the failure message carries a usable detail
    # (a bare assert would produce an empty reason in the Slack alert).
    missing_cols = [name for name in ("City", "maxtempC_max") if name not in df_citywise.columns]
    assert not missing_cols, f"Missing expected columns: {missing_cols}"

    # Invalid test: no row may have a null City
    missing_city = df_citywise.filter(F.col("City").isNull()).count()
    assert missing_city == 0, "Citywise stats contain null City values"

    msg = " Citywise Gold stats validation passed"
    print(msg)
    send_slack_message(msg, level="INFO")

except AssertionError as e:
    msg = f" Citywise validation failed: {str(e)}"
    print(msg)
    send_slack_message(msg, level="ERROR")
except AnalysisException:
    msg = f" Citywise table {citywise_table} not found"
    print(msg)
    send_slack_message(msg, level="ERROR")

# -------------------------------
# Display Test Data & Results
# -------------------------------
# Shows the sample data and each layer's DataFrame on a best-effort basis.
# A DataFrame variable may be undefined here when its table failed to load in
# the corresponding test above (NameError), so each display is guarded. The
# guards catch Exception rather than using a bare `except:` so that
# KeyboardInterrupt / SystemExit still propagate, and the cause is printed.
print("\n=== Test Data ===")
display(df_test)

print("\n=== Silver Table Sample ===")
try:
    display(df_silver)
except Exception as exc:
    print(f"Silver table not available ({type(exc).__name__}: {exc})")

print("\n=== Gold Weekly Table Sample ===")
try:
    display(df_gold_weekly)
except Exception as exc:
    print(f"Gold weekly table not available ({type(exc).__name__}: {exc})")

print("\n=== Citywise Table Sample ===")
try:
    display(df_citywise)
except Exception as exc:
    print(f"Citywise table not available ({type(exc).__name__}: {exc})")

# Sent unconditionally: it signals that the run finished, not that every
# check passed (failures are reported by the individual tests above).
final_msg = " All Medallion ETL layer tests completed"
print(final_msg)
send_slack_message(final_msg, level="INFO")
