In [0]:
from pyspark.sql.functions import col, current_timestamp, lit, monotonically_increasing_id
import uuid
import datetime

# =============================================================
# Configuration
# =============================================================
# Identifier for this run: a random UUID (uuid4) rendered as a string.
batch_id = str(uuid.uuid4())

# Fully qualified table names (three dot-separated parts each).
bronze_table = "weather_catalog.raw.weather_bronze"             # input read in Step 1
silver_table = "weather_catalog.processed.valid_readings"       # target for rows that pass validation
dlq_table = "weather_catalog.processed.dlq_silver"              # target for rejected rows
log_table = "weather_catalog.logging.silver_ingestion_logs"     # target for the per-run audit row

# =============================================================
# Step 1: Load the Bronze table
# =============================================================
df_bronze = spark.table(bronze_table)

# =============================================================
# Step 2: Attach ingestion metadata to every row
#   ingest_time -> current_timestamp()
#   batch_id    -> this run's batch_id as a literal column
#   record_id   -> monotonically_increasing_id()
# =============================================================
df_enriched = (
    df_bronze
    .withColumn("ingest_time", current_timestamp())
    .withColumn("batch_id", lit(batch_id))
    .withColumn("record_id", monotonically_increasing_id())
)

# =============================================================
# Step 3: Validation / error handling
# A row is rejected when city or date_time is NULL; every other row is valid.
# Further checks can be OR-ed into the predicate below.
# =============================================================
missing_required_field = col("city").isNull() | col("date_time").isNull()

# Rejected rows go to the dead-letter frame, the complement goes to Silver.
# isNull() never evaluates to NULL itself, so negating the predicate selects
# exactly the rows where both fields are present.
df_dlq = df_enriched.filter(missing_required_field)
df_silver = df_enriched.filter(~missing_required_field)

# Ensure the schema that holds the audit-log table exists before writing.
# The schema name is derived from `log_table` ("<catalog>.<schema>.<table>")
# instead of being hard-coded, so CREATE SCHEMA always targets the schema that
# the audit write at the end of this cell actually uses.
log_schema = log_table.rsplit(".", 1)[0]
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {log_schema}")

# Step 4: Write Silver Data
# Append the validated rows to the Silver Delta table; the mergeSchema option
# is enabled on this write.
df_silver.write.format("delta") \
    .mode("append") \
    .option("mergeSchema", "true") \
    .saveAsTable(silver_table)

# Step 5: Write DLQ Data
# Count the rejected rows once: the same number guards the write (nothing is
# appended when there are no rejects) and is reported in the audit row below,
# which avoids running the count as a second Spark job.
row_count_dlq = df_dlq.count()
if row_count_dlq > 0:
    df_dlq.write.format("delta") \
        .mode("append") \
        .saveAsTable(dlq_table)

# Step 6: Logging / Audit
# A run is recorded as SUCCESS only when at least one row reached Silver.
row_count_silver = df_silver.count()
status = "SUCCESS" if row_count_silver > 0 else "FAILED"

# One audit row per run; ingest_time here is the driver's local clock
# (datetime.now()), not Spark's current_timestamp().
log_df = spark.createDataFrame(
    [(batch_id, bronze_table, silver_table, row_count_silver, row_count_dlq, status, datetime.datetime.now())],
    ["batch_id", "source_table", "target_table", "rows_ingested", "rows_failed", "status", "ingest_time"]
)

log_df.write.format("delta").mode("append").saveAsTable(log_table)

print(f"✅ Batch {batch_id} Completed | Silver: {row_count_silver}, DLQ: {row_count_dlq}")

In [0]:
from pyspark.sql.functions import col, current_timestamp, lit, monotonically_increasing_id
import uuid
import datetime

# =============================================================
# Configuration
# =============================================================
# Random UUID string that identifies this run.
batch_id = str(uuid.uuid4())

# Fully qualified names of the tables this notebook works with.
bronze_table = "weather_catalog.raw.weather_bronze"
silver_table = "weather_catalog.processed.valid_readings"
dlq_table = "weather_catalog.processed.dlq_silver"
log_table = "weather_catalog.logging.silver_ingestion_logs"

# =============================================================
# Step 1: Load the Bronze table and preview it
# =============================================================
df_bronze = spark.table(bronze_table)

print("✅ Bronze data loaded")
# First five rows with untruncated cell values, followed by the schema.
df_bronze.show(n=5, truncate=False)
df_bronze.printSchema()


In [0]:
from pyspark.sql.functions import sum

# Null tally per column: each column is replaced by the sum of a 0/1
# "is null" flag. `sum` is the pyspark.sql.functions.sum imported at the top
# of this cell, not the Python builtin.
bronze_columns = df_bronze.columns
null_flag_sums = [
    sum(col(name).isNull().cast("int")).alias(name)
    for name in bronze_columns
]
null_counts = df_bronze.select(null_flag_sums)
null_counts.show(truncate=False)

# Rows with a NULL in at least one column, selected through a SQL predicate of
# the form "<col1> IS NULL OR <col2> IS NULL OR ...".
any_null_predicate = " OR ".join(f"{name} IS NULL" for name in bronze_columns)
df_bronze.filter(any_null_predicate).display(10, truncate=False)

# Overall size of the Bronze table
print(f"Total rows in Bronze table: {df_bronze.count()}")


In [0]:
# Number of surplus copies when rows are compared on every column:
# the full row count minus the row count after dropDuplicates().
total_rows = df_bronze.count()
distinct_rows = df_bronze.dropDuplicates().count()
duplicate_count = total_rows - distinct_rows
print(f"Total duplicate rows: {duplicate_count}")

# Optional: list every distinct row that occurs more than once, together with
# its occurrence count.
rows_by_frequency = df_bronze.groupBy(df_bronze.columns).count()
rows_by_frequency.filter("count > 1").display(10, truncate=False)


In [0]:
# =============================================================
# Step 2: Attach ingestion metadata to every Bronze row
#   ingest_time -> current_timestamp()
#   batch_id    -> this run's batch_id as a literal column
#   record_id   -> monotonically_increasing_id()
# =============================================================
df_enriched = (
    df_bronze
    .withColumn("ingest_time", current_timestamp())
    .withColumn("batch_id", lit(batch_id))
    .withColumn("record_id", monotonically_increasing_id())
)

df_enriched.display(5)


In [0]:
# =============================================================
# Step 3: Validation / error handling
# A row is rejected when city or date_time is NULL; every other row is valid.
# Further checks can be OR-ed into the predicate below.
# =============================================================
missing_required_field = col("city").isNull() | col("date_time").isNull()

# isNull() never evaluates to NULL itself, so the negated predicate selects
# exactly the rows where both fields are present.
df_dlq = df_enriched.filter(missing_required_field)
df_silver = df_enriched.filter(~missing_required_field)

# Report the size of each split (one Spark count per line, valid rows first).
valid_row_count = df_silver.count()
print(f" Valid rows: {valid_row_count}")
rejected_row_count = df_dlq.count()
print(f" Rows sent to DLQ: {rejected_row_count}")


In [0]:
# Example cleaning processes: trim strings, drop duplicate readings, then fill
# missing numeric values.
from pyspark.sql.functions import trim
from pyspark.sql.types import NumericType, StringType

# Columns added by the enrichment step rather than coming from the source data.
# record_id is produced by monotonically_increasing_id() and therefore differs
# on every row, so it must be left out of the duplicate comparison; otherwise
# no two rows could ever compare equal.
metadata_cols = {"ingest_time", "batch_id", "record_id"}

# Trim leading/trailing spaces from string columns.
# Columns are selected by their actual top-level type: comparing a DataType to
# the text "StringType" is never true, and substring matching on the type's
# repr also picks up arrays/maps/structs that merely contain strings, which
# trim() cannot process.
string_cols = [f.name for f in df_silver.schema.fields if isinstance(f.dataType, StringType)]
for c in string_cols:
    df_silver = df_silver.withColumn(c, trim(col(c)))

# Remove duplicates from Silver.
# Done after trimming so rows that differ only by padding collapse together,
# and compared on the source columns only (see metadata_cols above).
dedupe_cols = [c for c in df_silver.columns if c not in metadata_cols]
if dedupe_cols:
    df_silver = df_silver.dropDuplicates(dedupe_cols)
else:
    # Nothing but metadata columns present: fall back to an all-column comparison.
    df_silver = df_silver.dropDuplicates()

# Fill missing numeric values with 0.
# isinstance(..., NumericType) covers every top-level numeric column (integer,
# long, float, double, decimal, ...); matching "int"/"double" in the type's repr
# skipped LongType/FloatType/DecimalType columns and matched nested types.
# NOTE(review): a filled 0 cannot be told apart from a genuine 0 reading —
# confirm this is acceptable for the Silver consumers.
numeric_cols = [
    f.name
    for f in df_silver.schema.fields
    if isinstance(f.dataType, NumericType) and f.name not in metadata_cols
]
if numeric_cols:
    # Single fillna call for all numeric columns instead of one per column.
    df_silver = df_silver.fillna(0, subset=numeric_cols)

print("✅ Cleaning completed")


In [0]:
# Ensure the logging schema exists.
# The name is taken from `log_table` ("<catalog>.<schema>.<table>") so it
# always matches the schema the audit table is configured to live in.
log_schema = log_table.rsplit(".", 1)[0]
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {log_schema}")

# Step 4: Write Silver Data
# Append the cleaned, validated rows to the Silver Delta table; the mergeSchema
# option is enabled on this write.
df_silver.write.format("delta") \
    .mode("append") \
    .option("mergeSchema", "true") \
    .saveAsTable(silver_table)

# Step 5: Write DLQ Data
# Count the rejected rows once and reuse the number for both the write guard
# and the summary line, instead of running the count as two Spark jobs.
row_count_dlq = df_dlq.count()
if row_count_dlq > 0:
    df_dlq.write.format("delta") \
        .mode("append") \
        .saveAsTable(dlq_table)

print(f"✅ Data written | Silver: {df_silver.count()}, DLQ: {row_count_dlq}")
