Environment

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import dlt

# Environment configuration
# spark.conf.get() raises when the key is absent and no default is supplied,
# so the default is passed explicitly; the trailing `or "dev"` still covers a
# key that is present but set to an empty string.
env = spark.conf.get("pipeline.env", "dev") or "dev"
catalog = "book_rec_catalog"

# Schema names (prefixed with the environment, e.g. "dev_bronze")
bronze_schema = f"{env}_bronze"
silver_schema = f"{env}_silver"

Silver layer

In [0]:
# Simple Silver layer - just clean the data
@dlt.table(name=f"{silver_schema}.books_silver", comment="Silver table for cleaned Books data")
@dlt.expect("valid_isbn", "ISBN IS NOT NULL AND ISBN != ''")
@dlt.expect("valid_book_title", "`Book-Title` IS NOT NULL AND `Book-Title` != ''")
@dlt.expect("valid_book_author", "`Book-Author` IS NOT NULL AND `Book-Author` != ''")
def books_silver():
    """Build the cleaned Books table from ``{bronze_schema}.books_bronze``.

    Cleaning steps:
      * ``Year-Of-Publication`` is cast to int.
      * ``Publisher`` is trimmed; NULL, empty, or whitespace-only values become
        ``"Unknown"``.
      * ``Book-Title``, ``Book-Author`` and ``ISBN`` are trimmed.
      * ``cleaned_ts`` records the processing timestamp.

    NOTE(review): the expectations use ``dlt.expect``; confirm whether rows that
    fail them are meant to stay in this table or should be dropped.
    """
    # Trim before testing for emptiness so a whitespace-only publisher is
    # mapped to "Unknown" instead of ending up as an empty string.
    publisher = trim(col("Publisher"))

    return (
        spark.table(f"{bronze_schema}.books_bronze")
        # A plain cast is equivalent to the former
        # when(cast.isNotNull(), cast).otherwise(None) wrapper.
        .withColumn("Year-Of-Publication", col("Year-Of-Publication").cast("int"))
        .withColumn(
            "Publisher",
            when(publisher.isNotNull() & (publisher != ""), publisher).otherwise("Unknown"),
        )
        .withColumn("Book-Title", trim(col("Book-Title")))
        .withColumn("Book-Author", trim(col("Book-Author")))
        .withColumn("ISBN", trim(col("ISBN")))
        .withColumn("cleaned_ts", current_timestamp())
    )

@dlt.table(name=f"{silver_schema}.ratings_silver", comment="Silver table for cleaned Ratings data")
@dlt.expect("valid_user_id", "`User-ID` IS NOT NULL")
@dlt.expect("valid_isbn", "ISBN IS NOT NULL AND ISBN != ''")
@dlt.expect("valid_book_rating", "`Book-Rating` IS NOT NULL AND `Book-Rating` >= 0 AND `Book-Rating` <= 10")
def ratings_silver():
    """Build the cleaned Ratings table from ``{bronze_schema}.ratings_bronze``.

    ``Book-Rating`` is cast to int, ``ISBN`` is trimmed, and a ``cleaned_ts``
    processing timestamp is appended.
    """
    ratings_bronze = spark.table(f"{bronze_schema}.ratings_bronze")

    # Same result as when(cast.isNotNull(), cast).otherwise(None).
    rating_as_int = col("Book-Rating").cast("int")
    isbn_trimmed = trim(col("ISBN"))

    typed = ratings_bronze.withColumn("Book-Rating", rating_as_int)
    trimmed = typed.withColumn("ISBN", isbn_trimmed)
    return trimmed.withColumn("cleaned_ts", current_timestamp())

def _location_part(parts, position):
    """Return the trimmed, title-cased element at 1-based ``position`` of the
    array column ``parts``, or ``"Unknown"`` when it is absent or blank."""
    part = trim(element_at(parts, position))
    return when((size(parts) >= position) & (part != ""), initcap(part)).otherwise("Unknown")


@dlt.table(name=f"{silver_schema}.users_silver", comment="Silver table for cleaned Users data")
@dlt.expect("valid_user_id", "`User-ID` IS NOT NULL")
def users_silver():
    """Build the cleaned Users table from ``{bronze_schema}.users_bronze``.

    Cleaning steps:
      * ``Age`` is cast to int.
      * ``Location`` is trimmed; NULL, empty, or whitespace-only values become
        ``"Unknown"``.
      * ``Location`` is split on commas (after turning ``/`` separators into
        commas) and its first three parts are exposed as ``city``, ``region``
        and ``state``, each title-cased and defaulting to ``"Unknown"``.
      * ``cleaned_ts`` records the processing timestamp.
    """
    # Trim before testing for emptiness so whitespace-only locations are
    # treated as missing.
    location = trim(col("Location"))

    # Replace "/" with ", " unless the slash is the middle of a standalone
    # "n/a" token. The previous pattern (?<!n)/(?!a) skipped every slash that
    # merely followed an "n" or preceded an "a" (e.g. "london/england").
    slash_separator = r'(?<!\bn)/|/(?!a\b)'

    parts = col("location_parts")

    return (
        spark.table(f"{bronze_schema}.users_bronze")
        # A plain cast is equivalent to the former
        # when(cast.isNotNull(), cast).otherwise(None) wrapper.
        .withColumn("Age", col("Age").cast("int"))
        .withColumn(
            "Location",
            when(location.isNotNull() & (location != ""), location).otherwise("Unknown"),
        )
        .withColumn(
            "location_cleaned",
            when(
                col("Location") != "Unknown",
                regexp_replace(col("Location"), slash_separator, ", "),
            ).otherwise("Unknown"),
        )
        .withColumn(
            "location_parts",
            when(
                col("location_cleaned") != "Unknown",
                split(col("location_cleaned"), ","),
            ).otherwise(array(lit("Unknown"))),
        )
        .withColumn("city", _location_part(parts, 1))
        .withColumn("region", _location_part(parts, 2))
        .withColumn("state", _location_part(parts, 3))
        .drop("location_parts", "location_cleaned")  # Remove temporary columns
        .withColumn("cleaned_ts", current_timestamp())
    )

# Quarantine tables for problematic data
@dlt.table(name=f"{silver_schema}.books_quarantine", comment="Quarantine table for problematic Books data")
def books_quarantine():
    """Collect Books rows from ``{bronze_schema}.books_bronze`` that are missing
    an ISBN, title, or author.

    A value counts as missing when it is NULL or blank after trimming, which
    matches the silver table where these columns are trimmed before the
    expectations are evaluated. ``quarantine_reason`` names the single failing
    field, or ``"Multiple Issues"`` when more than one field is missing.
    """
    def is_blank(name):
        # NULL | NULL-comparison still evaluates to TRUE for NULL input, so the
        # flag is never NULL and can safely be cast to int below.
        value = col(name)
        return value.isNull() | (trim(value) == "")

    missing_isbn = is_blank("ISBN")
    missing_title = is_blank("Book-Title")
    missing_author = is_blank("Book-Author")

    # Number of failing checks per row; previously the "Multiple Issues"
    # branch was unreachable because the first matching when() always won.
    issue_count = (
        missing_isbn.cast("int")
        + missing_title.cast("int")
        + missing_author.cast("int")
    )

    return (
        spark.table(f"{bronze_schema}.books_bronze")
        .filter(missing_isbn | missing_title | missing_author)
        .withColumn(
            "quarantine_reason",
            when(issue_count > 1, "Multiple Issues")
            .when(missing_isbn, "Missing ISBN")
            .when(missing_title, "Missing Title")
            .when(missing_author, "Missing Author"),
        )
        .withColumn("quarantine_ts", current_timestamp())
    )

@dlt.table(name=f"{silver_schema}.ratings_quarantine", comment="Quarantine table for problematic Ratings data")
def ratings_quarantine():
    """Collect Ratings rows from ``{bronze_schema}.ratings_bronze`` that have a
    missing user id, a missing ISBN, or a missing/invalid rating.

    The checks mirror the silver table's cleaning: ISBN is tested after
    trimming and the rating after casting to int, so a rating that is present
    but not castable to an integer, or outside 0..10, is reported as
    ``"Invalid Rating"``. ``quarantine_reason`` is ``"Multiple Issues"`` when
    more than one check fails.
    """
    raw_rating = col("Book-Rating")
    rating = raw_rating.cast("int")

    missing_user = col("User-ID").isNull()
    missing_isbn = col("ISBN").isNull() | (trim(col("ISBN")) == "")
    missing_rating = raw_rating.isNull()
    # Guarded by isNotNull() so the flag is FALSE (not NULL) for missing
    # ratings and mutually exclusive with missing_rating.
    invalid_rating = raw_rating.isNotNull() & (
        rating.isNull() | (rating < 0) | (rating > 10)
    )

    # Number of failing checks per row; previously the "Multiple Issues"
    # branch was unreachable because the first matching when() always won.
    issue_count = (
        missing_user.cast("int")
        + missing_isbn.cast("int")
        + missing_rating.cast("int")
        + invalid_rating.cast("int")
    )

    return (
        spark.table(f"{bronze_schema}.ratings_bronze")
        .filter(missing_user | missing_isbn | missing_rating | invalid_rating)
        .withColumn(
            "quarantine_reason",
            when(issue_count > 1, "Multiple Issues")
            .when(missing_user, "Missing User ID")
            .when(missing_isbn, "Missing ISBN")
            .when(missing_rating, "Missing Rating")
            .when(invalid_rating, "Invalid Rating"),
        )
        .withColumn("quarantine_ts", current_timestamp())
    )

@dlt.table(name=f"{silver_schema}.users_quarantine", comment="Quarantine table for problematic Users data")
def users_quarantine():
    """Collect Users rows from ``{bronze_schema}.users_bronze`` whose
    ``User-ID`` is NULL, tagged with a fixed reason and a quarantine timestamp."""
    users_bronze = spark.table(f"{bronze_schema}.users_bronze")
    without_user_id = users_bronze.where(col("User-ID").isNull())

    tagged = without_user_id.withColumn("quarantine_reason", lit("Missing User ID"))
    return tagged.withColumn("quarantine_ts", current_timestamp())