In [0]:
# Default location of the raw e-commerce CSV file (the 2019-Oct extract).
source_path = "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv"

In [0]:
# Remove every widget left over from a previous run so the widget
# definitions in the next cell start from a clean slate.
# NOTE(review): Databricks documents that a widget should not be re-created in
# the same cell that removes it, so keep this call in its own cell.
dbutils.widgets.removeAll()

In [0]:
# Declare the notebook parameters as widgets.

# Text widget holding the source file path. The default reuses `source_path`
# from the first cell instead of repeating the literal, so the two values
# cannot drift apart.
dbutils.widgets.text("source_path", source_path, "Source File Path")

# Dropdown selecting the processing layer; defaults to "bronze".
dbutils.widgets.dropdown(
    "layer",
    "bronze",
    ["bronze", "silver", "gold"],
    "Select Processing Layer",
)

In [0]:
# Resolve the current widget values into plain variables for the cells below.
layer = dbutils.widgets.get("layer")          # selected processing layer
source = dbutils.widgets.get("source_path")   # path of the CSV file to ingest

In [0]:
# Echo the resolved parameters so the cell output records what this run used.
print(
    f"Job started for layer is: {layer}",
    f"The source file path is: {source}",
    sep="\n",
)

Job started for layer is: bronze
The source file path is: /Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv


In [0]:
from pyspark.sql import functions as F

def process_bronze():
    """Load the source CSV into the `bronze_events` Delta table.

    Reads the file at the notebook-level `source` path (first row as header,
    column types inferred), adds an `ingestion_ts` column holding the current
    timestamp, and overwrites the `bronze_events` table with the result.

    Returns:
        str: The literal "Bronze success".
    """
    print("Running bronze layer...")
    csv_rows = spark.read.csv(source, header=True, inferSchema=True)

    # Stamp each row with the load time before persisting it.
    stamped_rows = csv_rows.withColumn("ingestion_ts", F.current_timestamp())

    table_writer = stamped_rows.write.format("delta").mode("overwrite")
    table_writer.saveAsTable("bronze_events")
    return "Bronze success"

def process_silver():
    """Build the `silver_events` Delta table from `bronze_events`.

    Steps:
      1. Keep only rows whose `price` is greater than 0 (rows with a null
         `price` are dropped too, because the comparison is not true).
      2. Drop duplicate rows sharing the same (`user_session`, `event_time`).
      3. Add `product_name`: the last dot-separated segment of
         `category_code`, or "Other" when `category_code` is null.

    The result overwrites the `silver_events` table.

    Returns:
        str: The literal "Silver success".
    """
    print("Running silver layer...")
    bronze_df = spark.read.table("bronze_events")

    # F.split interprets its pattern as a regular expression, so the dot must
    # be escaped. An unescaped "." matches every character, which splits the
    # value into empty strings and makes the last element "" for every
    # non-null category_code.
    category_leaf = F.element_at(F.split(F.col("category_code"), r"\."), -1)

    # NOTE(review): the dedup key is (user_session, event_time); distinct rows
    # that share both values collapse into one. Confirm this is intended.
    silver_df = (
        bronze_df.filter(F.col("price") > 0)
        .dropDuplicates(["user_session", "event_time"])
        .withColumn("product_name", F.coalesce(category_leaf, F.lit("Other")))
    )
    silver_df.write.format("delta").mode("overwrite").saveAsTable("silver_events")
    return "Silver success"
def process_gold():
    """Aggregate `silver_events` into the `gold_events` Delta table.

    Sums `price` for each (`product_id`, `product_name`) pair, exposes the
    total as `revenue`, and overwrites the `gold_events` table.

    Returns:
        str: The literal "Gold success".
    """
    print("Running gold layer...")
    silver_events = spark.read.table("silver_events")

    # NOTE(review): every silver row contributes to the sum; nothing here
    # restricts the rows to completed sales. Confirm that is the intended
    # meaning of "revenue".
    revenue_total = F.sum("price").alias("revenue")
    per_product = silver_events.groupBy("product_id", "product_name").agg(revenue_total)

    table_writer = per_product.write.format("delta").mode("overwrite")
    table_writer.saveAsTable("gold_events")
    return "Gold success"


In [0]:
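# Job setup (created through the Jobs UI, not by code in this notebook):
#   Task 1: bronze_layer (notebook task)
#   Task 2: silver_layer (depends on Task 1)
#   Task 3: gold_layer (depends on Task 2)
#   Schedule: daily at 2 AM
# NOTE(review): the cells shown above only define process_bronze/process_silver/
# process_gold; make sure some cell calls the one matching the `layer` widget.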