In [0]:
# Import required libraries
from pyspark.sql import functions as F
from delta.tables import DeltaTable

In [0]:
%run /Workspace/Users/pavansaikandi@gmail.com/unified_pipeline/1_setup/02_config

In [0]:
# Confirm the schema names provided by the config notebook are in scope
schema_names = (gold_schema, silver_schema, bronze_schema)
print(*schema_names)

In [0]:
# Register the notebook's text widgets: (name, default value, label)
widget_specs = [
    ("catalog", "fmcg", "Catalog"),
    ("data_source", "products", "Data Source"),
]
for widget_name, default_value, widget_label in widget_specs:
    dbutils.widgets.text(widget_name, default_value, widget_label)

In [0]:
# Resolve the widget values. They are interpolated into the S3 path and into
# table names further down, so strip stray whitespace first.
catalog = dbutils.widgets.get("catalog").strip()
data_source = dbutils.widgets.get("data_source").strip()

# Fail fast on a blank widget instead of building a malformed path or table name
if not catalog or not data_source:
    raise ValueError("Widgets 'catalog' and 'data_source' must both be non-empty")

# Glob matching every CSV file under the data source's folder
base_path = f's3://vitality-nutrition/{data_source}/*.csv'
print(base_path)

# Bronze

In [0]:
# Load every CSV matched by base_path (header row, inferred column types),
# stamp each row with the load timestamp, and keep the source file's name
# and size from the hidden _metadata column
csv_reader = spark.read.format("csv").options(header=True, inferSchema=True)
df = (
    csv_reader.load(base_path)
    .withColumn("ingest_date", F.current_timestamp())
    .select("*", "_metadata.file_name", "_metadata.file_size")
)

In [0]:
# Preview the first ten ingested rows
bronze_preview = df.limit(10)
display(bronze_preview)

In [0]:
# Print the column names and data types of the ingested DataFrame
df.printSchema()

In [0]:
# Persist the raw rows as the Bronze Delta table, replacing any previous contents
bronze_table = f"{catalog}.{bronze_schema}.{data_source}"
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable(bronze_table)
)

# Silver

In [0]:
# Read the Bronze table back and preview its first ten rows
bronze_query = f"SELECT * FROM {catalog}.{bronze_schema}.{data_source}"
df_bronze = spark.sql(bronze_query)
display(df_bronze.limit(10))

In [0]:
# List the product_id values that occur more than once in Bronze
product_id_counts = df_bronze.groupBy("product_id").count()
df_duplicate = product_id_counts.filter("count > 1")
display(df_duplicate)

In [0]:
# Keep a single row per product_id and report the row count on either side
rows_before = df_bronze.count()
print('Rows before duplicates dropped: ', rows_before)

df_silver = df_bronze.dropDuplicates(['product_id'])

rows_after = df_silver.count()
print('Rows after duplicates dropped: ', rows_after)

In [0]:
# Inspect the distinct category values before fixing their title case
distinct_categories = df_silver.select("category").distinct()
distinct_categories.show()

In [0]:
# Title-case every category value. F.initcap returns NULL for NULL input,
# so no separate null guard is needed.
df_silver = df_silver.withColumn("category", F.initcap(F.col("category")))
                                 

In [0]:
# Sanity check: list the distinct category values as they stand now
current_categories = df_silver.select("category").distinct()
current_categories.show()

In [0]:
# Replace 'protien' -> 'protein' in both product_name and category.
# The match is case-insensitive; the replacement is always written as "Protein".
for column_name in ("product_name", "category"):
    corrected = F.regexp_replace(F.col(column_name), "(?i)Protien", "Protein")
    df_silver = df_silver.withColumn(column_name, corrected)

In [0]:
# Preview the first ten rows of the Silver DataFrame
silver_preview = df_silver.limit(10)
display(silver_preview)

In [0]:
# Standardizing product attributes to match the parent company data model:
# derive "division" from category; any category not listed becomes "Other"
division_by_category = {
    "Energy Bars": "Nutrition Bars",
    "Protein Bars": "Nutrition Bars",
    "Granola & Cereals": "Breakfast Foods",
    "Recovery Dairy": "Dairy & Recovery",
    "Healthy Snacks": "Healthy Snacks",
    "Electrolyte Mix": "Hydration & Electrolytes",
}

# Build the same WHEN ... WHEN ... chain, one branch per mapping entry (in order)
division_expr = None
for category_name, division_name in division_by_category.items():
    is_category = F.col("category") == category_name
    if division_expr is None:
        division_expr = F.when(is_category, division_name)
    else:
        division_expr = division_expr.when(is_category, division_name)

df_silver = df_silver.withColumn("division", division_expr.otherwise("Other"))

display(df_silver.limit(10))

In [0]:
# Copy the text inside the first pair of parentheses in product_name into "variant".
# NOTE(review): regexp_extract yields an empty string (not NULL) when nothing
# matches - confirm that is the intended value for products without a variant.
variant_pattern = r"\((.*?)\)"
variant_expr = F.regexp_extract(F.col("product_name"), variant_pattern, 1)
df_silver = df_silver.withColumn("variant", variant_expr)

display(df_silver.limit(10))

In [0]:
# 1. Deterministic product_code: SHA-256 of product_name, with a NULL name
#    replaced by "n/a" first so the hash is never NULL
product_name_or_default = F.coalesce(F.col("product_name"), F.lit("n/a"))
product_code_expr = F.sha2(product_name_or_default, 256)

# 2. Clean product_id: keep it (as a string) only when it consists solely of
#    digits; otherwise store the string fallback "999999"
product_id_text = F.col("product_id").cast("string")
is_all_digits = product_id_text.rlike("^[0-9]+$")
product_id_expr = F.when(is_all_digits, product_id_text).otherwise(F.lit("999999"))

df_silver = (
    df_silver
    .withColumn("product_code", product_code_expr)
    .withColumn("product_id", product_id_expr)
    # 3. Rename product_name to product
    .withColumnRenamed("product_name", "product")
)

In [0]:
# Keep only the Silver columns, in their final order
silver_columns = [
    "product_code",
    "division",
    "category",
    "product",
    "variant",
    "product_id",
    "ingest_date",
    "file_name",
    "file_size",
]
df_silver = df_silver.select(*silver_columns)

display(df_silver.limit(10))

In [0]:
# Persist the cleaned rows as the Silver Delta table, replacing any previous contents
silver_table = f"{catalog}.{silver_schema}.{data_source}"
silver_writer = (
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .option("delta.enableChangeDataFeed", "true")
)
silver_writer.saveAsTable(silver_table)

# Gold

In [0]:
# Project only the dimension columns and write them as a table in the Gold schema
gold_columns = ["product_code", "product_id", "division", "category", "product", "variant"]
df_gold = df_silver.select(*gold_columns)

gold_table = f"{catalog}.{gold_schema}.vn_dim_{data_source}"
(
    df_gold.write
    .format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable(gold_table)
)

In [0]:
# Merging the data source with Parent
# Table names are built from the same catalog / gold_schema / data_source values
# used when the Gold table was written, so the merge reads back exactly that
# table (the literals previously hard-coded here ignored the widget values).
# NOTE(review): assumes the parent dim_products table lives in the same catalog
# and Gold schema as the child table - confirm.
parent_table = f"{catalog}.{gold_schema}.dim_products"
child_table = f"{catalog}.{gold_schema}.vn_dim_{data_source}"

# Merge target: the parent company's product dimension
delta_table = DeltaTable.forName(spark, parent_table)

# Merge source: only the columns the merge reads from the child dimension
df_child_products = spark.table(child_table).select(
    "product_code", "division", "category", "product", "variant"
)

In [0]:
# Upsert the child products into the parent dimension, keyed on product_code:
# matched rows get their descriptive columns overwritten from the source,
# unmatched source rows are inserted.
attribute_columns = ["division", "category", "product", "variant"]
update_assignments = {name: f"source.{name}" for name in attribute_columns}
insert_assignments = {"product_code": "source.product_code", **update_assignments}

merge_builder = delta_table.alias("target").merge(
    source=df_child_products.alias("source"),
    condition="target.product_code = source.product_code",
)
(
    merge_builder
    .whenMatchedUpdate(set=update_assignments)
    .whenNotMatchedInsert(values=insert_assignments)
    .execute()
)