In [0]:
# Import required libraries
from pyspark.sql import functions as F
from delta.tables import DeltaTable
from pyspark.sql.window import Window

In [0]:
%run /Workspace/Users/pavansaikandi@gmail.com/unified_pipeline/1_setup/02_config

In [0]:
# Sanity check: printing fails fast with a NameError if the schema names are not
# defined in this session (presumably they come from the config notebook run above).
layer_schemas = (gold_schema, silver_schema, bronze_schema)
print(*layer_schemas)

In [0]:
# Register the notebook's text widgets, one (name, default value, label) triple each.
for widget_name, widget_default, widget_label in (
    ("catalog", "fmcg", "Catalog"),
    ("data_source", "gross_price", "Data Source"),
):
    dbutils.widgets.text(widget_name, widget_default, widget_label)

In [0]:
# Resolve the widget values that parameterize this run.
catalog, data_source = (
    dbutils.widgets.get(widget_key) for widget_key in ("catalog", "data_source")
)

# Glob that matches every CSV file in the data source's folder of the bucket.
base_path = f"s3://vitality-nutrition/{data_source}/*.csv"
print(base_path)

# Bronze

In [0]:
# Load every CSV matched by base_path. The header row supplies the column names and
# Spark infers the column types from the data.
csv_reader = spark.read.format("csv").options(header=True, inferSchema=True)

df = (
    csv_reader.load(base_path)
    # Stamp each row with the time of this load.
    .withColumn("ingest_date", F.current_timestamp())
    # Keep all columns and append the source file's name and size from the file metadata.
    .select("*", "_metadata.file_name", "_metadata.file_size")
)

In [0]:
# Preview the first ten rows of the freshly loaded data.
preview_rows = df.limit(10)
display(preview_rows)

In [0]:
# Print the column names and types of the loaded data. The read above enables
# inferSchema, so these types were inferred by Spark rather than declared.
df.printSchema()

In [0]:
# Persist the raw data as a Delta table in the bronze schema, replacing any
# previous contents of that table.
bronze_table_name = f"{catalog}.{bronze_schema}.{data_source}"

(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable(bronze_table_name)
)

# Silver

In [0]:
# Read the bronze table back and preview ten rows to confirm what was written.
bronze_query = f"SELECT * FROM {catalog}.{bronze_schema}.{data_source}"
df_bronze = spark.sql(bronze_query)

display(df_bronze.limit(10))

In [0]:
# List the distinct raw month values to see which date formats need normalizing.
raw_month_values = df_bronze.select("month").distinct()
raw_month_values.show()

In [0]:
# Parse the month column with each supported date format, in priority order, and keep
# the first successful parse. A value matching none of the formats ends up null.
month_formats = ["yyyy/MM/dd", "dd/MM/yyyy", "yyyy-MM-dd", "dd-MM-yyyy"]
month_candidates = [
    F.try_to_date(F.col("month"), date_format) for date_format in month_formats
]

df_silver = df_bronze.withColumn("month", F.coalesce(*month_candidates))

In [0]:
# Show the distinct month values after parsing.
normalized_months = df_silver.select("month").distinct()
display(normalized_months)

In [0]:
# Clean gross_price:
#   * values that cast to DOUBLE are kept as their absolute value, so negative
#     prices become positive;
#   * values that cannot be cast (and nulls) are replaced with 0.0.
price_as_double = F.expr("try_cast(gross_price AS DOUBLE)")
cleaned_price = F.coalesce(F.abs(price_as_double), F.lit(0.0))

df_silver = df_silver.withColumn("gross_price", cleaned_price)

df_silver.show()

In [0]:
# Enrich the silver data with product_code by joining to the silver products table on
# product_id. This is an inner join, so rows whose product_id has no match in the
# products table are dropped.
#
# The products table is resolved from the catalog widget and the configured silver
# schema, like every other table in this notebook, instead of a hard-coded name.
products_table_name = f"{catalog}.{silver_schema}.products"
df_products = spark.table(products_table_name)

df_joined = df_silver.join(
    df_products.select("product_id", "product_code"),
    on="product_id",
    how="inner",
)

# Fix the column order of the silver table.
df_joined = df_joined.select(
    "product_id",
    "product_code",
    "month",
    "gross_price",
    "ingest_date",
    "file_name",
    "file_size",
)

df_joined.show(5)

In [0]:
# Persist the cleaned, enriched data as a Delta table in the silver schema,
# replacing any previous contents of that table.
silver_table_name = f"{catalog}.{silver_schema}.{data_source}"

(
    df_joined.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable(silver_table_name)
)

# Gold

In [0]:
# Build the gold table from the silver table, keeping only the columns needed downstream.
silver_query = f"SELECT * FROM {catalog}.{silver_schema}.{data_source};"
df_silver = spark.sql(silver_query)
df_gold = df_silver.select("product_code", "month", "gross_price")

# Overwrite the gold Delta table for this data source.
gold_table_name = f"{catalog}.{gold_schema}.vn_dim_{data_source}"
(
    df_gold.write
    .format("delta")
    .mode("overwrite")
    .option("delta.enableChangeDataFeed", "true")
    .saveAsTable(gold_table_name)
)

In [0]:
# Read back the gold table written by the previous cell. The name is built from the
# same catalog / schema / data source values used for the write, so this cell always
# reads the table this run produced, even when the widgets are not left at their defaults.
df_gold_price = spark.table(f"{catalog}.{gold_schema}.vn_dim_{data_source}")
df_gold_price.show(5)

In [0]:
# Pick one representative price per product per calendar year: the most recent
# non-zero price, falling back to a zero price only when the year has nothing else.
df_gold_latest_price = (
    df_gold_price
    # Extract year from month
    .withColumn("year", F.year("month"))

    # Flag zero prices (0 = non-zero, 1 = zero), so an ascending sort puts non-zero first
    .withColumn("is_zero", (F.col("gross_price") == 0).cast("int"))

    # Rank rows per product per year: non-zero prices first, then latest month first.
    # gross_price is the final tie-breaker so that several rows for the same month
    # always yield the same winner; without it row_number() picks one arbitrarily.
    .withColumn(
        "rnk",
        F.row_number().over(
            Window
            .partitionBy("product_code", "year")
            .orderBy(
                F.col("is_zero"),
                F.col("month").desc(),
                F.col("gross_price").desc(),
            )
        )
    )

    # Keep best row per product per year
    .filter(F.col("rnk") == 1)
)

In [0]:
# Inspect the selected rows, including the helper columns (year, is_zero, rnk),
# before the next cell reduces the frame to its final columns.
display(df_gold_latest_price)

In [0]:
# Reduce to the final columns in one projection:
#   product_code, price_inr (renamed from gross_price), year (cast to string).
df_gold_latest_price = df_gold_latest_price.select(
    "product_code",
    F.col("gross_price").alias("price_inr"),
    F.col("year").cast("string").alias("year"),
)

df_gold_latest_price.show(5)

In [0]:
# Merge the per-product, per-year prices into the parent gold dimension table.
#
# The source holds one row per (product_code, year), so the match key must include
# the year. Matching on product_code alone makes several source rows hit the same
# target row as soon as a product has more than one year, which Delta rejects, and
# it would also overwrite the year of an existing row. The null-safe comparison
# (<=>) lets rows whose year is null match each other instead of being re-inserted
# on every run.
#
# The parent table is resolved from the catalog widget and the configured gold schema,
# consistent with the other gold tables in this notebook.
parent_table_name = f"{catalog}.{gold_schema}.dim_gross_price"
delta_table = DeltaTable.forName(spark, parent_table_name)

(
    delta_table.alias("target")
    .merge(
        source=df_gold_latest_price.alias("source"),
        condition=(
            "target.product_code = source.product_code "
            "AND target.year <=> source.year"
        ),
    )
    # Same product and year: refresh the price only (the key columns already agree).
    .whenMatchedUpdate(
        set={
            "price_inr": "source.price_inr",
        }
    )
    # New product/year combination: insert the full row.
    .whenNotMatchedInsert(
        values={
            "product_code": "source.product_code",
            "price_inr": "source.price_inr",
            "year": "source.year",
        }
    )
    .execute()
)