In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
%sql
-- Make atliq_project the active catalog so the schema-qualified names used in
-- later cells (staging.*, silver.*) resolve inside it.
USE CATALOG atliq_project;

Read product data from bronze layer

In [0]:
# Location of the raw product dimension file in the bronze container.
bronze_product_path = "abfss://bronze@adlsdbspractice.dfs.core.windows.net/atliqsale/dim_product.parquet"

# Load the Parquet file into a Spark DataFrame.
product_df = spark.read.parquet(bronze_product_path)

Deduplication

In [0]:
# Rows that agree on every one of these columns are treated as duplicates;
# only one of them is kept.
dedup_columns = ["product_code", "division", "segment", "category", "product", "variant"]
product_df = product_df.drop_duplicates(subset=dedup_columns)

Data quality check - records that fail validation are routed to the quarantine table

In [0]:
%sql
-- Make sure the two schemas written to by this notebook exist:
-- staging holds the clean and quarantined product tables, silver holds the SCD type 2 table.
CREATE SCHEMA IF NOT EXISTS staging;
CREATE SCHEMA IF NOT EXISTS silver;

In [0]:
%sql
-- Clean product table: receives the rows that pass the null check
-- (quarantine_check = 0) further down in this notebook.
CREATE TABLE IF NOT EXISTS staging.product
 (product_code STRING,
  division STRING,
  segment STRING,
  category STRING,
  product STRING,
  variant STRING)
USING DELTA;

-- Quarantine product table: same columns as staging.product; receives the rows
-- that fail the null check (quarantine_check = 1).
CREATE TABLE IF NOT EXISTS staging.quarantined_product
 (product_code STRING,
  division STRING,
  segment STRING,
  category STRING,
  product STRING,
  variant STRING)
USING DELTA;

In [0]:
# Flag violations: a row is marked "1" in quarantine_check when at least one of
# the columns below is NULL, and "0" otherwise. The flag is stored as a string.
required_columns = ["product_code", "division", "segment", "category", "product", "variant"]

# OR together one isNull() test per required column, in the order listed.
has_null = col(required_columns[0]).isNull()
for column_name in required_columns[1:]:
    has_null = has_null | col(column_name).isNull()

flag_product_df = product_df.withColumn(
    "quarantine_check",
    when(has_null, lit("1")).otherwise(lit("0")),
)

In [0]:
# Register the flagged DataFrame as a temporary view named "flag_product_df"
# so the %sql cells below can select from it.
flag_product_df.createOrReplaceTempView("flag_product_df")

In [0]:
%sql
-- Route each flagged row to exactly one staging table based on quarantine_check.

-- Bad records (at least one NULL column) are appended to the quarantine table,
-- so rejects from earlier runs remain available for review.
INSERT INTO staging.quarantined_product
SELECT product_code, division, segment, category, product, variant
FROM flag_product_df
WHERE quarantine_check = 1;

-- Good records REPLACE the contents of staging.product instead of being appended.
-- staging.product is the source of the SCD type 2 merge below; if it kept rows
-- from earlier runs, the same product_code would appear several times (including
-- outdated versions) and the merge would expire/insert versions incorrectly.
INSERT OVERWRITE TABLE staging.product
SELECT product_code, division, segment, category, product, variant
FROM flag_product_df
WHERE quarantine_check = 0;

num_affected_rows,num_inserted_rows
397,397


SCD type 2 for product

In [0]:
%sql
-- SCD type 2 table for products in the silver layer.
-- Besides the six product attributes it carries three bookkeeping columns,
-- maintained by the merge/insert cells that follow:
--   inserted_date : set to current_timestamp() when a version row is inserted
--   modified_date : '9999-12-31' while the version is current; set to
--                   current_timestamp() when the version is expired
--   is_current    : true for the active version, false for historical versions
CREATE TABLE IF NOT EXISTS silver.products (
        product_code STRING,
        division STRING,
        segment STRING,
        category STRING, 
        product STRING, 
        variant STRING,
        inserted_date TIMESTAMP,
        modified_date TIMESTAMP,
        is_current BOOLEAN
    ) USING DELTA;

In [0]:
%sql
-- SCD type 2, step 1 of 2.
-- Expire the current version of every product whose attributes changed in
-- staging, and insert product codes that have no row in silver at all.
MERGE INTO silver.products AS target
USING staging.product AS source
  ON target.product_code = source.product_code

-- Existing product: close the current version when any attribute differs.
WHEN MATCHED
  AND target.is_current = true
  AND (
       target.division <> source.division
    OR target.segment  <> source.segment
    OR target.category <> source.category
    OR target.product  <> source.product
    OR target.variant  <> source.variant
  )
THEN UPDATE SET
  target.is_current    = false,
  target.modified_date = current_timestamp()

-- Product code not present in silver: insert it as the current version.
WHEN NOT MATCHED THEN INSERT (
  product_code, division, segment, category, product, variant,
  inserted_date, modified_date, is_current
)
VALUES (
  source.product_code,
  source.division,
  source.segment,
  source.category,
  source.product,
  source.variant,
  current_timestamp(),  -- inserted_date: time this version was loaded
  '9999-12-31',         -- modified_date: far-future placeholder while the version is current
  true                  -- is_current
);

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
397,0,0,397


In [0]:
%sql
-- SCD type 2, step 2 of 2.
-- Insert a new current version for every staged product that has no current
-- row in silver. After step 1 those are exactly the products whose previous
-- version was just expired (brand-new products were already inserted by step 1).
--
-- The check is "no current row exists" rather than a join to historical rows:
-- a join on is_current = false yields one output row per historical version and
-- keeps matching on later runs, which would insert duplicate current rows even
-- when nothing changed.
INSERT INTO silver.products
(
  product_code,
  division,
  segment,
  category,
  product,
  variant,
  inserted_date,
  modified_date,
  is_current
)
SELECT
  source.product_code,
  source.division,
  source.segment,
  source.category,
  source.product,
  source.variant,
  -- inserted_date: time this version was loaded
  current_timestamp(),
  -- modified_date: far-future placeholder while the version is current
  '9999-12-31',
  -- is_current
  true
FROM staging.product AS source
WHERE NOT EXISTS (
  SELECT 1
  FROM silver.products AS target
  WHERE target.product_code = source.product_code
    AND target.is_current = true
);

num_affected_rows,num_inserted_rows
0,0
