In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
%sql
-- Make atliq_project the current catalog; the staging.* and silver.* names used in
-- the SQL cells of this notebook are not catalog-qualified.
USE CATALOG atliq_project;

Read the customer DataFrame from the bronze layer

In [0]:
# Load the customer dimension Parquet file from the bronze container.
bronze_customer_path = "abfss://bronze@adlsdbspractice.dfs.core.windows.net/atliqsale/dim_customer.parquet"
customer_df = spark.read.parquet(bronze_customer_path)

Clean the `customer` column

In [0]:
# Strip leading and trailing spaces from the customer name; the column is replaced in place.
trimmed_customer = trim(customer_df["customer"])
customer_df = customer_df.withColumn("customer", trimmed_customer)

Deduplicate

In [0]:
# Keep a single row for each distinct combination of these five columns.
dedup_columns = ["customer", "market", "platform", "channel", "customer_code"]
customer_df = customer_df.dropDuplicates(dedup_columns)

Data quality check - records that fail the check go to the quarantine table

In [0]:
%sql
-- Ensure the schemas used by this notebook exist: `staging` for the clean and
-- quarantined customer tables, `silver` for the SCD type 2 customer table.
-- IF NOT EXISTS leaves an already-existing schema untouched.
CREATE SCHEMA IF NOT EXISTS staging;
CREATE SCHEMA IF NOT EXISTS silver;

In [0]:
%sql
-- Staging table for customer rows that pass the data quality check.
CREATE TABLE IF NOT EXISTS staging.customer (
  customer_code STRING,
  customer      STRING,
  market        STRING,
  platform      STRING,
  channel       STRING
)
USING DELTA;

-- Staging table for customer rows that fail the check.
-- Same columns, in the same order, as staging.customer.
CREATE TABLE IF NOT EXISTS staging.quarantined_customer (
  customer_code STRING,
  customer      STRING,
  market        STRING,
  platform      STRING,
  channel       STRING
)
USING DELTA;

In [0]:
# Flag violations: quarantine_check is the string "1" when any of the required
# columns is NULL, and the string "0" otherwise.
required_columns = ["customer_code", "customer", "market", "platform", "channel"]

# OR the per-column NULL tests together, left to right.
has_missing_value = col(required_columns[0]).isNull()
for column_name in required_columns[1:]:
    has_missing_value = has_missing_value | col(column_name).isNull()

quarantine_flag = when(has_missing_value, lit("1")).otherwise(lit("0"))
flag_customer_df = customer_df.withColumn("quarantine_check", quarantine_flag)

In [0]:
# Register the flagged DataFrame as a temp view so the %sql cells can query it by name.
flag_customer_df.createOrReplaceTempView("flag_customer_df")

In [0]:
%sql
-- Route the flagged rows from the flag_customer_df temp view into the staging tables.
--
-- INSERT OVERWRITE is used instead of INSERT INTO so that each run replaces the
-- previous snapshot. Appending would add a second copy of every customer_code on
-- every re-run; the MERGE into silver.customers would then see several source rows
-- per target row.
--
-- quarantine_check is a STRING column ('1' = failed, '0' = passed), so it is
-- compared against string literals.

-- Bad records -> quarantine table
INSERT OVERWRITE TABLE staging.quarantined_customer
SELECT customer_code, customer, market, platform, channel
FROM flag_customer_df
WHERE quarantine_check = '1';

-- Good records -> clean customer table
INSERT OVERWRITE TABLE staging.customer
SELECT customer_code, customer, market, platform, channel
FROM flag_customer_df
WHERE quarantine_check = '0';

num_affected_rows,num_inserted_rows
209,209


SCD type 2 for customer 

In [0]:
%sql
-- Silver-layer customer table kept as SCD type 2: the customer attributes plus
-- the tracking columns inserted_date, modified_date and is_current.
CREATE TABLE IF NOT EXISTS silver.customers (
  customer_code STRING,
  customer      STRING,
  market        STRING,
  platform      STRING,
  channel       STRING,
  inserted_date TIMESTAMP,  -- set to the load timestamp when the row is written
  modified_date TIMESTAMP,  -- '9999-12-31' on insert; set to the load timestamp when the row is closed
  is_current    BOOLEAN     -- true on insert; false once the row has been closed
)
USING DELTA;

In [0]:
%sql
-- SCD type 2, step 1.
--   * A current silver row whose customer, market, platform or channel differs
--     from staging is closed: is_current = false, modified_date = now.
--   * A customer_code with no silver row at all is inserted as a new current row.
-- The replacement row for a changed customer is not written by this statement.
MERGE INTO silver.customers AS tgt
USING staging.customer AS src
  ON tgt.customer_code = src.customer_code
WHEN MATCHED
  AND tgt.is_current = true
  AND (
       tgt.customer <> src.customer
    OR tgt.market   <> src.market
    OR tgt.platform <> src.platform
    OR tgt.channel  <> src.channel
  )
THEN UPDATE SET
  tgt.is_current    = false,
  tgt.modified_date = current_timestamp()
WHEN NOT MATCHED THEN INSERT (
  customer_code,
  customer,
  market,
  platform,
  channel,
  inserted_date,
  modified_date,
  is_current
)
VALUES (
  src.customer_code,
  src.customer,
  src.market,
  src.platform,
  src.channel,
  current_timestamp(),  -- inserted_date: load time
  '9999-12-31',         -- modified_date: far-future placeholder for an open row
  true                  -- is_current: this is the live version
);

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
209,0,0,209


In [0]:
%sql
-- SCD type 2, step 2: write the new current version for customers that have no
-- current row in silver.customers.
--
-- The staging rows are filtered with NOT EXISTS on a current silver row rather
-- than joined to historical rows (is_current = false). A join to historical rows
-- matches every closed version ever written, so a customer that changed once
-- would get another "current" row on every later run, and one per historical
-- row, even when nothing changed. Requiring "no current row" selects only
-- customers that are missing a live version and makes the cell safe to re-run.
INSERT INTO silver.customers
(
  customer_code,
  customer,
  market,
  platform,
  channel,
  inserted_date,
  modified_date,
  is_current
)
SELECT
  source.customer_code,
  source.customer,
  source.market,
  source.platform,
  source.channel,
  -- inserted_date: load time
  current_timestamp(),
  -- modified_date: far-future placeholder for an open row
  '9999-12-31',
  -- is_current: this is the live version
  true
FROM staging.customer AS source
WHERE NOT EXISTS (
  SELECT 1
  FROM silver.customers AS target
  WHERE target.customer_code = source.customer_code
    AND target.is_current = true
);

num_affected_rows,num_inserted_rows
0,0
