In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
%sql
-- Make atliq_project the current catalog so the unqualified schema names used
-- below (staging, silver) resolve inside it.
USE CATALOG atliq_project;

In [0]:
# Location of the raw monthly sales fact file in the bronze container.
bronze_monthlysale_path = "abfss://bronze@adlsdbspractice.dfs.core.windows.net/atliqsale/fact_sales_monthly.parquet"

# Read the raw parquet file into a Spark DataFrame.
monthlysale_df = spark.read.parquet(bronze_monthlysale_path)

In [0]:
# Preview the raw monthly sales data as loaded from bronze.
display(monthlysale_df)

date,division,category,product_code,product,market,platform,channel,customer_code,customer_name,sold_quantity
2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Australia,Brick & Mortar,Direct,70008169,AltiQ Exclusive,81
2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Australia,Brick & Mortar,Retailer,90008165,Forward Stores,157
2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Australia,Brick & Mortar,Retailer,90008166,Sound,126
2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Australia,Brick & Mortar,Retailer,90008167,Electricalsocity,160
2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Australia,E-Commerce,Direct,70008170,Atliq e Store,120
2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Brazil,E-Commerce,Retailer,90027207,Amazon,9
2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Canada,Brick & Mortar,Direct,70023031,AltiQ Exclusive,9
2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Canada,Brick & Mortar,Retailer,90023022,Nomad Stores,24
2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Canada,Brick & Mortar,Retailer,90023025,Premium Stores,22
2017-09-01,N & S,External Solid State Drives,A6218160101,AQ Digit SSD,Canada,Brick & Mortar,Retailer,90023026,Relief,37


Cleaning date column

In [0]:
# Convert the "date" column of monthlysale_df to a date type, replacing the
# column in place (same column name, same DataFrame variable).
monthlysale_df = monthlysale_df.withColumn(
    "date",
    to_date("date"),
)

Deduplication

In [0]:
# Columns compared when looking for duplicates: two rows are duplicates only
# when they agree on every one of these columns.
dedup_columns = [
    "date",
    "division",
    "category",
    "product_code",
    "product",
    "market",
    "platform",
    "channel",
    "customer_code",
    "customer_name",
    "sold_quantity",
]

# Keep a single row per distinct combination of the columns above.
monthlysale_df = monthlysale_df.dropDuplicates(subset=dedup_columns)

Data quality check - records that fail the check are routed to the quarantine table

In [0]:
%sql
-- Ensure the two schemas used by this notebook exist. IF NOT EXISTS makes the
-- statements no-ops when the schemas are already present, so the cell can be re-run.
-- staging: holds the clean and quarantined monthly sale tables.
CREATE SCHEMA IF NOT EXISTS staging;
-- silver: holds the final monthly sales table.
CREATE SCHEMA IF NOT EXISTS silver;

In [0]:
%sql
-- Staging table for monthly sale records that pass the data quality check.
CREATE TABLE IF NOT EXISTS staging.monthlysale (
  date          DATE,
  division      STRING,
  category      STRING,
  product_code  STRING,
  product       STRING,
  market        STRING,
  platform      STRING,
  channel       STRING,
  customer_code STRING,
  customer_name STRING,
  sold_quantity INT
)
USING DELTA;

-- Quarantine table for monthly sale records that fail the data quality check.
-- Same columns and types as staging.monthlysale.
CREATE TABLE IF NOT EXISTS staging.quarantined_monthlysale (
  date          DATE,
  division      STRING,
  category      STRING,
  product_code  STRING,
  product       STRING,
  market        STRING,
  platform      STRING,
  channel       STRING,
  customer_code STRING,
  customer_name STRING,
  sold_quantity INT
)
USING DELTA;

In [0]:
# Flag violations: quarantine_check is "1" when ANY column of the record is NULL,
# otherwise "0". sold_quantity is part of the check so that rows missing the
# measure are quarantined too (previously it was the only column not checked,
# which let NULL quantities through to the clean tables).
# The flag is kept as a string ("1"/"0") so the column type seen by the SQL
# cells that read the temp view is unchanged.
required_columns = [
    "date",
    "division",
    "category",
    "product_code",
    "product",
    "market",
    "platform",
    "channel",
    "customer_code",
    "customer_name",
    "sold_quantity",
]

# OR together one isNull() test per required column.
has_null_value = col(required_columns[0]).isNull()
for column_name in required_columns[1:]:
    has_null_value = has_null_value | col(column_name).isNull()

flag_monthlysale_df = monthlysale_df.withColumn(
    "quarantine_check",
    when(has_null_value, lit("1")).otherwise(lit("0")),
)

In [0]:
# Register the flagged DataFrame as a temp view (replacing any existing view of
# the same name) so the %sql cells can query it as flag_monthlysale_df.
flag_monthlysale_df.createOrReplaceTempView("flag_monthlysale_df")

In [0]:
%sql
-- Route the flagged rows of the flag_monthlysale_df temp view into the staging tables.
--
-- INSERT OVERWRITE is used instead of INSERT INTO: the staging tables persist
-- between runs, so appending would add the same rows again on every re-run, and
-- the silver load (which reads all of staging.monthlysale) would pick up those
-- duplicates. With OVERWRITE each staging table holds exactly the current batch.
-- NOTE: this means quarantined rows from earlier runs are not retained here.
--
-- quarantine_check is a string flag, so it is compared against string literals.

-- bad records ('1') -> quarantine monthly sale table
INSERT OVERWRITE staging.quarantined_monthlysale
 SELECT date, division, category, product_code, product, market, platform, channel, customer_code, customer_name, sold_quantity
 FROM flag_monthlysale_df
 WHERE quarantine_check = '1';

-- good records ('0') -> clean monthly sale table
INSERT OVERWRITE staging.monthlysale
 SELECT date, division, category, product_code, product, market, platform, channel, customer_code, customer_name, sold_quantity
 FROM flag_monthlysale_df
 WHERE quarantine_check = '0';

num_affected_rows,num_inserted_rows
1425706,1425706


Write the clean monthly sales table into the silver layer

In [0]:
%sql
-- Silver-layer monthly sales table (Delta), physically partitioned by the date column.
-- Columns mirror staging.monthlysale, plus inserted_date for the load timestamp.
CREATE TABLE IF NOT EXISTS silver.monthlysales (
  date          DATE,
  division      STRING,
  category      STRING,
  product_code  STRING,
  product       STRING,
  market        STRING,
  platform      STRING,
  channel       STRING,
  customer_code STRING,
  customer_name STRING,
  sold_quantity INT,
  inserted_date TIMESTAMP   -- when the row was written to silver
)
USING DELTA
PARTITIONED BY (date);

In [0]:
%sql
-- Load clean staging rows into silver.monthlysales idempotently.
--
-- A plain INSERT INTO appended every staging row on every run, so re-running the
-- notebook duplicated the whole table. This insert-only MERGE adds a staging row
-- only when no identical row already exists in silver:
--   * the source is de-duplicated (SELECT DISTINCT) so repeated staging rows are
--     inserted at most once;
--   * <=> is null-safe equality, so a NULL in any column still matches an
--     existing NULL instead of being treated as a new row;
--   * inserted_date is set to the current timestamp only when a row is first
--     inserted; existing rows are left untouched.
MERGE INTO silver.monthlysales AS target
USING (
  SELECT DISTINCT
    date,
    division,
    category,
    product_code,
    product,
    market,
    platform,
    channel,
    customer_code,
    customer_name,
    sold_quantity
  FROM staging.monthlysale
) AS source
ON  target.date          <=> source.date
AND target.division      <=> source.division
AND target.category      <=> source.category
AND target.product_code  <=> source.product_code
AND target.product       <=> source.product
AND target.market        <=> source.market
AND target.platform      <=> source.platform
AND target.channel       <=> source.channel
AND target.customer_code <=> source.customer_code
AND target.customer_name <=> source.customer_name
AND target.sold_quantity <=> source.sold_quantity
WHEN NOT MATCHED THEN
  INSERT (
    date,
    division,
    category,
    product_code,
    product,
    market,
    platform,
    channel,
    customer_code,
    customer_name,
    sold_quantity,
    inserted_date
  )
  VALUES (
    source.date,
    source.division,
    source.category,
    source.product_code,
    source.product,
    source.market,
    source.platform,
    source.channel,
    source.customer_code,
    source.customer_name,
    source.sold_quantity,
    -- Set inserted date to the current timestamp
    current_timestamp()
  );

num_affected_rows,num_inserted_rows
1425706,1425706
