In [0]:
# Reset silver_customers so the cells below rebuild it from scratch.
# Drop the metastore entry first: deleting only the files would leave a
# registered table pointing at a missing directory, and the following
# CREATE TABLE IF NOT EXISTS would then be a no-op against that broken table.
# The name is fully qualified because USE global_retail_silver runs in a later cell.
spark.sql("DROP TABLE IF EXISTS global_retail_silver.silver_customers")

# Remove any files still left at the table location. Kept as the last
# expression so the cell still displays the boolean returned by rm().
dbutils.fs.rm("dbfs:/user/hive/warehouse/global_retail_silver.db/silver_customers", recurse=True)

Out[1]: True

In [0]:
# Make the silver database current so the unqualified table names used in
# this notebook resolve against it.
spark.sql("USE global_retail_silver")

# Create the silver customer table on first run (no-op when it already exists).
# Columns: the customer attributes carried over from bronze, plus the derived
# customer_segment / days_since_registration and the last_updated load timestamp.
# The format is pinned to Delta explicitly instead of relying on the workspace
# default, because the MERGE INTO later in this notebook targets this table.
spark.sql("""
    CREATE TABLE IF NOT EXISTS silver_customers (
        customer_id STRING,
        name STRING,
        email STRING,
        country STRING,
        customer_type STRING,
        registration_date DATE,
        age INT,
        gender STRING,
        total_purchases INT,
        customer_segment STRING,
        days_since_registration INT,
        last_updated TIMESTAMP
    )
    USING DELTA
""")

Out[2]: DataFrame[]

In [0]:
# Look up the high-water mark of the silver layer: the most recent
# last_updated value currently stored in silver_customers.
last_processed_df = spark.sql("SELECT MAX(last_updated) as last_processed FROM silver_customers")
watermark_rows = last_processed_df.collect()
latest_silver_update = watermark_rows[0]['last_processed']

# When the aggregate comes back as None (nothing loaded into silver yet),
# fall back to a date far in the past so the incremental filter built from
# this value lets every bronze row through.
last_processed_timestamp = (
    "1900-01-01T00:00:00.000+00:00"
    if latest_silver_update is None
    else latest_silver_update
)

In [0]:
# Build the statement that exposes only the bronze customer rows ingested
# after the silver watermark (last_processed_timestamp) as a temporary view.
bronze_incremental_sql = f"""
CREATE OR REPLACE TEMPORARY VIEW bronze_incremental AS
SELECT *
FROM global_retail_bronze.bronze_customer c where  c.ingestion_timestamp > '{last_processed_timestamp}'
"""

# Register (or replace) the view for the current Spark session.
spark.sql(bronze_incremental_sql)

Out[4]: DataFrame[]

In [0]:
# Sanity check: print a sample of the bronze rows picked up for this run.
bronze_preview_df = spark.sql("select * from bronze_incremental")
bronze_preview_df.show()

+-----------+-----------+--------------------+---------+-------------+-----------------+---+------+---------------+--------------------+
|customer_id|       name|               email|  country|customer_type|registration_date|age|gender|total_purchases| ingestion_timestamp|
+-----------+-----------+--------------------+---------+-------------+-----------------+---+------+---------------+--------------------+
|          1| Customer 1|customer1@example...|Australia|      Regular|       2011-05-15| 22|  Male|            191|2025-04-29 11:49:...|
|          2| Customer 2|customer2@example...|   France|      Premium|       2018-11-27| 52| Other|            145|2025-04-29 11:49:...|
|          3| Customer 3|customer3@example...|   Canada|      Premium|       2015-10-01| 32| Other|            691|2025-04-29 11:49:...|
|          4| Customer 4|customer4@example...|      USA|      Premium|       2011-01-19| 70| Other|            644|2025-04-29 11:49:...|
|          5| Customer 5|customer5@exampl

In [0]:
# Keep only records whose email is not null
# Keep only records with a valid age, between 18 and 100 (inclusive)
# Derive customer_segment: total_purchases > 700 THEN 'High Value', > 400 THEN 'Mid Value', ELSE 'Low Value'
# Derive days_since_registration: number of days since the customer registered in the system
# Remove any junk records where total_purchases is a negative number


In [0]:
%sql
-- Silver-layer shape of the incremental bronze batch.
-- Passes the customer attributes through, adds the derived columns
-- customer_segment and days_since_registration, stamps each row with the
-- processing time, and drops rows that fail the data-quality rules.
CREATE OR REPLACE TEMPORARY VIEW silver_incremental AS
SELECT
  customer_id,
  name,
  email,
  country,
  customer_type,
  registration_date,
  age,
  gender,
  total_purchases,
  -- Spend tier derived from total_purchases; the first matching branch wins.
  CASE
    WHEN total_purchases > 700 THEN 'High Value'
    WHEN total_purchases > 400 THEN 'Mid Value'
    ELSE 'Low Value'
  END AS customer_segment,
  -- Whole days between the registration date and today.
  date_diff(current_date(), registration_date) AS days_since_registration,
  -- Processing timestamp; the watermark query reads MAX(last_updated).
  current_timestamp() AS last_updated
FROM bronze_incremental
WHERE total_purchases >= 0           -- discard negative purchase counts
  AND age >= 18 AND age <= 100       -- plausible customer age, inclusive
  AND email IS NOT NULL              -- an email address is required

In [0]:
%sql
-- Preview up to 10 transformed rows before merging
-- (no ORDER BY, so which rows are returned is not guaranteed).
SELECT *
FROM silver_incremental
LIMIT 10

customer_id,name,email,country,customer_type,registration_date,age,gender,total_purchases,customer_segment,days_since_registration,last_updated
1,Customer 1,customer1@example.com,Australia,Regular,2011-05-15,22,Male,191,Low Value,5098,2025-04-29T13:54:48.460+0000
2,Customer 2,customer2@example.com,France,Premium,2018-11-27,52,Other,145,Low Value,2345,2025-04-29T13:54:48.460+0000
3,Customer 3,customer3@example.com,Canada,Premium,2015-10-01,32,Other,691,Mid Value,3498,2025-04-29T13:54:48.460+0000
4,Customer 4,customer4@example.com,USA,Premium,2011-01-19,70,Other,644,Mid Value,5214,2025-04-29T13:54:48.460+0000
5,Customer 5,customer5@example.com,Germany,Regular,2021-08-26,66,Other,508,Mid Value,1342,2025-04-29T13:54:48.460+0000
6,Customer 6,customer6@example.com,France,Premium,2015-03-02,20,Male,704,High Value,3711,2025-04-29T13:54:48.460+0000
7,Customer 7,customer7@example.com,China,Premium,2018-05-24,24,Female,892,High Value,2532,2025-04-29T13:54:48.460+0000
8,Customer 8,customer8@example.com,China,Regular,2023-10-02,26,Male,488,Mid Value,575,2025-04-29T13:54:48.460+0000
9,Customer 9,customer9@example.com,Japan,Premium,2014-10-05,36,Other,30,Low Value,3859,2025-04-29T13:54:48.460+0000
10,Customer 10,customer10@example.com,Brazil,Premium,2017-08-30,30,Male,959,High Value,2799,2025-04-29T13:54:48.460+0000


In [0]:
%sql
-- Upsert the incremental batch into silver_customers, keyed on customer_id:
-- a customer already present has all columns overwritten from the batch,
-- a customer not yet present is inserted as a new row.
MERGE INTO silver_customers AS tgt
USING silver_incremental AS src
  ON tgt.customer_id = src.customer_id
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED THEN INSERT *

num_affected_rows,num_updated_rows,num_deleted_rows,num_inserted_rows
930,0,0,930


In [0]:
%sql
-- Spot-check the merged table: up to 10 rows of silver_customers
-- (no ORDER BY, so which rows are returned is not guaranteed).
SELECT *
FROM silver_customers
LIMIT 10

customer_id,name,email,country,customer_type,registration_date,age,gender,total_purchases,customer_segment,days_since_registration,last_updated
1,Customer 1,customer1@example.com,Australia,Regular,2011-05-15,22,Male,191,Low Value,5098,2025-04-29T11:53:03.563+0000
2,Customer 2,customer2@example.com,France,Premium,2018-11-27,52,Other,145,Low Value,2345,2025-04-29T11:53:03.563+0000
3,Customer 3,customer3@example.com,Canada,Premium,2015-10-01,32,Other,691,Mid Value,3498,2025-04-29T11:53:03.563+0000
4,Customer 4,customer4@example.com,USA,Premium,2011-01-19,70,Other,644,Mid Value,5214,2025-04-29T11:53:03.563+0000
5,Customer 5,customer5@example.com,Germany,Regular,2021-08-26,66,Other,508,Mid Value,1342,2025-04-29T11:53:03.563+0000
6,Customer 6,customer6@example.com,France,Premium,2015-03-02,20,Male,704,High Value,3711,2025-04-29T11:53:03.563+0000
7,Customer 7,customer7@example.com,China,Premium,2018-05-24,24,Female,892,High Value,2532,2025-04-29T11:53:03.563+0000
8,Customer 8,customer8@example.com,China,Regular,2023-10-02,26,Male,488,Mid Value,575,2025-04-29T11:53:03.563+0000
9,Customer 9,customer9@example.com,Japan,Premium,2014-10-05,36,Other,30,Low Value,3859,2025-04-29T11:53:03.563+0000
10,Customer 10,customer10@example.com,Brazil,Premium,2017-08-30,30,Male,959,High Value,2799,2025-04-29T11:53:03.563+0000
