# **Bronze to Silver Transformation â€” DimCustomer**

## Introduction
This notebook loads the AdventureWorks Customers dataset from the Bronze layer, performs data cleaning, standardization, and derives customer-level attributes. The resulting table is written to the Silver layer as a managed Delta table named `DimCustomer`.

In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

StatementMeta(, ea931cf2-8a02-484e-9924-ced42b6d850c, 3, Finished, Available, Finished)

In [2]:
# Load Bronze Data
df_customers_Bronze = \
    spark.read.format("csv")\
    .option("header", True)\
    .option("inferScheme", True)\
    .load("abfss://MonoWS@onelake.dfs.fabric.microsoft.com/MonoLH_Bronze.Lakehouse/Files/Raw/AdventureWorks_Customers/AdventureWorks_Customers.csv")


# Preview dtype and data
df_customers_Bronze.printSchema()
df_customers_Bronze.show()

StatementMeta(, ea931cf2-8a02-484e-9924-ced42b6d850c, 4, Finished, Available, Finished)

root
 |-- CustomerKey: string (nullable = true)
 |-- Prefix: string (nullable = true)
 |-- FirstName: string (nullable = true)
 |-- LastName: string (nullable = true)
 |-- BirthDate: string (nullable = true)
 |-- MaritalStatus: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- EmailAddress: string (nullable = true)
 |-- AnnualIncome: string (nullable = true)
 |-- TotalChildren: string (nullable = true)
 |-- EducationLevel: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- HomeOwner: string (nullable = true)

+-----------+------+---------+--------+---------+-------------+------+--------------------+------------+-------------+---------------+--------------+---------+
|CustomerKey|Prefix|FirstName|LastName|BirthDate|MaritalStatus|Gender|        EmailAddress|AnnualIncome|TotalChildren| EducationLevel|    Occupation|HomeOwner|
+-----------+------+---------+--------+---------+-------------+------+--------------------+------------+-------------+--------

In [3]:
# Data Quality Checks
from pyspark.sql.functions import *

df_customers_Bronze.groupBy("CustomerKey") \
    .count() \
    .filter(col("count") > 1) \
    .show()

df_any_null_or_blank = df_customers_Bronze.filter(
    " OR ".join(
        [f"({c} IS NULL OR {c} = '')" for c in df_customers_Bronze.columns]
    )
)

df_any_null_or_blank.show(truncate=False)


StatementMeta(, ea931cf2-8a02-484e-9924-ced42b6d850c, 5, Finished, Available, Finished)

+-----------+-----+
|CustomerKey|count|
+-----------+-----+
+-----------+-----+

+-----------+------+---------+---------+----------+-------------+------+-------------------------------+------------+-------------+-------------------+--------------+---------+
|CustomerKey|Prefix|FirstName|LastName |BirthDate |MaritalStatus|Gender|EmailAddress                   |AnnualIncome|TotalChildren|EducationLevel     |Occupation    |HomeOwner|
+-----------+------+---------+---------+----------+-------------+------+-------------------------------+------------+-------------+-------------------+--------------+---------+
|11025      |NULL  |ALEJANDRO|BECK     |12/23/1945|M            |NA    |alejandro45@adventure-works.com|$10,000     |2            |Partial High School|Clerical      |Y        |
|11035      |NULL  |WENDY    |DOMINGUEZ|2/24/1948 |M            |NA    |wendy12@adventure-works.com    |$10,000     |2            |Partial High School|Clerical      |Y        |
|11082      |NULL  |ANGELA   |BUTL

In [4]:
cols_to_check = ["Prefix", "MaritalStatus", "Gender", "TotalChildren", "EducationLevel", "Occupation","HomeOwner"]

for c in cols_to_check:
    print(f"\n===== {c} =====")
    df_customers_Bronze.groupBy(c).count().show()


StatementMeta(, ea931cf2-8a02-484e-9924-ced42b6d850c, 6, Finished, Available, Finished)


===== Prefix =====
+------+-----+
|Prefix|count|
+------+-----+
|   MS.| 2470|
|  NULL|  130|
|  MRS.| 6422|
|   MR.| 9126|
+------+-----+


===== MaritalStatus =====
+-------------+-----+
|MaritalStatus|count|
+-------------+-----+
|            M| 9817|
|            S| 8331|
+-------------+-----+


===== Gender =====
+------+-----+
|Gender|count|
+------+-----+
|     F| 8892|
|    NA|  130|
|     M| 9126|
+------+-----+


===== TotalChildren =====
+-------------+-----+
|TotalChildren|count|
+-------------+-----+
|            3| 2153|
|            0| 5080|
|            5| 1401|
|            1| 3552|
|            4| 2259|
|            2| 3703|
+-------------+-----+


===== EducationLevel =====
+-------------------+-----+
|     EducationLevel|count|
+-------------------+-----+
|        High School| 3241|
|Partial High School| 1555|
|    Graduate Degree| 3125|
|    Partial College| 4966|
|          Bachelors| 5261|
+-------------------+-----+


===== Occupation =====
+--------------+----

In [5]:
# Transform Customer Data

df_customers_Silver = (
    df_customers_Bronze
        # --- Cast columns to correct types ---
        .withColumn("CustomerKey", col("CustomerKey").cast("int"))
        .withColumn("TotalChildren", col("TotalChildren").cast("int"))
        .withColumn("BirthDate", to_date(col("BirthDate"), "M/d/yyyy"))
        .withColumn("EmailAddress", lower(col("EmailAddress")))
        
        .withColumn("AnnualIncome", regexp_replace(col("AnnualIncome"), "[$,]", ""))
        .withColumn("AnnualIncome", col("AnnualIncome").cast("double"))

        # --- Create Full Name column ---
        .withColumn("FullName", concat_ws(" ", col("FirstName"), col("LastName")))
                
        # --- Clean empty strings -> null ---
        .na.replace("", None)
        
        # --- Remove duplicates (keeping unique customer keys) ---
        .dropDuplicates(["CustomerKey"])
)

df_customers_Silver.show()
df_customers_Silver.printSchema()

StatementMeta(, ea931cf2-8a02-484e-9924-ced42b6d850c, 7, Finished, Available, Finished)

+-----------+------+---------+--------+----------+-------------+------+--------------------+------------+-------------+---------------+--------------+---------+-----------------+
|CustomerKey|Prefix|FirstName|LastName| BirthDate|MaritalStatus|Gender|        EmailAddress|AnnualIncome|TotalChildren| EducationLevel|    Occupation|HomeOwner|         FullName|
+-----------+------+---------+--------+----------+-------------+------+--------------------+------------+-------------+---------------+--------------+---------+-----------------+
|      11000|   MR.|      JON|    YANG|1966-04-08|            M|     M|jon24@adventure-w...|     90000.0|            2|      Bachelors|  Professional|        Y|         JON YANG|
|      11001|   MR.|   EUGENE|   HUANG|1965-05-14|            S|     M|eugene10@adventur...|     60000.0|            3|      Bachelors|  Professional|        N|     EUGENE HUANG|
|      11002|   MR.|    RUBEN|  TORRES|1965-08-12|            M|     M|ruben35@adventure...|     60000.0|

In [6]:
# Write DimCustomer Table to Silver Layer (Delta Format)

df_customers_Silver.write\
    .mode("append")\
    .format("delta")\
    .saveAsTable("DimCustomer")

StatementMeta(, ea931cf2-8a02-484e-9924-ced42b6d850c, 8, Finished, Available, Finished)

In [7]:
# Verify if Silver write Succeeded
df_dimcustomer_silver_check = spark.read.table("DimCustomer")
df_dimcustomer_silver_check.show(5)
df_dimcustomer_silver_check.printSchema()

StatementMeta(, ea931cf2-8a02-484e-9924-ced42b6d850c, 9, Finished, Available, Finished)

+-----------+------+---------+--------+----------+-------------+------+--------------------+------------+-------------+--------------+------------+---------+-----------------+
|CustomerKey|Prefix|FirstName|LastName| BirthDate|MaritalStatus|Gender|        EmailAddress|AnnualIncome|TotalChildren|EducationLevel|  Occupation|HomeOwner|         FullName|
+-----------+------+---------+--------+----------+-------------+------+--------------------+------------+-------------+--------------+------------+---------+-----------------+
|      11000|   MR.|      JON|    YANG|1966-04-08|            M|     M|jon24@adventure-w...|     90000.0|            2|     Bachelors|Professional|        Y|         JON YANG|
|      11001|   MR.|   EUGENE|   HUANG|1965-05-14|            S|     M|eugene10@adventur...|     60000.0|            3|     Bachelors|Professional|        N|     EUGENE HUANG|
|      11002|   MR.|    RUBEN|  TORRES|1965-08-12|            M|     M|ruben35@adventure...|     60000.0|            3| 