# **Bronze to Silver Transformation â€” FactSales**

## Introduction
This notebook loads the three Bronze Sales datasets (2015/2016/2017), combines them, applies data type conversions, derives new fields, performs data quality checks, and writes the FactSales table to the Silver layer in Delta format.

In [1]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

StatementMeta(, 7c34d3c3-fe6a-4ae8-bed9-e2465ab1f5b1, 3, Finished, Available, Finished)

In [2]:
# Load Bronze Data
df_sales_2015 = spark.read.format("csv").option("header", True).option("inferScheme", True).load("abfss://MonoWS@onelake.dfs.fabric.microsoft.com/MonoLH_Bronze.Lakehouse/Files/Raw/AdventureWorks_Sales_2015/AdventureWorks_Sales_2015.csv")
df_sales_2016 = spark.read.format("csv").option("header", True).option("inferScheme", True).load("abfss://MonoWS@onelake.dfs.fabric.microsoft.com/MonoLH_Bronze.Lakehouse/Files/Raw/AdventureWorks_Sales_2016/AdventureWorks_Sales_2016.csv")
df_sales_2017 = spark.read.format("csv").option("header", True).option("inferScheme", True).load("abfss://MonoWS@onelake.dfs.fabric.microsoft.com/MonoLH_Bronze.Lakehouse/Files/Raw/AdventureWorks_Sales_2017/AdventureWorks_Sales_2017.csv")

print("2015:", df_sales_2015.count())
print("2016:", df_sales_2016.count())
print("2017:", df_sales_2017.count())

StatementMeta(, 7c34d3c3-fe6a-4ae8-bed9-e2465ab1f5b1, 4, Finished, Available, Finished)

2015: 2630
2016: 23935
2017: 29481


In [3]:
# Combine Sales 2015, 2016 and 2017

df_sales_bronze = (
    df_sales_2015
        .unionByName(df_sales_2016)
        .unionByName(df_sales_2017)
)

df_sales_bronze.show(5)
df_sales_bronze.printSchema()
df_sales_bronze.count()

StatementMeta(, 7c34d3c3-fe6a-4ae8-bed9-e2465ab1f5b1, 5, Finished, Available, Finished)

+---------+----------+-----------+----------+-----------+------------+-------------+-------------+
|OrderDate| StockDate|OrderNumber|ProductKey|CustomerKey|TerritoryKey|OrderLineItem|OrderQuantity|
+---------+----------+-----------+----------+-----------+------------+-------------+-------------+
| 1/1/2015| 9/21/2001|    SO45080|       332|      14657|           1|            1|            1|
| 1/1/2015| 12/5/2001|    SO45079|       312|      29255|           4|            1|            1|
| 1/1/2015|10/29/2001|    SO45082|       350|      11455|           9|            1|            1|
| 1/1/2015|11/16/2001|    SO45081|       338|      26782|           6|            1|            1|
| 1/2/2015|12/15/2001|    SO45083|       312|      14947|          10|            1|            1|
+---------+----------+-----------+----------+-----------+------------+-------------+-------------+
only showing top 5 rows

root
 |-- OrderDate: string (nullable = true)
 |-- StockDate: string (nullable = tru

56046

In [4]:
df_sales_bronze.groupBy("OrderNumber", "OrderLineItem").count().filter(col("count") > 1).show()


StatementMeta(, 7c34d3c3-fe6a-4ae8-bed9-e2465ab1f5b1, 6, Finished, Available, Finished)

+-----------+-------------+-----+
|OrderNumber|OrderLineItem|count|
+-----------+-------------+-----+
+-----------+-------------+-----+



In [5]:
# Transform Fact Sales

df_sales_silver = (
    df_sales_bronze
    
        # --- Convert dates ---
        .withColumn("OrderDate", to_date(col("OrderDate"), "M/d/yyyy"))
        .withColumn("StockDate", to_date(col("StockDate"), "M/d/yyyy"))

        # --- Cast keys ---
        .withColumn("ProductKey", col("ProductKey").cast("int"))
        .withColumn("CustomerKey", col("CustomerKey").cast("int"))
        .withColumn("TerritoryKey", col("TerritoryKey").cast("int"))

        # --- Cast numeric fields ---
        .withColumn("OrderLineItem", col("OrderLineItem").cast("int"))
        .withColumn("OrderQuantity", col("OrderQuantity").cast("int"))
)


StatementMeta(, 7c34d3c3-fe6a-4ae8-bed9-e2465ab1f5b1, 7, Finished, Available, Finished)

In [6]:
# Preview dtype and data

df_sales_silver.show()
df_sales_silver.printSchema()

StatementMeta(, 7c34d3c3-fe6a-4ae8-bed9-e2465ab1f5b1, 8, Finished, Available, Finished)

+----------+----------+-----------+----------+-----------+------------+-------------+-------------+
| OrderDate| StockDate|OrderNumber|ProductKey|CustomerKey|TerritoryKey|OrderLineItem|OrderQuantity|
+----------+----------+-----------+----------+-----------+------------+-------------+-------------+
|2015-01-01|2001-09-21|    SO45080|       332|      14657|           1|            1|            1|
|2015-01-01|2001-12-05|    SO45079|       312|      29255|           4|            1|            1|
|2015-01-01|2001-10-29|    SO45082|       350|      11455|           9|            1|            1|
|2015-01-01|2001-11-16|    SO45081|       338|      26782|           6|            1|            1|
|2015-01-02|2001-12-15|    SO45083|       312|      14947|          10|            1|            1|
|2015-01-02|2001-10-12|    SO45084|       310|      29143|           4|            1|            1|
|2015-01-02|2001-12-18|    SO45086|       314|      18747|           9|            1|            1|


In [7]:
#Check null keys
df_sales_silver.filter(col("ProductKey").isNull()).count()
df_sales_silver.filter(col("CustomerKey").isNull()).count()
df_sales_silver.filter(col("TerritoryKey").isNull()).count()

#Check invalid dates
df_sales_silver.filter(col("OrderDate").isNull()).count()
df_sales_silver.filter(col("StockDate").isNull()).count()

StatementMeta(, 7c34d3c3-fe6a-4ae8-bed9-e2465ab1f5b1, 9, Finished, Available, Finished)

0

In [8]:
# Write FactSales Table to Silver Layer (Delta Format)

df_sales_silver.write\
    .mode("append")\
    .format("delta")\
    .saveAsTable("FactSales")

StatementMeta(, 7c34d3c3-fe6a-4ae8-bed9-e2465ab1f5b1, 10, Finished, Available, Finished)

In [9]:
# Verify if Silver write Succeeded
df_sales_silver_check = spark.read.table("FactSales")
df_sales_silver.show(5)
df_sales_silver.printSchema()

StatementMeta(, 7c34d3c3-fe6a-4ae8-bed9-e2465ab1f5b1, 11, Finished, Available, Finished)

+----------+----------+-----------+----------+-----------+------------+-------------+-------------+
| OrderDate| StockDate|OrderNumber|ProductKey|CustomerKey|TerritoryKey|OrderLineItem|OrderQuantity|
+----------+----------+-----------+----------+-----------+------------+-------------+-------------+
|2015-01-01|2001-09-21|    SO45080|       332|      14657|           1|            1|            1|
|2015-01-01|2001-12-05|    SO45079|       312|      29255|           4|            1|            1|
|2015-01-01|2001-10-29|    SO45082|       350|      11455|           9|            1|            1|
|2015-01-01|2001-11-16|    SO45081|       338|      26782|           6|            1|            1|
|2015-01-02|2001-12-15|    SO45083|       312|      14947|          10|            1|            1|
+----------+----------+-----------+----------+-----------+------------+-------------+-------------+
only showing top 5 rows

root
 |-- OrderDate: date (nullable = true)
 |-- StockDate: date (nullable 