In [0]:
#Check files in DBFS
spark.sql("USE retail_lakehouse")

dim_path = "dbfs:/tmp/retail/landing/dimensions/"
display(dbutils.fs.ls(dim_path))

path,name,size,modificationTime
dbfs:/tmp/retail/landing/dimensions/customers.csv,customers.csv,2883950,1768359149000
dbfs:/tmp/retail/landing/dimensions/products.csv,products.csv,101361,1768359135000
dbfs:/tmp/retail/landing/dimensions/stores.csv,stores.csv,1726,1768359134000


In [0]:
#Creating silver_stores
from pyspark.sql import functions as F

stores_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(dim_path + "stores.csv")
)

# standardize column names
stores_df = stores_df.select([F.col(c).alias(c.strip().lower()) for c in stores_df.columns])

display(stores_df.limit(10))

(
    stores_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("retail_lakehouse.silver_stores")
)

print("✅ Written: retail_lakehouse.silver_stores")

store_id,city,region,store_type
S_0001,Sligo,Connacht,Physical
S_0002,Waterford,Munster,Physical
S_0003,Athlone,Leinster,Physical
S_0004,Galway,Connacht,Physical
S_0005,Cork,Munster,Physical
S_0006,Dundalk,Leinster,Physical
S_0007,Galway,Connacht,Physical
S_0008,Limerick,Munster,Physical
S_0009,Dublin,Leinster,Physical
S_0010,Wexford,Leinster,Physical


✅ Written: retail_lakehouse.silver_stores


In [0]:
#Creating silver_products
products_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(dim_path + "products.csv")
)

products_df = products_df.select([F.col(c).alias(c.strip().lower()) for c in products_df.columns])

display(products_df.limit(10))

(
    products_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("retail_lakehouse.silver_products")
)

print("✅ Written: retail_lakehouse.silver_products")

product_id,product_name,category,brand,cost_price
P_000001,Electronics Item 000001,Electronics,Logitech,89.5
P_000002,Grocery Item 000002,Grocery,Dunnes,18.47
P_000003,Electronics Item 000003,Electronics,Xiaomi,89.4
P_000004,Books Item 000004,Books,Penguin,71.68
P_000005,Electronics Item 000005,Electronics,Logitech,13.06
P_000006,Grocery Item 000006,Grocery,Nestle,73.04
P_000007,Books Item 000007,Books,HarperCollins,86.49
P_000008,Books Item 000008,Books,Pearson,28.01
P_000009,Books Item 000009,Books,O'Reilly,97.51
P_000010,Electronics Item 000010,Electronics,Samsung,84.38


✅ Written: retail_lakehouse.silver_products


In [0]:
#Creating silver_customers
customers_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(dim_path + "customers.csv")
)

customers_df = customers_df.select([F.col(c).alias(c.strip().lower()) for c in customers_df.columns])

(
    customers_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("retail_lakehouse.silver_customers")
)

print("✅ Written: retail_lakehouse.silver_customers")

✅ Written: retail_lakehouse.silver_customers


In [0]:
#Validating
print("silver_stores:", spark.table("retail_lakehouse.silver_stores").count())
print("silver_products:", spark.table("retail_lakehouse.silver_products").count())

silver_stores: 51
silver_products: 2000
