# Connecting to the Storage Account and Mounting Locally

In [0]:
# List the available secret scopes to confirm the scope name used in the next cell.
dbutils.secrets.listScopes()

[SecretScope(name='abcretail_secretScope')]

In [0]:
# Name of the secret scope from which the credentials below are read.
secret_scope = "abcretail_secretScope"

In [0]:
# List the keys stored in the scope (metadata only).
dbutils.secrets.list(secret_scope)

[SecretMetadata(key='azure-sql-server-password'),
 SecretMetadata(key='client-id'),
 SecretMetadata(key='client-secret'),
 SecretMetadata(key='databricks-token'),
 SecretMetadata(key='directory-id'),
 SecretMetadata(key='onprem-sqlserver-password')]

In [0]:
# Read the three values used to build the OAuth mount configuration from the secret scope.
client_id = dbutils.secrets.get(scope=secret_scope, key="client-id")
client_secret = dbutils.secrets.get(scope=secret_scope, key="client-secret")
directory_id = dbutils.secrets.get(scope=secret_scope, key="directory-id")

In [0]:
# OAuth client-credentials settings; passed as `extra_configs` when mounting the containers.
oauth_token_endpoint = f"https://login.microsoftonline.com/{directory_id}/oauth2/token"

configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint": oauth_token_endpoint,
}

In [0]:
# Storage account and the two containers this notebook reads from / writes to.
storage_account = "dlsaabcretail"
container_source, container_target = "curated", "staging"

# Both containers share the same dfs endpoint and the same mount root.
dfs_host = f"{storage_account}.dfs.core.windows.net"
mount_root = f"/mnt/{storage_account}"

# abfss:// URLs of the containers.
source_link = f"abfss://{container_source}@{dfs_host}/"
target_link = f"abfss://{container_target}@{dfs_host}/"

# Mount points at which the containers are mounted.
mount_point_source = f"{mount_root}/{container_source}"
mount_point_target = f"{mount_root}/{container_target}"

## Mounting the Source Container: "curated"

In [0]:
# Remount the source container: if something is already mounted at the source
# mount point, unmount it first so the mount below is created with the current `configs`.
if any(mount.mountPoint == mount_point_source for mount in dbutils.fs.mounts()):
  dbutils.fs.unmount(mount_point_source)
  print(f"Unmount existing mount at {mount_point_source}")

# Mount the container. A failure is reported (with its cause) rather than raised,
# so check this cell's output before running the cells that read from the mount.
try:
  dbutils.fs.mount(
    source = source_link,
    mount_point = mount_point_source,
    extra_configs = configs
  )
  print(f"Mounted successfully at {mount_point_source}")
except Exception as e:
  # Report the exception type and message; the mount point alone does not say why it failed.
  print(f"Error mounting: {mount_point_source} ({type(e).__name__}: {e})")

/mnt/dlsaabcretail/curated has been unmounted.
Unmount existing mount at /mnt/dlsaabcretail/curated
Mounted successfully at /mnt/dlsaabcretail/curated


## Mounting the Target Container: "staging"

In [0]:
# Remount the target container: if something is already mounted at the target
# mount point, unmount it first so the mount below is created with the current `configs`.
if any(mount.mountPoint == mount_point_target for mount in dbutils.fs.mounts()):
  dbutils.fs.unmount(mount_point_target)
  print(f"Unmount existing mount at {mount_point_target}")

# Mount the container. A failure is reported (with its cause) rather than raised,
# so check this cell's output before running the cells that write to the mount.
try:
  dbutils.fs.mount(
    source = target_link,
    mount_point = mount_point_target,
    extra_configs = configs
  )
  print(f"Mounted successfully at {mount_point_target}")
except Exception as e:
  # Report the exception type and message; the mount point alone does not say why it failed.
  print(f"Error mounting: {mount_point_target} ({type(e).__name__}: {e})")

/mnt/dlsaabcretail/staging has been unmounted.
Unmount existing mount at /mnt/dlsaabcretail/staging
Mounted successfully at /mnt/dlsaabcretail/staging


# Importing Python Packages, Libraries, Functions and Methods

In [0]:
# from pyspark.sql.functions import current_timestamp, col, row_number
# from pyspark.sql.window import Window
# from pyspark.sql.types import IntegerType, StringType, DecimalType, BooleanType, DoubleType, BinaryType
from delta.tables import DeltaTable

# Creating File Lists

In [0]:
# Inspect the folders under delta/http in the mounted source container.
dbutils.fs.ls(f"{mount_point_source}/delta/http")

[FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/http/accessories/', name='accessories/', size=0, modificationTime=1732841601000),
 FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/http/clothing/', name='clothing/', size=0, modificationTime=1732841754000),
 FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/http/footwear/', name='footwear/', size=0, modificationTime=1732841869000),
 FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/http/home_decor/', name='home_decor/', size=0, modificationTime=1732841924000)]

In [0]:
def _http_parquet_paths(category_folder):
    """Return the paths of the entries under delta/http/<category_folder> whose name contains ".parquet"."""
    listing = dbutils.fs.ls(f"{mount_point_source}/delta/http/{category_folder}")
    return [entry.path for entry in listing if ".parquet" in entry.name]

# One list of parquet paths per HTTP category folder.
http_Accessories_files = _http_parquet_paths("accessories")
http_Clothing_files = _http_parquet_paths("clothing")
http_Footwear_files = _http_parquet_paths("footwear")
http_HomeDecor_files = _http_parquet_paths("home_decor")

In [0]:
# Inspect the folders under delta/azsqldb in the mounted source container.
dbutils.fs.ls(f"{mount_point_source}/delta/azsqldb")

[FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/azsqldb/address/', name='address/', size=0, modificationTime=1732842109000),
 FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/azsqldb/customer/', name='customer/', size=0, modificationTime=1732842196000),
 FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/azsqldb/customer_address/', name='customer_address/', size=0, modificationTime=1732842507000),
 FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/azsqldb/product/', name='product/', size=0, modificationTime=1732842598000),
 FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/azsqldb/product_category/', name='product_category/', size=0, modificationTime=1732842669000),
 FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/azsqldb/product_description/', name='product_description/', size=0, modificationTime=1732842752000),
 FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/azsqldb/product_model/', name='product_model/', size=0, modificationTime=1732842853000),
 FileInfo

In [0]:
def _azsqldb_parquet_paths(table_folder):
    """Return the paths of the entries under delta/azsqldb/<table_folder> whose name contains ".parquet"."""
    listing = dbutils.fs.ls(f"{mount_point_source}/delta/azsqldb/{table_folder}")
    return [entry.path for entry in listing if ".parquet" in entry.name]

# One list of parquet paths per azsqldb folder.
azsqldb_Address_files = _azsqldb_parquet_paths("address")
azsqldb_Customer_files = _azsqldb_parquet_paths("customer")
azsqldb_CustomerAddress_files = _azsqldb_parquet_paths("customer_address")
azsqldb_Product_files = _azsqldb_parquet_paths("product")
azsqldb_ProductCategory_files = _azsqldb_parquet_paths("product_category")
azsqldb_ProductDescription_files = _azsqldb_parquet_paths("product_description")
azsqldb_ProductModel_files = _azsqldb_parquet_paths("product_model")
azsqldb_ProductModelProductDescription_files = _azsqldb_parquet_paths("productmodel_productdescription")
azsqldb_SalesOrderDetail_files = _azsqldb_parquet_paths("salesorder_detail")
azsqldb_SalesOrderHeader_files = _azsqldb_parquet_paths("salesorder_header")

In [0]:
# Inspect the folders under delta/onprem in the mounted source container.
dbutils.fs.ls(f"{mount_point_source}/delta/onprem")

[FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/onprem/country_rolling/', name='country_rolling/', size=0, modificationTime=1732846892000),
 FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/onprem/customer/', name='customer/', size=0, modificationTime=1732846896000),
 FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/onprem/customer_productreview/', name='customer_productreview/', size=0, modificationTime=1732846900000),
 FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/onprem/customer_sellerreview/', name='customer_sellerreview/', size=0, modificationTime=1732846903000),
 FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/onprem/order/', name='order/', size=0, modificationTime=1732846907000),
 FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/onprem/product/', name='product/', size=0, modificationTime=1732846910000),
 FileInfo(path='dbfs:/mnt/dlsaabcretail/curated/delta/onprem/product_categories/', name='product_categories/', size=0, modificationTime=173284691300

In [0]:
def _onprem_parquet_paths(table_folder):
    """Return the paths of the entries under delta/onprem/<table_folder> whose name contains ".parquet"."""
    listing = dbutils.fs.ls(f"{mount_point_source}/delta/onprem/{table_folder}")
    return [entry.path for entry in listing if ".parquet" in entry.name]

# One list of parquet paths per onprem folder.
onprem_CountryRolling_files = _onprem_parquet_paths("country_rolling")
onprem_Customer_files = _onprem_parquet_paths("customer")
onprem_CustomerProductReview_files = _onprem_parquet_paths("customer_productreview")
onprem_CustomerSellerReview_files = _onprem_parquet_paths("customer_sellerreview")
onprem_Order_files = _onprem_parquet_paths("order")
onprem_Product_files = _onprem_parquet_paths("product")
onprem_ProductCategories_files = _onprem_parquet_paths("product_categories")
onprem_ProductQuality_files = _onprem_parquet_paths("product_quality")
onprem_Promotion_files = _onprem_parquet_paths("promotion")
onprem_Seller_files = _onprem_parquet_paths("seller")
onprem_SellerProductPromotion_files = _onprem_parquet_paths("seller_product_promotion")
onprem_StateProvinceRolling_files = _onprem_parquet_paths("state_province_rolling")

# ETL

## Reading and Merging All Four Softline Datasets From the HTTP API Into One Dataset

In [0]:
# Load the accessories data into its own frame. The result was previously bound to
# `df_http_clothing`, leaving `df_http_accessories` (shown below and used in the union) undefined.
# NOTE(review): the path list is unpacked positionally into load(); this presumably relies on
# the listing containing exactly one path — confirm.
df_http_accessories = spark.read.format("delta").load(*http_Accessories_files, header=True)
df_http_accessories.show(3)

+---------+-----------+-----------+--------------------+-----------------+------------+-----------------+------+--------------------+--------------------+
|ProductID|   Category|SubCategory|         ProductName|            Brand|       Sizes|           Colors| Price|         Description| ingestion_timestamp|
+---------+-----------+-----------+--------------------+-----------------+------------+-----------------+------+--------------------+--------------------+
|        4|Accessories|    Watches|Luxury Leather Watch|    Timepiece Co.|["One Size"]|["Brown","Black"]|149.99|A timeless leathe...|2024-11-29 02:19:...|
|        5|Accessories|    Jewelry|Silver Pendant Ne...|      ShinyThings|["One Size"]|       ["Silver"]| 39.99|A delicate silver...|2024-11-29 02:19:...|
|        6|Accessories|       Bags|    Leather Backpack|Traveler's Choice|  ["Medium"]|["Brown","Black"]| 89.99|A durable leather...|2024-11-29 02:19:...|
+---------+-----------+-----------+--------------------+--------------

In [0]:
# Load the clothing data and preview the first rows.
df_http_clothing = (
    spark.read
    .format("delta")
    .load(*http_Clothing_files, header=True)
)
df_http_clothing.show(3)

+---------+--------+-----------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+
|ProductID|Category|SubCategory|         ProductName|       Brand|             Sizes|              Colors|Price|         Description| ingestion_timestamp|
+---------+--------+-----------+--------------------+------------+------------------+--------------------+-----+--------------------+--------------------+
|        1|Clothing|   Menswear|    Slim Fit T-Shirt| FashionWear|["S","M","L","XL"]|["Red","Blue","Bl...|29.99|A stylish slim-fi...|2024-11-29 02:20:...|
|        2|Clothing| Womenswear| Floral Summer Dress|ElegantStyle|     ["S","M","L"]|["White","Yellow"...|49.99|A light and breez...|2024-11-29 02:20:...|
|        3|Clothing|  Outerwear|Classic Denim Jacket|   UrbanEdge|    ["M","L","XL"]|    ["Blue","Black"]|69.99|A classic denim j...|2024-11-29 02:20:...|
+---------+--------+-----------+--------------------+------------+----

In [0]:
# Load the footwear data into its own frame. The result was previously bound to
# `df_http_clothing`, leaving `df_http_footwear` (shown below and used in the union) undefined.
# NOTE(review): the path list is unpacked positionally into load(); this presumably relies on
# the listing containing exactly one path — confirm.
df_http_footwear = spark.read.format("delta").load(*http_Footwear_files, header=True)
df_http_footwear.show(3)

+---------+--------+-----------+-----------------+-----------+--------------------+--------------------+-----+--------------------+--------------------+
|ProductID|Category|SubCategory|      ProductName|      Brand|               Sizes|              Colors|Price|         Description| ingestion_timestamp|
+---------+--------+-----------+-----------------+-----------+--------------------+--------------------+-----+--------------------+--------------------+
|        7|Footwear|   Menswear|    Running Shoes| SpeedTrack|["8","9","10","11...|["White","Blue","...|79.99|Lightweight runni...|2024-11-29 02:20:...|
|        8|Footwear| Womenswear|High Heel Sandals|  StyleStep|["5","6","7","8",...|["Black","Red","G...|59.99|Elegant high-heel...|2024-11-29 02:20:...|
|        9|Footwear|     Unisex|  Casual Sneakers|ComfortWalk|["6","7","8","9",...|["Gray","Black","...|49.99|Comfortable sneak...|2024-11-29 02:20:...|
+---------+--------+-----------+-----------------+-----------+--------------------

In [0]:
# Load the home decor data into its own frame. The result was previously bound to
# `df_http_clothing`, which overwrote the clothing frame and left `df_http_home_decor`
# (shown below and used in the union) undefined.
# NOTE(review): the path list is unpacked positionally into load(); this presumably relies on
# the listing containing exactly one path — confirm.
df_http_home_decor = spark.read.format("delta").load(*http_HomeDecor_files, header=True)
df_http_home_decor.show(3)

+---------+----------+-----------+--------------------+--------------+------------------+-----------------+------+--------------------+--------------------+
|ProductID|  Category|SubCategory|         ProductName|         Brand|             Sizes|           Colors| Price|         Description| ingestion_timestamp|
+---------+----------+-----------+--------------------+--------------+------------------+-----------------+------+--------------------+--------------------+
|       10|Home Decor|   Lighting|   Modern Table Lamp|   BrightHomes|      ["One Size"]|["White","Black"]| 39.99|A sleek and moder...|2024-11-29 02:20:...|
|       11|Home Decor|   Wall Art|Abstract Canvas P...|Artistic Vibes|["Medium","Large"]|   ["Multicolor"]| 99.99|A vibrant abstrac...|2024-11-29 02:20:...|
|       12|Home Decor|  Furniture| Wooden Coffee Table|  Rustic Charm|      ["One Size"]|["Brown","Black"]|129.99|A stylish wooden ...|2024-11-29 02:20:...|
+---------+----------+-----------+--------------------+---

In [0]:
# Stack the four category frames into one product frame, ordered by ProductID.
df_http_products = (
    df_http_accessories
    .union(df_http_clothing)
    .union(df_http_footwear)
    .union(df_http_home_decor)
    .orderBy("ProductID")
)

df_http_products.show(3)

+---------+--------+-----------+-------------------+------------+------------------+--------------------+-----+--------------------+--------------------+
|ProductID|Category|SubCategory|        ProductName|       Brand|             Sizes|              Colors|Price|         Description| ingestion_timestamp|
+---------+--------+-----------+-------------------+------------+------------------+--------------------+-----+--------------------+--------------------+
|        1|Clothing|   Menswear|   Slim Fit T-Shirt| FashionWear|["S","M","L","XL"]|["Red","Blue","Bl...|29.99|A stylish slim-fi...|2024-11-29 00:55:...|
|        1|Clothing|   Menswear|   Slim Fit T-Shirt| FashionWear|["S","M","L","XL"]|["Red","Blue","Bl...|29.99|A stylish slim-fi...|2024-11-29 02:20:...|
|        2|Clothing| Womenswear|Floral Summer Dress|ElegantStyle|     ["S","M","L"]|["White","Yellow"...|49.99|A light and breez...|2024-11-29 02:20:...|
+---------+--------+-----------+-------------------+------------+-----------

## Loading the Merged Dataset onto the Target Container Delta Table

In [0]:
# For each source system: the relative Delta folder and its full path under the target mount.
delta_http_folder = "delta/http"
delta_http_path = f"{mount_point_target}/{delta_http_folder}"

delta_azsqldb_folder = "delta/azsqldb"
delta_azsqldb_path = f"{mount_point_target}/{delta_azsqldb_folder}"

delta_onprem_folder = "delta/onprem"
delta_onprem_path = f"{mount_point_target}/{delta_onprem_folder}"

In [0]:
# Write the merged product rows to the target container: create the Delta table on the
# first run, otherwise upsert into the existing table keyed on ProductID.
# NOTE(review): the source frame can hold several rows with the same ProductID; a merge with
# an update clause may reject that when the table already exists — confirm on a re-run.
products_delta_path = f"{delta_http_path}/products"

if not DeltaTable.isDeltaTable(spark, products_delta_path):
    # No Delta table at the path yet: write the frame out as the initial table.
    df_http_products.coalesce(1).write.format("delta").mode("overwrite").save(products_delta_path)
else:
    existing_data = DeltaTable.forPath(spark, products_delta_path)

    (
        existing_data.alias("existing")
        .merge(df_http_products.alias("new"), "existing.ProductID = new.ProductID")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

# Read the table back to preview what was stored.
spark.read.format("delta").load(products_delta_path).show(3)

+---------+--------+-----------+-------------------+------------+------------------+--------------------+-----+--------------------+--------------------+
|ProductID|Category|SubCategory|        ProductName|       Brand|             Sizes|              Colors|Price|         Description| ingestion_timestamp|
+---------+--------+-----------+-------------------+------------+------------------+--------------------+-----+--------------------+--------------------+
|        1|Clothing|   Menswear|   Slim Fit T-Shirt| FashionWear|["S","M","L","XL"]|["Red","Blue","Bl...|29.99|A stylish slim-fi...|2024-11-29 02:20:...|
|        1|Clothing|   Menswear|   Slim Fit T-Shirt| FashionWear|["S","M","L","XL"]|["Red","Blue","Bl...|29.99|A stylish slim-fi...|2024-11-29 00:55:...|
|        2|Clothing| Womenswear|Floral Summer Dress|ElegantStyle|     ["S","M","L"]|["White","Yellow"...|49.99|A light and breez...|2024-11-29 02:20:...|
+---------+--------+-----------+-------------------+------------+-----------