In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json

In [0]:
# Build the abfss:// root URI of each tier container. Nothing is mounted here;
# the storage paths are addressed directly.
tiers = ['bronze', 'silver', 'gold']
adls_path = {
    tier: f"abfss://{tier}@salesadls2.dfs.core.windows.net/"
    for tier in tiers
}
# Unpack one reference path per tier, in the same order as `tiers`.
bronze_adls, silver_adls, gold_adls = (adls_path[tier] for tier in tiers)
# List the table folders under SalesLT in the bronze tier.
dbutils.fs.ls(f"{bronze_adls}/SalesLT")

[FileInfo(path='abfss://bronze@salesadls2.dfs.core.windows.net/SalesLT/Address/', name='Address/', size=0, modificationTime=1746921052000),
 FileInfo(path='abfss://bronze@salesadls2.dfs.core.windows.net/SalesLT/Customer/', name='Customer/', size=0, modificationTime=1746921051000),
 FileInfo(path='abfss://bronze@salesadls2.dfs.core.windows.net/SalesLT/CustomerAddress/', name='CustomerAddress/', size=0, modificationTime=1746921054000),
 FileInfo(path='abfss://bronze@salesadls2.dfs.core.windows.net/SalesLT/Product/', name='Product/', size=0, modificationTime=1746921054000),
 FileInfo(path='abfss://bronze@salesadls2.dfs.core.windows.net/SalesLT/ProductCategory/', name='ProductCategory/', size=0, modificationTime=1746921050000),
 FileInfo(path='abfss://bronze@salesadls2.dfs.core.windows.net/SalesLT/ProductDescription/', name='ProductDescription/', size=0, modificationTime=1746921050000),
 FileInfo(path='abfss://bronze@salesadls2.dfs.core.windows.net/SalesLT/ProductModel/', name='ProductMode

In [0]:
# Load a single table (Address) from bronze to inspect its structure.
address_parquet_path = f"{bronze_adls}/SalesLT/Address/Address.parquet"
df = (
    spark.read
    .format('parquet')
    .load(address_parquet_path)
)
df.limit(20).display()

AddressID,AddressLine1,AddressLine2,City,StateProvince,CountryRegion,PostalCode,rowguid,ModifiedDate
9,8713 Yosemite Ct.,,Bothell,Washington,United States,98011,268af621-76d7-4c78-9441-144fd139821a,2006-07-01T00:00:00Z
11,1318 Lasalle Street,,Bothell,Washington,United States,98011,981b3303-aca2-49c7-9a96-fb670785b269,2007-04-01T00:00:00Z
25,9178 Jumping St.,,Dallas,Texas,United States,75201,c8df3bd9-48f0-4654-a8dd-14a67a84d3c6,2006-09-01T00:00:00Z
28,9228 Via Del Sol,,Phoenix,Arizona,United States,85004,12ae5ee1-fc3e-468b-9b92-3b970b169774,2005-09-01T00:00:00Z
32,26910 Indela Road,,Montreal,Quebec,Canada,H1Y 2H5,84a95f62-3ae8-4e7e-bbd5-5a6f00cd982d,2006-08-01T00:00:00Z
185,2681 Eagle Peak,,Bellevue,Washington,United States,98004,7bccf442-2268-46cc-8472-14c44c14e98c,2006-09-01T00:00:00Z
297,7943 Walnut Ave,,Renton,Washington,United States,98055,52410da4-2778-4b1d-a599-95746625ce6d,2006-08-01T00:00:00Z
445,6388 Lake City Way,,Burnaby,British Columbia,Canada,V5A 3A6,53572f25-9133-4a8b-a065-102ff35416ee,2006-09-01T00:00:00Z
446,52560 Free Street,,Toronto,Ontario,Canada,M4B 1V7,801a1dfc-5125-486b-aa84-ccbd2ec57ca4,2005-08-01T00:00:00Z
447,22580 Free Street,,Toronto,Ontario,Canada,M4B 1V7,88cee379-dbb8-433b-b84e-a35e09435500,2006-08-01T00:00:00Z


In [0]:
# Check the schema (column names, data types, nullability) of the sample table.
df.printSchema()

root
 |-- AddressID: integer (nullable = true)
 |-- AddressLine1: string (nullable = true)
 |-- AddressLine2: string (nullable = true)
 |-- City: string (nullable = true)
 |-- StateProvince: string (nullable = true)
 |-- CountryRegion: string (nullable = true)
 |-- PostalCode: string (nullable = true)
 |-- rowguid: string (nullable = true)
 |-- ModifiedDate: timestamp (nullable = true)



In [0]:
# ModifiedDate is a timestamp column, so it is displayed with a time component
# (e.g. 2006-07-01T00:00:00Z), which is noisy to read for a date-only value.
df.select('ModifiedDate').limit(20).display()

ModifiedDate
2006-07-01T00:00:00Z
2007-04-01T00:00:00Z
2006-09-01T00:00:00Z
2005-09-01T00:00:00Z
2006-08-01T00:00:00Z
2006-09-01T00:00:00Z
2006-08-01T00:00:00Z
2006-09-01T00:00:00Z
2005-08-01T00:00:00Z
2006-08-01T00:00:00Z


In [0]:
# Format the date column as 'yyyy-MM-dd'.
# In the other tables a date might be stored as a string or in another format,
# so to_date is applied first and the resulting date is then rendered as text.
modified_date_formatted = date_format(to_date('ModifiedDate'), 'yyyy-MM-dd')
df = df.withColumn('ModifiedDate', modified_date_formatted)
df.limit(20).display()

AddressID,AddressLine1,AddressLine2,City,StateProvince,CountryRegion,PostalCode,rowguid,ModifiedDate
9,8713 Yosemite Ct.,,Bothell,Washington,United States,98011,268af621-76d7-4c78-9441-144fd139821a,2006-07-01
11,1318 Lasalle Street,,Bothell,Washington,United States,98011,981b3303-aca2-49c7-9a96-fb670785b269,2007-04-01
25,9178 Jumping St.,,Dallas,Texas,United States,75201,c8df3bd9-48f0-4654-a8dd-14a67a84d3c6,2006-09-01
28,9228 Via Del Sol,,Phoenix,Arizona,United States,85004,12ae5ee1-fc3e-468b-9b92-3b970b169774,2005-09-01
32,26910 Indela Road,,Montreal,Quebec,Canada,H1Y 2H5,84a95f62-3ae8-4e7e-bbd5-5a6f00cd982d,2006-08-01
185,2681 Eagle Peak,,Bellevue,Washington,United States,98004,7bccf442-2268-46cc-8472-14c44c14e98c,2006-09-01
297,7943 Walnut Ave,,Renton,Washington,United States,98055,52410da4-2778-4b1d-a599-95746625ce6d,2006-08-01
445,6388 Lake City Way,,Burnaby,British Columbia,Canada,V5A 3A6,53572f25-9133-4a8b-a065-102ff35416ee,2006-09-01
446,52560 Free Street,,Toronto,Ontario,Canada,M4B 1V7,801a1dfc-5125-486b-aa84-ccbd2ec57ca4,2005-08-01
447,22580 Free Street,,Toronto,Ontario,Canada,M4B 1V7,88cee379-dbb8-433b-b84e-a35e09435500,2006-08-01


In [0]:
# Other tables may contain date columns as well, so collect the name of every
# table folder under bronze SalesLT. Each listing entry name looks like
# 'Address/'; only the part before the slash is kept.
table_names = [
    entry.name.split('/')[0]
    for entry in dbutils.fs.ls(f"{bronze_adls}/SalesLT")
]
table_names

['Address',
 'Customer',
 'CustomerAddress',
 'Product',
 'ProductCategory',
 'ProductDescription',
 'ProductModel',
 'ProductModelProductDescription',
 'SalesOrderDetail',
 'SalesOrderHeader']

In [0]:
# Inspect the current column names of the sample table before they are renamed.
df.columns

['AddressID',
 'AddressLine1',
 'AddressLine2',
 'City',
 'StateProvince',
 'CountryRegion',
 'PostalCode',
 'rowguid',
 'ModifiedDate']

In [0]:
# function definition to change the column names to snake_case
def rename_columns_to_snake_case(df):
    """Return ``df`` with every column name converted to snake_case.

    An underscore is inserted before each uppercase character that is not the
    first character and is not directly preceded by another uppercase
    character; every character is then lowercased. A run of capitals is
    therefore kept together: ``AddressID`` -> ``address_id`` and
    ``XMLData`` -> ``xmldata``.

    Args:
        df (DataFrame): The input DataFrame with columns to be renamed.

    Returns:
        DataFrame: A DataFrame with the same columns in the same order,
        carrying the snake_case names. ``df`` itself is returned when no
        name changes.

    Raises:
        ValueError: If two columns map to the same snake_case name.
    """

    def to_snake_case(name):
        pieces = []
        for i, char in enumerate(name):
            # Split only where an uppercase character follows a non-uppercase one.
            if char.isupper() and i > 0 and not name[i - 1].isupper():
                pieces.append('_')
            pieces.append(char.lower())
        return ''.join(pieces)

    old_names = list(df.columns)
    new_names = []
    seen = set()  # set membership keeps the duplicate check linear
    for old_name in old_names:
        new_name = to_snake_case(old_name)
        if new_name in seen:
            raise ValueError(f"Duplicate column name after renaming: '{new_name}'")
        seen.add(new_name)
        new_names.append(new_name)

    # Nothing to rename (also covers a DataFrame without columns).
    if new_names == old_names:
        return df

    # Rename all columns positionally in one step instead of chaining one
    # withColumnRenamed call per column.
    return df.toDF(*new_names)


In [0]:
# For every bronze table: load its parquet file, convert the column names to
# snake_case, render each date-named column as 'yyyy-MM-dd', and overwrite the
# corresponding Delta table in silver.
for table_name in table_names:
    source_path = f"{bronze_adls}/SalesLT/{table_name}/{table_name}.parquet"
    target_path = f"{silver_adls}/SalesLT/{table_name}"

    df = rename_columns_to_snake_case(
        spark.read.format('parquet').load(source_path)
    )

    # Columns are picked by name: anything containing 'Date' or 'date'.
    date_columns = [
        column_name
        for column_name in df.columns
        if any(marker in column_name for marker in ('Date', 'date'))
    ]
    for column_name in date_columns:
        df = df.withColumn(column_name, date_format(to_date(column_name), 'yyyy-MM-dd'))

    (
        df.write
        .format('delta')
        .mode('overwrite')
        .option("mergeSchema", "true")
        .save(target_path)
    )


In [0]:
# Check that the changes have been applied: after the loop, `df` still refers
# to the last table processed in the iteration.
df.limit(20).display()

sales_order_id,revision_number,order_date,due_date,ship_date,status,online_order_flag,sales_order_number,purchase_order_number,account_number,customer_id,ship_to_address_id,bill_to_address_id,ship_method,credit_card_approval_code,sub_total,tax_amt,freight,total_due,comment,rowguid,modified_date
71774,2,2008-06-01,2008-06-13,2008-06-08,5,False,SO71774,PO348186287,10-4020-000609,29847,1092,1092,CARGO TRANSPORT 5,,880.3484,70.4279,22.0087,972.785,,89e42cdc-8506-48a2-b89b-eb3e64e3554e,2008-06-08
71776,2,2008-06-01,2008-06-13,2008-06-08,5,False,SO71776,PO19952192051,10-4020-000106,30072,640,640,CARGO TRANSPORT 5,,78.81,6.3048,1.9703,87.0851,,8a3448c5-e677-4158-a29b-dd33069be0b0,2008-06-08
71780,2,2008-06-01,2008-06-13,2008-06-08,5,False,SO71780,PO19604173239,10-4020-000340,30113,653,653,CARGO TRANSPORT 5,,38418.6895,3073.4952,960.4672,42452.6519,,a47665d2-7ac9-4cf3-8a8b-2a3883554284,2008-06-08
71782,2,2008-06-01,2008-06-13,2008-06-08,5,False,SO71782,PO19372114749,10-4020-000582,29485,1086,1086,CARGO TRANSPORT 5,,39785.3304,3182.8264,994.6333,43962.7901,,f1be45a5-5c57-4a50-93c6-5f8be44cb7cb,2008-06-08
71783,2,2008-06-01,2008-06-13,2008-06-08,5,False,SO71783,PO19343113609,10-4020-000024,29957,992,992,CARGO TRANSPORT 5,,83858.4261,6708.6741,2096.4607,92663.5609,,7db2329e-6446-42a8-8915-9c8370b68ed8,2008-06-08
71784,2,2008-06-01,2008-06-13,2008-06-08,5,False,SO71784,PO19285135919,10-4020-000448,29736,659,659,CARGO TRANSPORT 5,,108561.8317,8684.9465,2714.0458,119960.824,,ca31f324-2c32-4f8d-95eb-596e7f343027,2008-06-08
71796,2,2008-06-01,2008-06-13,2008-06-08,5,False,SO71796,PO17052159664,10-4020-000420,29660,1058,1058,CARGO TRANSPORT 5,,57634.6342,4610.7707,1440.8659,63686.2708,,917ef5ba-f32d-4563-8588-66db0bcdc846,2008-06-08
71797,2,2008-06-01,2008-06-13,2008-06-08,5,False,SO71797,PO16501134889,10-4020-000142,29796,642,642,CARGO TRANSPORT 5,,78029.6898,6242.3752,1950.7422,86222.8072,,bb3fee84-c8bf-4dd2-bcca-675ab6a11c38,2008-06-08
71815,2,2008-06-01,2008-06-13,2008-06-08,5,False,SO71815,PO13021155785,10-4020-000276,30089,1034,1034,CARGO TRANSPORT 5,,1141.5782,91.3263,28.5395,1261.444,,2aa5f39b-1096-4a4b-b17b-f10504a397ce,2008-06-08
71816,2,2008-06-01,2008-06-13,2008-06-08,5,False,SO71816,PO12992180445,10-4020-000295,30027,1038,1038,CARGO TRANSPORT 5,,3398.1659,271.8533,84.9541,3754.9733,,e3c189e7-98de-4c40-b6c2-0d1d13f9bb33,2008-06-08


In [0]:
# Hand the three tier paths back to the caller of this notebook as a JSON string.
output_data = dict(
    bronze_adls=bronze_adls,
    silver_adls=silver_adls,
    gold_adls=gold_adls,
)
dbutils.notebook.exit(json.dumps(output_data))