In [0]:
from pyspark.sql.functions import *

# Names of the dimension views handled by this notebook (24 entries).
# NOTE(review): this list is not referenced by the cells visible here, which
# discover their inputs by listing `path_base` instead — confirm whether it is
# still needed or should drive the extraction.
view_names = [
    "vw_dm_DimBrand",
    "vw_dm_DimClassOfTrade",
    "vw_dm_DimClassOfTradeCustomer",
    "vw_dm_DimCompany",
    "vw_dm_DimControllingArea",
    "vw_dm_DimCurrency",
    "vw_dm_DimCustomer",
    "vw_dm_DimCustomerGroup",
    "vw_dm_DimCustomerGroup5",
    "vw_dm_DimDistributionChannel",
    "vw_dm_DimDivision",
    "vw_dm_DimMainProduct",
    "vw_dm_DimMaterial",
    "vw_dm_DimProductBrand",
    "vw_dm_DimProductBrandGroup",
    "vw_dm_DimProductCategory",
    "vw_dm_DimProductHierarchy",
    "vw_dm_DimProfitCategory",
    "vw_dm_DimProfitMap",
    "vw_dm_DimSalesGroup",
    "vw_dm_DimSalesman",
    "vw_dm_DimSalesOffice",
    "vw_dm_DimSalesOrganization",
    "vw_dm_DimTradingPartner"
]


In [0]:
# Root folder (DBFS mount) that holds the dimension source exports.
path_base = '/mnt/dw01-source/Dim'

# Display the name of every entry found directly under the source folder.
[entry.name for entry in dbutils.fs.ls(path_base)]

Out[2]: ['vw_dm_DimBrand.txt',
 'vw_dm_DimClassOfTrade.txt',
 'vw_dm_DimClassOfTradeCustomer.txt',
 'vw_dm_DimClassOfTradeCustomerGroup/',
 'vw_dm_DimCompany.txt',
 'vw_dm_DimControllingArea.txt',
 'vw_dm_DimCurrency.txt',
 'vw_dm_DimCustomer.txt',
 'vw_dm_DimCustomerGroup.txt',
 'vw_dm_DimCustomerGroup5.txt',
 'vw_dm_DimCustomerHierarchy/',
 'vw_dm_DimDistributionChannel.txt',
 'vw_dm_DimDivision.txt',
 'vw_dm_DimMainProduct.txt',
 'vw_dm_DimMaterial.txt',
 'vw_dm_DimProductBrand.txt',
 'vw_dm_DimProductBrandGroup.txt',
 'vw_dm_DimProductCategory.txt',
 'vw_dm_DimProductHierarchy.txt',
 'vw_dm_DimProfitCategory.txt',
 'vw_dm_DimProfitMap.txt',
 'vw_dm_DimSalesGroup.txt',
 'vw_dm_DimSalesOffice.txt',
 'vw_dm_DimSalesOfficeGroup/',
 'vw_dm_DimSalesOrganization.txt',
 'vw_dm_DimSalesman.txt',
 'vw_dm_DimTradingPartner.txt']

In [0]:
# Number of entries (files and folders) found directly under path_base.
len(dbutils.fs.ls(path_base))

Out[89]: 27


#### Here, the data is extracted from the source and stored in Bronze as a staging table. The source may be on-premises or hosted on another server that is far from Databricks in terms of network distance and latency. To limit the impact of that latency, the data is extracted incrementally into Bronze, which acts as a staging area until the data is transformed.

#### This extraction step is often performed on platforms that are not cloud-native, or on third-party software, which can make the data harder to access or process.


In [0]:
# Build the list of source files to extract: only the `.txt` entries found
# directly under `path_base`. Folders (whose paths end with '/') and any other
# entry are excluded because they are not required.
#
# The folder is listed exactly once and iterated directly, so the cell keeps
# working when files are added or removed (no hard-coded entry count) and does
# not re-list the mount on every iteration.
source_entries = dbutils.fs.ls(path_base)

path_source = []
for idx, entry in enumerate(source_entries):
    print(f'Process: {idx}')
    if entry.path.endswith('.txt'):
        path_source.append(entry.path)
print(path_source)

Process: 0
Process: 1
Process: 2
Process: 3
Process: 4
Process: 5
Process: 6
Process: 7
Process: 8
Process: 9
Process: 10
Process: 11
Process: 12
Process: 13
Process: 14
Process: 15
Process: 16
Process: 17
Process: 18
Process: 19
Process: 20
Process: 21
Process: 22
Process: 23
Process: 24
Process: 25
Process: 26
['dbfs:/mnt/dw01-source/Dim/vw_dm_DimBrand.txt', 'dbfs:/mnt/dw01-source/Dim/vw_dm_DimClassOfTrade.txt', 'dbfs:/mnt/dw01-source/Dim/vw_dm_DimClassOfTradeCustomer.txt', 'dbfs:/mnt/dw01-source/Dim/vw_dm_DimCompany.txt', 'dbfs:/mnt/dw01-source/Dim/vw_dm_DimControllingArea.txt', 'dbfs:/mnt/dw01-source/Dim/vw_dm_DimCurrency.txt', 'dbfs:/mnt/dw01-source/Dim/vw_dm_DimCustomer.txt', 'dbfs:/mnt/dw01-source/Dim/vw_dm_DimCustomerGroup.txt', 'dbfs:/mnt/dw01-source/Dim/vw_dm_DimCustomerGroup5.txt', 'dbfs:/mnt/dw01-source/Dim/vw_dm_DimDistributionChannel.txt', 'dbfs:/mnt/dw01-source/Dim/vw_dm_DimDivision.txt', 'dbfs:/mnt/dw01-source/Dim/vw_dm_DimMainProduct.txt', 'dbfs:/mnt/dw01-source/Dim/vw

In [0]:
#Variable
# Notebook parameters, exposed as Databricks text widgets (both default to "").
#   extract_date_fix : lower-bound date used when extract_criteria == 'full'
#                      (presumably 'yyyy-MM-dd' — TODO confirm the expected format).
#   extract_criteria : 'full' extracts from extract_date_fix; 'delta' or any other
#                      value (including empty) uses the rolling window below.
dbutils.widgets.text("extract_date_fix","")
extract_date_fix = dbutils.widgets.get("extract_date_fix")
dbutils.widgets.text("extract_criteria","")
extract_criteria = dbutils.widgets.get("extract_criteria")

#Constants
# Rolling lower bound: 5 days before the current date (a Spark Column expression).
extract_date_inx = date_sub(current_date(), 5)

#Variable (Conditioned)
if extract_criteria == 'full':
    # Fail fast on a missing date: comparing a date column against "" evaluates to
    # NULL for every row, so the downstream filter would keep nothing and each
    # Bronze table would be silently overwritten with zero rows.
    if not extract_date_fix.strip():
        raise ValueError(
            "extract_criteria is 'full' but extract_date_fix is empty; "
            "provide a start date (e.g. 2024-01-01)."
        )
    extract_date = extract_date_fix
else:
    # 'delta' and every unrecognised/empty criteria use the rolling window.
    extract_date = extract_date_inx

In [0]:

# Extract every source file listed in `path_source` and stage it in Bronze.
#
# For each file:
#   1. Extract - read the tab-separated file (with header) and keep only the rows
#      whose ModifiedDate (cast to date) is on or after `extract_date`.
#   2. Load    - overwrite the matching Bronze folder with the result as parquet
#      and show a 5-row preview.
#
# A failure on one file must not stop the others, so errors are caught per file.
# They are also collected and summarised at the end, because a single
# "Error ..." line is easy to miss in the long per-table output.
failed_paths = []  # (source path, stage, error message)

for source_path in path_source:

    #Extract
    try:
        cleaned_path = source_path.replace("dbfs:", "")
        print(f"cleaned_path : {cleaned_path}")
        df = spark.read.format("csv").option("sep", "\t").option('header',True).load(cleaned_path)
        df = df.filter(col("ModifiedDate").cast("date") >= extract_date)
        print(f"Reading from: {cleaned_path}")
    except Exception as e:
        print(f"Error extract  {source_path}: {str(e)}")
        failed_paths.append((source_path, "extract", str(e)))
        continue

    #Load
    try:
        path_destination = cleaned_path.replace('dw01-source','dw01bronze').replace('.txt','')
        df.write.format("parquet").mode("overwrite").save(path_destination)
        print(f"Saving to: {path_destination}")
        df.limit(5).show()
    except Exception as e:
        print(f"Error load {source_path}: {str(e)}")
        failed_paths.append((source_path, "load", str(e)))

# Summary of everything that went wrong, so a partially failed run is visible.
if failed_paths:
    print(f"{len(failed_paths)} of {len(path_source)} file(s) failed:")
    for failed_path, stage, message in failed_paths:
        print(f"  [{stage}] {failed_path}: {message}")



cleaned_path : /mnt/dw01-source/Dim/vw_dm_DimBrand.txt
Reading from: /mnt/dw01-source/Dim/vw_dm_DimBrand.txt
Saving to: /mnt/dw01bronze/Dim/vw_dm_DimBrand
+---------+----------------+--------------------+------+
|BrandCode|BrandDescription|        ModifiedDate|Source|
+---------+----------------+--------------------+------+
|     0000|          Others|2024-12-11 09:43:...|   SAP|
|     0001|    BJC Services|2024-12-11 09:43:...|   SAP|
|     0002| BJC Consumables|2024-12-11 09:43:...|   SAP|
|     0003|     BJC Premium|2024-12-11 09:43:...|   SAP|
|     0004|BJC Non valuated|2024-12-11 09:43:...|   SAP|
+---------+----------------+--------------------+------+

cleaned_path : /mnt/dw01-source/Dim/vw_dm_DimClassOfTrade.txt
Reading from: /mnt/dw01-source/Dim/vw_dm_DimClassOfTrade.txt
Saving to: /mnt/dw01bronze/Dim/vw_dm_DimClassOfTrade
+----------------+--------------------+--------------------+------+
|ClassofTradeCode|    ClassofTradeName|        ModifiedDate|Source|
+----------------+-

In [0]:
 # Transform: for every staged Bronze table, replace null values and the literal
 # string "NULL" with an empty string in every column, then write the result to
 # the sibling 'pre_tx_' folder as parquet (overwrite).
 #
 # Notes on this implementation:
 #   * `path_source` (the list of source files) is only read, never reassigned,
 #     so the cell can be re-run without corrupting the list.
 #   * All columns are cleaned in a single select and the table is written once
 #     per table, not once per column.
 for source_path in path_source:

     #Transform
     cleaned_path = source_path.replace("dbfs:", "")
     bronze_path = cleaned_path.replace('dw01-source','dw01bronze').replace('.txt','')
     path_destination = bronze_path.replace('vw_dm_','pre_tx_')
     df = spark.read.format("parquet").load(bronze_path)
     print(f"Load to df: {bronze_path}")

     #Transform: Clean null or NULL
     df = df.select([
         when((col(column_name) == "NULL") | col(column_name).isNull(), lit(""))
         .otherwise(col(column_name))
         .alias(column_name)
         for column_name in df.columns
     ])
     df.write.format("parquet").mode("overwrite").save(path_destination)
     print(f"write df to: {path_destination}")

Load to df: /mnt/dw01bronze/Dim/vw_dm_DimBrand
write df to: /mnt/dw01bronze/Dim/pre_tx_DimBrand
Load to df: /mnt/dw01bronze/Dim/vw_dm_DimClassOfTrade
write df to: /mnt/dw01bronze/Dim/pre_tx_DimClassOfTrade
Load to df: /mnt/dw01bronze/Dim/vw_dm_DimClassOfTradeCustomer
write df to: /mnt/dw01bronze/Dim/pre_tx_DimClassOfTradeCustomer
Load to df: /mnt/dw01bronze/Dim/vw_dm_DimCompany
write df to: /mnt/dw01bronze/Dim/pre_tx_DimCompany
Load to df: /mnt/dw01bronze/Dim/vw_dm_DimControllingArea
write df to: /mnt/dw01bronze/Dim/pre_tx_DimControllingArea
Load to df: /mnt/dw01bronze/Dim/vw_dm_DimCurrency
write df to: /mnt/dw01bronze/Dim/pre_tx_DimCurrency
Load to df: /mnt/dw01bronze/Dim/vw_dm_DimCustomer
write df to: /mnt/dw01bronze/Dim/pre_tx_DimCustomer
Load to df: /mnt/dw01bronze/Dim/vw_dm_DimCustomerGroup
write df to: /mnt/dw01bronze/Dim/pre_tx_DimCustomerGroup
Load to df: /mnt/dw01bronze/Dim/vw_dm_DimCustomerGroup5
write df to: /mnt/dw01bronze/Dim/pre_tx_DimCustomerGroup5
Load to df: /mnt/dw01b