## Transformations for tables

In [0]:
# Build the list of table names from the folders found under the bronze
# SalesLT mount. Only the part of each entry name before the first '/' is
# kept, so a directory entry such as 'Address/' becomes 'Address'.
table_name = [
    entry.name.split('/')[0]
    for entry in dbutils.fs.ls('mnt/bronze/SalesLT/')
]
table_name

Out[1]: ['Address',
 'Customer',
 'CustomerAddress',
 'Product',
 'ProductCategory',
 'ProductDescription',
 'ProductModel',
 'ProductModelProductDescription',
 'SalesOrderDetail',
 'SalesOrderHeader']

In [0]:
# Preview a single bronze table (Address) to inspect its columns and values.
path = '/mnt/bronze/SalesLT/Address/Address.parquet'
df = spark.read.parquet(path)
display(df)

AddressID,AddressLine1,AddressLine2,City,StateProvince,CountryRegion,PostalCode,rowguid,ModifiedDate
9,8713 Yosemite Ct.,,Bothell,Washington,United States,98011,268af621-76d7-4c78-9441-144fd139821a,2006-07-01T00:00:00.000+0000
11,1318 Lasalle Street,,Bothell,Washington,United States,98011,981b3303-aca2-49c7-9a96-fb670785b269,2007-04-01T00:00:00.000+0000
25,9178 Jumping St.,,Dallas,Texas,United States,75201,c8df3bd9-48f0-4654-a8dd-14a67a84d3c6,2006-09-01T00:00:00.000+0000
28,9228 Via Del Sol,,Phoenix,Arizona,United States,85004,12ae5ee1-fc3e-468b-9b92-3b970b169774,2005-09-01T00:00:00.000+0000
32,26910 Indela Road,,Montreal,Quebec,Canada,H1Y 2H5,84a95f62-3ae8-4e7e-bbd5-5a6f00cd982d,2006-08-01T00:00:00.000+0000
185,2681 Eagle Peak,,Bellevue,Washington,United States,98004,7bccf442-2268-46cc-8472-14c44c14e98c,2006-09-01T00:00:00.000+0000
297,7943 Walnut Ave,,Renton,Washington,United States,98055,52410da4-2778-4b1d-a599-95746625ce6d,2006-08-01T00:00:00.000+0000
445,6388 Lake City Way,,Burnaby,British Columbia,Canada,V5A 3A6,53572f25-9133-4a8b-a065-102ff35416ee,2006-09-01T00:00:00.000+0000
446,52560 Free Street,,Toronto,Ontario,Canada,M4B 1V7,801a1dfc-5125-486b-aa84-ccbd2ec57ca4,2005-08-01T00:00:00.000+0000
447,22580 Free Street,,Toronto,Ontario,Canada,M4B 1V7,88cee379-dbb8-433b-b84e-a35e09435500,2006-08-01T00:00:00.000+0000


## Counting the number of NaN and NULL values for each column in each table

In [0]:
from pyspark.sql.functions import isnan, when, count, col

# Report, for every bronze table, how many missing values each column holds.
#
# NULLs are counted for every column. NaN is a floating-point value, so the
# isnan() check is applied only to float/double columns. The previous version
# applied isnan() to every type not in a hand-maintained exclusion list, which
# fails for any unlisted type that cannot be coerced to a number.
NAN_CAPABLE_TYPES = ("float", "double")

for i in table_name:
    path = '/mnt/bronze/SalesLT/' + i + '/' + i + '.parquet'
    df = spark.read.format('parquet').load(path)

    # Label the output: show() prints only the counts, not the table name.
    print(i)

    df.select(*[
        (
            count(when(isnan(c) | col(c).isNull(), c)) if t in NAN_CAPABLE_TYPES
            else count(when(col(c).isNull(), c))
        ).alias(c)
        for c, t in df.dtypes
    ]).show()

# Example of filling missing values with a placeholder (kept for reference):
#df = df.fillna({'AddressLine2': 'unknown'})

#df.show()

+---------+------------+------------+----+-------------+-------------+----------+-------+------------+
|AddressID|AddressLine1|AddressLine2|City|StateProvince|CountryRegion|PostalCode|rowguid|ModifiedDate|
+---------+------------+------------+----+-------------+-------------+----------+-------+------------+
|        0|           0|         439|   0|            0|            0|         0|      0|           0|
+---------+------------+------------+----+-------------+-------------+----------+-------+------------+

+----------+---------+-----+---------+----------+--------+------+-----------+-----------+------------+-----+------------+------------+-------+------------+
|CustomerID|NameStyle|Title|FirstName|MiddleName|LastName|Suffix|CompanyName|SalesPerson|EmailAddress|Phone|PasswordHash|PasswordSalt|rowguid|ModifiedDate|
+----------+---------+-----+---------+----------+--------+------+-----------+-----------+------------+-----+------------+------------+-------+------------+
|         0|    

## Setting standard values for empty fields based on type

In [0]:
from pyspark.sql.functions import from_utc_timestamp, date_format
from pyspark.sql.types import TimestampType

# Replacement values for missing fields, keyed by table name and then column.
# Placeholders in use: -1, 'NA' ('unknown' for Address.AddressLine2) and
# '9999-12-31' for the date columns.
# Tables without an entry here get the date parsing only.
fill_values_by_table = {
    'Address': {'AddressLine2': 'unknown'},
    'Customer': {'Title': 'NA', 'MiddleName': 'NA', 'Suffix': 'NA'},
    'Product': {'Color': 'NA', 'Size': -1, 'Weight': -1, 'SellEndDate': '9999-12-31', 'DiscontinuedDate': '9999-12-31'},
    'ProductCategory': {'ParentProductCategoryID': -1},
    'ProductModel': {'CatalogDescription': 'NA'},
    'SalesOrderHeader': {'CreditCardApprovalCode': -1, 'Comment': 'NA'},
}

# Read each bronze table, normalise its date columns, fill missing values and
# write the result to the silver layer as a Delta table (overwriting any
# previous output at that location).
for table in table_name:
    path = '/mnt/bronze/SalesLT/' + table + '/' + table + '.parquet'
    df = spark.read.format('parquet').load(path)

    # Reformat every column whose name contains "Date" or "date" as
    # 'yyyy-MM-dd'.
    # The loop variable must not be called `col`: that would rebind the
    # notebook-global `col` function imported from pyspark.sql.functions and
    # break any later cell that calls col(...).
    for column_name in df.columns:
        if "Date" in column_name or "date" in column_name:
            df = df.withColumn(
                column_name,
                date_format(
                    from_utc_timestamp(df[column_name].cast(TimestampType()), "UTC"),
                    "yyyy-MM-dd",
                ),
            )

    # Fill missing values for the tables that have replacements defined.
    if table in fill_values_by_table:
        df = df.fillna(fill_values_by_table[table])

    output_path = '/mnt/silver/SalesLT/' + table + '/'
    df.write.format('delta').mode("overwrite").save(output_path)
        

Hola
