# Silver Layer - Transformação de Endereços

In [0]:
%run ./00_Setup_Environment

In [0]:
%sql
/*SELECT *
FROM bronze.address
LIMIT 10*/

In [0]:
%sql
/*SELECT *
FROM bronze.customer_address
LIMIT 10*/

In [0]:
%sql
/*
SELECT AddressType, count(*)
FROM bronze.customer_address
GROUP BY AddressType*/

In [0]:
# Load the bronze address table; it is the only input of the transformation below.
df_address = spark.read.table("bronze.address")

In [0]:
def _build_silver_address(df_bronze):
    """Return the silver-layer address DataFrame derived from ``df_bronze``.

    ``df_bronze`` must expose the columns AddressID, AddressLine1,
    AddressLine2, City, StateProvince, CountryRegion, PostalCode and
    ModifiedDate.

    Output columns:
      * address_id
      * address_line1, address_line2, city, state_province, country_region:
        trimmed and title-cased
      * postal_code: trimmed
      * full_address: the non-empty cleaned parts joined with ", "
      * source_modified_date: ModifiedDate cast to timestamp
      * processed_timestamp: timestamp of this run
      * is_valid_address: True when AddressLine1 is non-empty after trimming
      * is_valid_postal_code: True when the trimmed postal code is 5 digits,
        optionally followed by "-" and 4 digits

    Exact duplicate rows are removed.
    """
    # Each cleaned expression is defined once so the output columns, the full
    # address and the validation flags are all derived from the same values.
    line1 = initcap(trim(col("a.AddressLine1")))
    line2 = initcap(trim(col("a.AddressLine2")))
    city = initcap(trim(col("a.City")))
    state_province = initcap(trim(col("a.StateProvince")))
    country_region = initcap(trim(col("a.CountryRegion")))
    postal_code = trim(col("a.PostalCode"))

    def non_blank(value):
        # NULL when the value is NULL or empty. concat_ws skips NULLs but
        # keeps empty strings, which would leave stray ", , " segments.
        return when(value != "", value)

    address_parts = [line1, line2, city, state_province, country_region, postal_code]

    return (
        df_bronze.alias("a")
        .select(
            # Identifiers
            col("a.AddressID").alias("address_id"),
            # Cleaned and standardized address fields
            line1.alias("address_line1"),
            line2.alias("address_line2"),
            city.alias("city"),
            state_province.alias("state_province"),
            country_region.alias("country_region"),
            postal_code.alias("postal_code"),
            # Full single-line address built from the non-empty parts only
            concat_ws(", ", *[non_blank(part) for part in address_parts]).alias("full_address"),
            # Metadata
            col("a.ModifiedDate").cast("timestamp").alias("source_modified_date"),
            current_timestamp().alias("processed_timestamp"),
            # Validations. when/otherwise maps a NULL comparison result to
            # False, so both flags are always True or False, never NULL.
            when(line1 != "", True).otherwise(False).alias("is_valid_address"),
            # Checked against the trimmed value so the flag agrees with the
            # stored postal_code. The pattern only accepts the 5-digit and
            # 5+4-digit formats; any other format is flagged False.
            when(postal_code.rlike(r"^[0-9]{5}(-[0-9]{4})?$"), True)
            .otherwise(False)
            .alias("is_valid_postal_code"),
        )
        .distinct()
    )


df_silver_address = _build_silver_address(df_address)


In [0]:
# Show the transformed DataFrame in the notebook output for a visual check
# before it is written. `display` is not defined in this notebook; presumably
# it is the notebook environment's built-in — confirm.
display(df_silver_address)

In [0]:
# Persist the silver address data in two places, as Delta files under
# silver_path and as the table `address` in adventureworks.silver, then log
# and print the number of rows written.
path = f"{silver_path}/address"
(
    df_silver_address.write
    .mode("overwrite")
    .format("delta")
    .option("overwriteSchema", "true")
    .save(path)
)

# df_silver_address is lazy, so every action on it re-runs the whole
# transformation and re-evaluates current_timestamp(). Reading back the rows
# that were just written gives the table copy the same processed_timestamp as
# the path copy, and makes the logged count describe what was actually stored.
df_silver_address_saved = spark.read.format("delta").load(path)

spark.sql("""USE adventureworks.silver""")
(
    df_silver_address_saved.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("address")
)

# NOTE(review): if the setup notebook star-imports pyspark.sql.functions, this
# name shadows its `count` function for any later cell — confirm. The name is
# kept so existing references to `count` keep working.
count = df_silver_address_saved.count()
log_etl("address", "silver", "SUCCESS", count)

print(f"Silver Address: {count} registros")
