In [0]:
%run ./00_Setup_Environment

In [0]:
# Load the bronze-layer source tables used to build the silver product table.
# The target names on the left line up one-to-one with the table names below.
(
    df_product,
    df_category,
    df_model,
    df_description,
    df_model_desc,
) = [
    spark.table(bronze_table)
    for bronze_table in (
        "bronze.product",
        "bronze.product_category",
        "bronze.product_model",
        "bronze.product_description",
        "bronze.product_model_product_description",
    )
]


In [0]:
#df_model_desc.display()



In [0]:
# Get the English product descriptions (ProductModelID -> product_description).
#
# The bridge table is restricted to the English culture *before* joining to the
# description text. Without this filter every translation of a model's
# description is kept, and the later left join on ProductModelID emits one
# product row per translation instead of one row per product.
#
# NOTE(review): assumes bronze.product_model_product_description keeps the
# AdventureWorks `Culture` column (a fixed-width code, hence the trim) —
# confirm against the bronze schema.
df_product_descriptions = (
    df_model_desc.alias("pmd")
    .where(trim(col("pmd.Culture")) == "en")
    .join(
        df_description.alias("pd"),
        col("pmd.ProductDescriptionID") == col("pd.ProductDescriptionID"),
    )
    .select(
        col("pmd.ProductModelID"),
        col("pd.Description").alias("product_description"),
    )
)
#df_product_descriptions.display()

In [0]:
#print("Número de descrições com '?': ",df_product_descriptions.where(col('product_description').contains('?')).count())
#print("Total de registros: ", df_product_descriptions.select('ProductModelID').distinct().count())

In [0]:
#df_product_descriptions.where(col('product_description').contains('?')).groupBy('ProductModelID').count().display()


In [0]:
from pyspark.sql.functions import regexp_replace

def _comma_to_dot_double(column_name):
    """Return the column with every ',' replaced by '.' and cast to double."""
    return regexp_replace(col(column_name), ',', '.').cast("double")


# Numeric expressions used for list_price / standard_cost in the select below.
list_price_col = _comma_to_dot_double("p.ListPrice")
standard_cost_col = _comma_to_dot_double("p.StandardCost")

# Status label derived from the two end-date columns:
#   both dates null          -> "Ativo"
#   SellEndDate is not null  -> "Descontinuado"
#   anything else            -> "Inativo"
product_status_col = (
    when(col("p.SellEndDate").isNull() & col("p.DiscontinuedDate").isNull(), "Ativo")
    .when(col("p.SellEndDate").isNotNull(), "Descontinuado")
    .otherwise("Inativo")
    .alias("product_status")
)

# Product rows enriched with category, model and description.
# All joins are left joins, so products without a match are kept.
product_enriched = (
    df_product.alias("p")
    .join(
        df_category.alias("pc"),
        on=col("p.ProductCategoryID") == col("pc.ProductCategoryID"),
        how="left",
    )
    .join(
        df_model.alias("pm"),
        on=col("p.ProductModelID") == col("pm.ProductModelID"),
        how="left",
    )
    .join(
        df_product_descriptions.alias("pd"),
        on=col("p.ProductModelID") == col("pd.ProductModelID"),
        how="left",
    )
)

# Output columns, in the order they appear in the silver table.
silver_product_columns = [
    # Identifiers
    col("p.ProductID").alias("product_id"),
    col("p.ProductNumber").alias("product_number"),
    # Name and description (names are trimmed and title-cased)
    initcap(trim(col("p.Name"))).alias("product_name"),
    col("pd.product_description"),
    initcap(trim(col("pc.Name"))).alias("category_name"),
    initcap(trim(col("pm.Name"))).alias("model_name"),
    # Characteristics
    trim(col("p.Color")).alias("color"),
    trim(col("p.Size")).alias("size"),
    _comma_to_dot_double("p.Weight").alias("weight"),
    # Prices and costs
    list_price_col.alias("list_price"),
    standard_cost_col.alias("standard_cost"),
    # Product status
    product_status_col,
    # Date columns: cast only, no alias is applied here.
    col("p.SellStartDate").cast("date"),
    col("p.SellEndDate").cast("date"),
    col("p.DiscontinuedDate").cast("date"),
    # Metadata
    col("p.ModifiedDate").cast("timestamp").alias("source_modified_date"),
    current_timestamp().alias("processed_timestamp"),
]

# Main transformation
df_silver_product = product_enriched.select(*silver_product_columns)

In [0]:
# Preview the transformed silver product DataFrame in the notebook output.
df_silver_product.display()

In [0]:
# Persist the silver product data to the Delta path and to the catalog table,
# then record the run in the ETL log.
path = f"{silver_path}/product"
(
    df_silver_product.write
    .mode("overwrite")
    .format("delta")
    .option("overwriteSchema", "true")
    .save(path)
)

# Re-read what was just written and use it for the table write and the count.
# df_silver_product is lazy: writing it twice and then counting it would run
# the whole join pipeline three times, and current_timestamp() would be
# re-evaluated per run, leaving different processed_timestamp values in the
# path copy and the table copy.
df_silver_product_saved = spark.read.format("delta").load(path)

spark.sql("""USE adventureworks.silver""")
(
    df_silver_product_saved.write
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("product")
)

# Named explicitly instead of `count` so the notebook does not rebind a generic
# global; that name would shadow pyspark.sql.functions.count if the setup
# notebook imports it unqualified.
silver_product_count = df_silver_product_saved.count()
log_etl("product", "silver", "SUCCESS", silver_product_count)

print(f"Silver Product: {silver_product_count} registros")