# Gold Layer - Dimensões (Star Schema)

In [0]:
%run ./00_Setup_Environment

In [0]:
# Date dimension: one row per calendar day. The SEQUENCE below spans
# 2005-01-01 through 2025-12-31, both ends included.
calendar_days = spark.sql("""
    SELECT 
        EXPLODE(SEQUENCE(DATE'2005-01-01', DATE'2025-12-31', INTERVAL 1 DAY)) AS full_date
""")
full_date = col("full_date")

df_dim_date = calendar_days.select(
    # Integer key in yyyyMMdd form (e.g. 20250131).
    date_format(full_date, "yyyyMMdd").cast("int").alias("date_key"),
    full_date.alias("date"),
    # Numeric calendar parts.
    dayofmonth(full_date).alias("day"),
    month(full_date).alias("month"),
    year(full_date).alias("year"),
    quarter(full_date).alias("quarter"),
    weekofyear(full_date).alias("week"),
    dayofweek(full_date).alias("day_of_week"),
    # Text labels.
    date_format(full_date, "EEEE").alias("day_name"),
    date_format(full_date, "MMMM").alias("month_name"),
    concat(year(full_date), lit("-Q"), quarter(full_date)).alias("year_quarter"),
    # dayofweek() numbers Sunday as 1 and Saturday as 7.
    when(dayofweek(full_date).isin(1, 7), True).otherwise(False).alias("is_weekend"),
)
# Preview: df_dim_date.limit(10).display()

In [0]:
# Persist the date dimension twice: as Delta files under gold_path and as the
# gold.dim_date table, then record the load in the ETL log.
path = f"{gold_path}/dim_date"
df_dim_date.write \
    .mode("overwrite") \
    .format("delta") \
    .option("overwriteSchema", "true") \
    .save(path)

# Reload what was just written and use it for the table write and the logged
# row count. The df_dim_date lineage is then evaluated once instead of three
# times, and the table is guaranteed to hold exactly the rows stored at `path`.
df_dim_date_saved = spark.read.format("delta").load(path)
df_dim_date_saved.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable("gold.dim_date")

log_etl("dim_date", "gold", "SUCCESS", df_dim_date_saved.count())
print("gold.dim_date criada")

In [0]:
# Customer dimension: every row of silver.customer, enriched with the address
# ids from silver.customer_address when a matching row exists (left join).
customers = spark.table("silver.customer").alias("c")
customer_addresses = spark.table("silver.customer_address").alias("ca")

# Surrogate keys are numbered over one global window ordered by customer_id.
# NOTE(review): keys are only stable if customer_id is unique after the join
# (i.e. at most one customer_address row per customer) - confirm.
customer_key_window = Window.orderBy("c.customer_id")

df_dim_customer = (
    customers
    .join(
        customer_addresses,
        on=col("c.customer_id") == col("ca.customer_id"),
        how="left",
    )
    .select(
        row_number().over(customer_key_window).alias("customer_key"),
        col("c.customer_id"),
        col("ca.main_address_id"),
        col("ca.shipping_address_id"),
        col("c.full_name"),
        col("c.email_address"),
        # Validity columns: every row is emitted as the current version with
        # an open-ended (NULL) valid_to.
        col("c.source_modified_date").alias("valid_from"),
        lit(None).cast("timestamp").alias("valid_to"),
        lit(True).alias("is_current"),
    )
)
# Preview: df_dim_customer.limit(10).display()

In [0]:
# Persist the customer dimension twice: as Delta files under gold_path and as
# the gold.dim_customer table, then record the load in the ETL log.
path = f"{gold_path}/dim_customer"
df_dim_customer.write \
    .mode("overwrite") \
    .format("delta") \
    .option("overwriteSchema", "true") \
    .save(path)

# Reload what was just written and use it for the table write and the logged
# row count. Re-evaluating df_dim_customer for each action would recompute
# row_number(); if the window ordering has ties, the two copies could end up
# with different customer_key assignments. Reading back keeps them identical
# and avoids running the join three times.
df_dim_customer_saved = spark.read.format("delta").load(path)
df_dim_customer_saved.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable("gold.dim_customer")

log_etl("dim_customer", "gold", "SUCCESS", df_dim_customer_saved.count())
print("gold.dim_customer criada")

In [0]:
# Product dimension: one output row per row of silver.product, with a
# surrogate key and validity columns derived from product_status.
silver_products = spark.table("silver.product")
product_key_window = Window.orderBy("product_id")

is_discontinued = col("product_status") == "Descontinuado"
is_active = col("product_status") == "Ativo"

df_dim_product = silver_products.select(
    row_number().over(product_key_window).alias("product_key"),
    # Business identifiers and descriptive attributes.
    col("product_id"),
    col("product_number"),
    col("product_name"),
    col("product_description"),
    col("category_name").alias("product_category"),
    col("model_name").alias("product_model"),
    col("color"),
    col("size"),
    col("weight"),
    # Pricing.
    col("list_price"),
    col("standard_cost"),
    col("margin_percentage"),
    col("product_status"),
    # Validity: valid_to is filled only for discontinued products, using the
    # source modification date; it stays NULL for every other status.
    col("source_modified_date").alias("valid_from"),
    when(is_discontinued, col("source_modified_date"))
    .otherwise(lit(None))
    .alias("valid_to"),
    # Explicit when/otherwise so that a NULL status yields False, not NULL.
    when(is_active, True).otherwise(False).alias("is_current"),
)
# Preview: df_dim_product.limit(10).display()

In [0]:
# Persist the product dimension twice: as Delta files under gold_path and as
# the gold.dim_product table, then record the load in the ETL log.
path = f"{gold_path}/dim_product"
df_dim_product.write \
    .mode("overwrite") \
    .format("delta") \
    .option("overwriteSchema", "true") \
    .save(path)

# Reload what was just written and use it for the table write and the logged
# row count. The df_dim_product lineage (including its row_number() window)
# is then evaluated once instead of three times, and the table is guaranteed
# to hold exactly the rows stored at `path`.
df_dim_product_saved = spark.read.format("delta").load(path)
df_dim_product_saved.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable("gold.dim_product")

log_etl("dim_product", "gold", "SUCCESS", df_dim_product_saved.count())
print("gold.dim_product criada")

In [0]:
# Address dimension: every row of silver.address, with the id of the customer
# whose main address it is when silver.customer_address has a match (left
# join on main_address_id).
df_dim_address = (
    spark.table("silver.address").alias("a")
    .join(
        spark.table("silver.customer_address").alias("ca"),
        col("a.address_id") == col("ca.main_address_id"),
        "left",
    )
    .select(
        col("a.address_id"),
        col("ca.customer_id"),
        col("a.address_line1"),
        col("a.address_line2"),
        col("a.city"),
        col("a.state_province"),
        col("a.country_region"),
        col("a.postal_code"),
        col("a.full_address"),
        col("a.is_valid_address"),
        col("a.is_valid_postal_code"),
    )
    # Deduplicate BEFORE assigning the surrogate key. Numbering first gives
    # every row a unique address_key, which makes a later distinct() a no-op
    # and lets exact duplicate rows through.
    .distinct()
    .select(
        # customer_id is a tie-breaker so that key assignment is deterministic
        # when several customers share the same main address.
        row_number()
        .over(Window.orderBy("address_id", "customer_id"))
        .alias("address_key"),
        "*",
    )
)
# Preview: df_dim_address.limit(10).display()

In [0]:
# Persist the address dimension twice: as Delta files under gold_path and as
# the gold.dim_address table, then record the load in the ETL log.
path = f"{gold_path}/dim_address"
df_dim_address.write \
    .mode("overwrite") \
    .format("delta") \
    .option("overwriteSchema", "true") \
    .save(path)

# Reload what was just written and use it for the table write and the logged
# row count. Re-evaluating df_dim_address for each action would recompute
# row_number(); if the window ordering has ties, the two copies could end up
# with different address_key assignments. Reading back keeps them identical
# and avoids running the join three times.
df_dim_address_saved = spark.read.format("delta").load(path)
df_dim_address_saved.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable("gold.dim_address")

log_etl("dim_address", "gold", "SUCCESS", df_dim_address_saved.count())
print("gold.dim_address criada")