CREATED FACT PRODUCT PERFORMANCE

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = (
    SparkSession.builder
    .appName("S3Test")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)



In [3]:
fact_orders = spark.read.parquet(
    "s3a://pedro-datalake-project/gold/fact_orders/"
)

products_dim = spark.read.parquet(
    "s3a://pedro-datalake-project/gold/dim_products/"
)


In [4]:
fact_with_product = (
    fact_orders.alias("f")
    .join(
        products_dim.select(
            "product_id",
            "product_category_name"
        ),
        "product_id",
        "left"
    )
)
fact_with_product.show(10)   

+--------------------+--------------------+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+-------------------+------+-------------+----------+------------+--------------------+----------------------+--------------------+------------+-----------+-------------+----------+--------------------+---------------------+
|          product_id|         customer_id|           seller_id|            order_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|order_item_id|shipping_limit_date| price|freight_value|total_paid|payment_type|payment_installments|seller_zip_code_prefix|         seller_city|seller_state|customer_sk|delivery_days|is_delayed|          created_at|product_category_name|
+--------------------+--------------------+-----------------

In [5]:
fact_product_performance = (
    fact_with_product
    .groupBy("product_id", "product_category_name")
    .agg(
        F.count("order_id").alias("total_orders"),
        F.countDistinct("customer_id").alias("total_customers"),
        F.sum("price").alias("total_revenue"),
        F.sum("freight_value").alias("total_freight"),
        F.avg("price").alias("avg_price"),
        F.avg("freight_value").alias("avg_freight"),
        F.sum(
            F.when(F.col("is_delayed") == True, 1).otherwise(0)
        ).alias("late_deliveries")
    )
)
fact_product_performance.show()

+--------------------+---------------------+------------+---------------+------------------+------------------+------------------+------------------+---------------+
|          product_id|product_category_name|total_orders|total_customers|     total_revenue|     total_freight|         avg_price|       avg_freight|late_deliveries|
+--------------------+---------------------+------------+---------------+------------------+------------------+------------------+------------------+---------------+
|2511e504fd7794c66...|      cama_mesa_banho|           5|              5|             839.5|            100.72|             167.9|            20.144|              0|
|fc1d8637c0268af3d...|      cama_mesa_banho|          57|             54| 8504.300000000001| 951.5899999999998| 149.1982456140351|16.694561403508768|              6|
|36f60d45225e60c7d...| informatica_acess...|         127|            111|11240.960000000003|1577.2499999999998| 88.51149606299215|12.419291338582676|             11|
|f4b

In [6]:
fact_product_performance = (
    fact_product_performance
    .withColumn(
        "pct_late",
        F.col("late_deliveries") / F.col("total_orders")
    )
    .withColumn("created_at", F.current_timestamp())
)
fact_product_performance.show()

+--------------------+---------------------+------------+---------------+------------------+------------------+------------------+------------------+---------------+-------------------+--------------------+
|          product_id|product_category_name|total_orders|total_customers|     total_revenue|     total_freight|         avg_price|       avg_freight|late_deliveries|           pct_late|          created_at|
+--------------------+---------------------+------------+---------------+------------------+------------------+------------------+------------------+---------------+-------------------+--------------------+
|2511e504fd7794c66...|      cama_mesa_banho|           5|              5|             839.5|            100.72|             167.9|            20.144|              0|                0.0|2025-12-12 12:35:...|
|fc1d8637c0268af3d...|      cama_mesa_banho|          57|             54| 8504.300000000001| 951.5899999999998| 149.1982456140351|16.694561403508768|              6|0.10526

In [7]:
fact_product_performance.printSchema()
fact_product_performance.count()


root
 |-- product_id: string (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- total_orders: long (nullable = false)
 |-- total_customers: long (nullable = false)
 |-- total_revenue: double (nullable = true)
 |-- total_freight: double (nullable = true)
 |-- avg_price: double (nullable = true)
 |-- avg_freight: double (nullable = true)
 |-- late_deliveries: long (nullable = true)
 |-- pct_late: double (nullable = true)
 |-- created_at: timestamp (nullable = false)



32952