VALIDATED FACT-PRODUCT-PERFORMANCE

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Build (or reuse) the Spark session used by every cell below.
# The Hadoop options route s3a:// URIs through the S3A filesystem class,
# name the AWS default credentials provider chain as the credential source,
# and turn on path-style bucket addressing.
s3a_settings = {
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    "spark.hadoop.fs.s3a.path.style.access": "true",
}

session_builder = SparkSession.builder.appName("S3Test")
for conf_key, conf_value in s3a_settings.items():
    # Applied in declaration order, one option per call.
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()



In [3]:
# Load the fact_product_performance Parquet dataset from the gold/ prefix of
# the data-lake bucket, then print its schema and the first 10 rows with
# cell contents left untruncated.
fact_path = "s3a://pedro-datalake-project/gold/fact_product_performance/"
df = spark.read.parquet(fact_path)

df.printSchema()
df.show(n=10, truncate=False)


root
 |-- product_id: string (nullable = true)
 |-- product_category_name: string (nullable = true)
 |-- total_orders: long (nullable = true)
 |-- total_customers: long (nullable = true)
 |-- total_revenue: double (nullable = true)
 |-- total_freight: double (nullable = true)
 |-- avg_price: double (nullable = true)
 |-- avg_freight: double (nullable = true)
 |-- late_deliveries: long (nullable = true)
 |-- pct_late: double (nullable = true)
 |-- created_at: timestamp (nullable = true)

+--------------------------------+---------------------------------+------------+---------------+------------------+------------------+------------------+------------------+---------------+--------------------+--------------------------+
|product_id                      |product_category_name            |total_orders|total_customers|total_revenue     |total_freight     |avg_price         |avg_freight       |late_deliveries|pct_late            |created_at                |
+-------------------------------

In [4]:
# Uniqueness check: show the number of distinct product_id values next to the
# total row count. Equal numbers mean no product_id value is repeated.
distinct_product_ids = df.select("product_id").distinct().count()
total_rows = df.count()
(distinct_product_ids, total_rows)


(32952, 32952)

In [5]:
# Range check on total_revenue: minimum, maximum and mean over the whole table.
revenue_stats = [
    F.min("total_revenue").alias("min_revenue"),
    F.max("total_revenue").alias("max_revenue"),
    F.avg("total_revenue").alias("avg_revenue"),
]
df.agg(*revenue_stats).show()


+-----------+-----------+-----------------+
|min_revenue|max_revenue|      avg_revenue|
+-----------+-----------+-----------------+
|        2.2|    63885.0|412.4804618979827|
+-----------+-----------+-----------------+



In [6]:
# Range check on pct_late: smallest and largest value over the whole table.
pct_late_bounds = [
    F.min("pct_late").alias("min_pct_late"),
    F.max("pct_late").alias("max_pct_late"),
]
df.agg(*pct_late_bounds).show()


+------------+------------+
|min_pct_late|max_pct_late|
+------------+------------+
|         0.0|         1.0|
+------------+------------+



In [7]:
# Null audit: one output column per audited field, holding the number of rows
# in which that field is NULL.
audited_columns = ["product_id", "total_orders", "pct_late"]

null_counts = [
    # when() yields a non-null marker only for NULL rows (no otherwise() branch,
    # so every other row yields NULL); count() then counts just the markers.
    F.count(F.when(F.col(column_name).isNull(), F.lit(1))).alias(column_name)
    for column_name in audited_columns
]
df.select(*null_counts).show()


+----------+------------+--------+
|product_id|total_orders|pct_late|
+----------+------------+--------+
|         1|           0|       0|
+----------+------------+--------+



In [8]:
# Row count per product_category_name, largest groups first.
# show() with no arguments prints only the first 20 rows.
category_counts = df.groupBy("product_category_name").count()
category_counts.orderBy(F.desc("count")).show()


+---------------------+-----+
|product_category_name|count|
+---------------------+-----+
|      cama_mesa_banho| 3029|
|        esporte_lazer| 2867|
|     moveis_decoracao| 2657|
|         beleza_saude| 2444|
| utilidades_domest...| 2335|
|           automotivo| 1900|
| informatica_acess...| 1639|
|           brinquedos| 1411|
|   relogios_presentes| 1329|
|            telefonia| 1134|
|                bebes|  919|
|           perfumaria|  868|
| fashion_bolsas_e_...|  849|
|            papelaria|  849|
|           cool_stuff|  789|
|   ferramentas_jardim|  753|
|             pet_shop|  719|
|                     |  610|
|          eletronicos|  517|
| construcao_ferram...|  400|
+---------------------+-----+
only showing top 20 rows

