VALIDATED FACT-SELLER-PERFORMANCE

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Hadoop S3A options applied to the session, in order: the filesystem
# implementation class, the AWS credentials provider class, and path-style
# bucket addressing.
s3a_options = [
    ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain"),
    ("spark.hadoop.fs.s3a.path.style.access", "true"),
]

# Start from a named builder and layer each option onto it.
session_builder = SparkSession.builder.appName("S3Test")
for option_key, option_value in s3a_options:
    session_builder = session_builder.config(option_key, option_value)

# getOrCreate() hands back an existing session when there is one, so
# re-running this cell does not start a second session.
spark = session_builder.getOrCreate()



In [2]:
# Location of the fact_seller_performance table in the gold layer.
fact_path = "s3a://pedro-datalake-project/gold/fact_seller_performance/"

# Read the Parquet dataset, then show its schema followed by the first
# ten rows with column values left untruncated.
df = spark.read.parquet(fact_path)
df.printSchema()
df.show(n=10, truncate=False)


root
 |-- seller_id: string (nullable = true)
 |-- seller_city: string (nullable = true)
 |-- seller_state: string (nullable = true)
 |-- region: string (nullable = true)
 |-- total_orders: long (nullable = true)
 |-- total_customers: long (nullable = true)
 |-- total_revenue: double (nullable = true)
 |-- total_freight: double (nullable = true)
 |-- avg_delivery_time: double (nullable = true)
 |-- late_deliveries: long (nullable = true)
 |-- avg_order_value: double (nullable = true)
 |-- on_time_rate: double (nullable = true)
 |-- created_at: timestamp (nullable = true)

+--------------------------------+--------------+------------+-------+------------+---------------+------------------+------------------+------------------+---------------+------------------+------------------+--------------------------+
|seller_id                       |seller_city   |seller_state|region |total_orders|total_customers|total_revenue     |total_freight     |avg_delivery_time |late_deliveries|avg_order_v

In [3]:
# Key-uniqueness check: every row should carry its own seller_id, so the
# number of distinct seller_id values must equal the number of rows.
distinct_sellers = df.select("seller_id").distinct().count()
total_rows = df.count()

# Fail loudly on a mismatch instead of relying on a reader comparing the two
# numbers by eye.
# NOTE: distinct() keeps NULL as a value of its own, so a single NULL
# seller_id still passes this check; nulls need a separate audit.
assert distinct_sellers == total_rows, (
    f"seller_id is not unique: {distinct_sellers} distinct values across {total_rows} rows"
)

# Keep the (distinct, total) pair as the cell's displayed result.
distinct_sellers, total_rows


(3096, 3096)

In [4]:
# Smallest, largest and mean total_orders over the whole table, printed as a
# single-row summary.
orders_col = F.col("total_orders")
df.agg(
    F.min(orders_col).alias("min_orders"),
    F.max(orders_col).alias("max_orders"),
    F.avg(orders_col).alias("avg_orders"),
).show()


+----------+----------+----------------+
|min_orders|max_orders|      avg_orders|
+----------+----------+----------------+
|         1|      2033|36.6359819121447|
+----------+----------+----------------+



In [5]:
# Lower and upper bound of on_time_rate across all rows, printed as a
# single-row summary.
rate_bounds = [
    F.min("on_time_rate").alias("min_rate"),
    F.max("on_time_rate").alias("max_rate"),
]
df.agg(*rate_bounds).show()


+--------+--------+
|min_rate|max_rate|
+--------+--------+
|     0.0|     1.0|
+--------+--------+



In [6]:
# Null audit: one output column per audited field, holding the number of rows
# in which that field is NULL.
audited_columns = ["seller_id", "total_orders", "on_time_rate"]

null_count_exprs = []
for column_name in audited_columns:
    is_missing = F.col(column_name).isNull()
    # when() has no otherwise() branch, so rows that are not missing evaluate
    # to NULL and are skipped by count(); only the missing rows are tallied.
    null_count_exprs.append(
        F.count(F.when(is_missing, column_name)).alias(column_name)
    )

df.select(null_count_exprs).show()


+---------+------------+------------+
|seller_id|total_orders|on_time_rate|
+---------+------------+------------+
|        1|           0|           0|
+---------+------------+------------+



In [7]:
# Number of rows per region, largest group first.
region_counts = df.groupBy("region").count()
region_counts.orderBy(F.desc("count")).show()


+------------+-----+
|      region|count|
+------------+-----+
|     Sudeste| 2287|
|         Sul|  668|
|Centro-Oeste|   79|
|    Nordeste|   56|
|       Norte|    5|
|        NULL|    1|
+------------+-----+

