VALIDATED DIM SELLERS

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Hadoop S3A options for the session, so that s3a:// paths can be read.
# Insertion order is preserved, so options are applied in the order listed.
s3a_settings = {
    # Filesystem implementation class bound to the s3a scheme.
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    # Credentials are resolved by the AWS default provider chain, not set here.
    "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    # Enable path-style bucket addressing.
    "spark.hadoop.fs.s3a.path.style.access": "true",
}

session_builder = SparkSession.builder.appName("S3Test")
for option_key, option_value in s3a_settings.items():
    session_builder = session_builder.config(option_key, option_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()



In [7]:
# Load the dim_sellers parquet dataset from the gold area of the data lake,
# then print its schema and a 10-row, untruncated sample for inspection.
dim_sellers_path = "s3a://pedro-datalake-project/gold/dim_sellers/"
df = spark.read.parquet(dim_sellers_path)

df.printSchema()
df.show(n=10, truncate=False)


root
 |-- seller_id: string (nullable = true)
 |-- seller_zip_code_prefix: string (nullable = true)
 |-- seller_city: string (nullable = true)
 |-- seller_state: string (nullable = true)
 |-- audit_timestamp: timestamp (nullable = true)
 |-- region: string (nullable = true)
 |-- created_at: timestamp (nullable = true)

+--------------------------------+----------------------+-----------------+------------+-------------------------+--------+--------------------------+
|seller_id                       |seller_zip_code_prefix|seller_city      |seller_state|audit_timestamp          |region  |created_at                |
+--------------------------------+----------------------+-----------------+------------+-------------------------+--------+--------------------------+
|3442f8959a84dea7ee197c632cb2df15|13023                 |campinas         |SP          |2025-12-09 10:15:51.42234|Sudeste |2025-12-09 11:04:58.166797|
|d1b65fc7debc3361ea86b5f14c68d2e2|13844                 |mogi guacu       |

In [8]:
# NULL check: one output column per checked column, holding its NULL count.
checked_columns = ["seller_id", "seller_state", "region"]

null_counters = []
for column_name in checked_columns:
    is_missing = F.col(column_name).isNull()
    # when() yields a value only on rows where the column is NULL, and count()
    # ignores NULLs, so the aggregate tallies exactly the missing rows.
    null_counters.append(
        F.count(F.when(is_missing, column_name)).alias(column_name)
    )

df.select(null_counters).show()


+---------+------------+------+
|seller_id|seller_state|region|
+---------+------------+------+
|        0|           0|     0|
+---------+------------+------+



In [10]:
# Row count per region, largest first.
region_counts = df.groupBy("region").count()
region_counts.orderBy(F.desc("count")).show()


+------------+-----+
|      region|count|
+------------+-----+
|     Sudeste| 2287|
|         Sul|  668|
|Centro-Oeste|   79|
|    Nordeste|   56|
|       Norte|    5|
+------------+-----+



In [11]:
# Earliest and latest created_at across the whole table, shown untruncated.
created_at_range = df.agg(
    F.min("created_at").alias("min_created_at"),
    F.max("created_at").alias("max_created_at"),
)
created_at_range.show(truncate=False)


+--------------------------+--------------------------+
|min_created_at            |max_created_at            |
+--------------------------+--------------------------+
|2025-12-09 11:04:58.166797|2025-12-09 11:04:58.166797|
+--------------------------+--------------------------+

