Create the sellers dimension table (dim_sellers)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Hadoop S3A settings applied to the session: the S3A filesystem
# implementation, the default AWS credentials provider chain (so no keys are
# hardcoded in the notebook), and path-style bucket access.
s3a_options = {
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    "spark.hadoop.fs.s3a.path.style.access": "true",
}

# Start from a named builder and layer each option on top of it.
session_builder = SparkSession.builder.appName("S3Test")
for option_key, option_value in s3a_options.items():
    session_builder = session_builder.config(option_key, option_value)

# Reuses an already-running session if one exists, otherwise creates it.
spark = session_builder.getOrCreate()



In [2]:
# Location of the sellers dataset in the silver layer of the data lake.
silver_sellers_path = "s3a://pedro-datalake-project/silver/sellers/"

# Read the Parquet files under that prefix into a DataFrame.
silver_sellers = spark.read.parquet(silver_sellers_path)


In [3]:
# State codes grouped by the region they belong to. The listing order is kept
# so that the flattened `regions` dict has the same key order as before.
states_by_region = {
    "Norte": ["AC", "AM", "RO", "RR", "PA", "AP", "TO"],
    "Nordeste": ["MA", "PI", "CE", "RN", "PB", "PE", "AL", "SE", "BA"],
    "Sudeste": ["SP", "RJ", "MG", "ES"],
    "Sul": ["PR", "SC", "RS"],
    "Centro-Oeste": ["MT", "MS", "GO", "DF"],
}

# Invert to a flat lookup: state code -> region name.
regions = {
    state_code: region_name
    for region_name, state_codes in states_by_region.items()
    for state_code in state_codes
}

# create_map expects alternating key/value columns: key1, value1, key2, ...
map_entries = []
for state_code, region_name in regions.items():
    map_entries.append(F.lit(state_code))
    map_entries.append(F.lit(region_name))

region_expr = F.create_map(map_entries)

# Look up each seller's region from its state code, then stamp the rows with
# the timestamp at which this dimension was built.
sellers_with_region = silver_sellers.withColumn(
    "region", region_expr[F.col("seller_state")]
)
dim_sellers = sellers_with_region.withColumn("created_at", F.current_timestamp())


In [4]:
# Preview the first 10 rows without truncating long column values.
dim_sellers.show(n=10, truncate=False)

# Print the column names and types of the resulting dimension.
dim_sellers.printSchema()

# Last expression of the cell, so the total row count is displayed as output.
dim_sellers.count()


+--------------------------------+----------------------+-----------------+------------+--------------------------+--------+--------------------------+
|seller_id                       |seller_zip_code_prefix|seller_city      |seller_state|audit_timestamp           |region  |created_at                |
+--------------------------------+----------------------+-----------------+------------+--------------------------+--------+--------------------------+
|3442f8959a84dea7ee197c632cb2df15|13023                 |campinas         |SP          |2025-12-12 10:20:33.086415|Sudeste |2025-12-12 11:41:11.474076|
|d1b65fc7debc3361ea86b5f14c68d2e2|13844                 |mogi guacu       |SP          |2025-12-12 10:20:33.086415|Sudeste |2025-12-12 11:41:11.474076|
|ce3ad9de960102d0677a81f5d0bb7b2d|20031                 |rio de janeiro   |RJ          |2025-12-12 10:20:33.086415|Sudeste |2025-12-12 11:41:11.474076|
|c0f3eea2e14555b6faeea3dd58c1b1c3|04195                 |sao paulo        |SP          |

3095