DATA ANALYSIS SELLERS BRONZE --> SILVER

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = (
    SparkSession.builder
    .appName("S3Test")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)



In [13]:
df = spark.read.parquet("s3a://pedro-datalake-project/bronze/sellers/")
df.show(5)
df.printSchema()
df.count()

+--------------------+----------------------+-----------------+------------+
|           seller_id|seller_zip_code_prefix|      seller_city|seller_state|
+--------------------+----------------------+-----------------+------------+
|3442f8959a84dea7e...|                 13023|         campinas|          SP|
|d1b65fc7debc3361e...|                 13844|       mogi guacu|          SP|
|ce3ad9de960102d06...|                 20031|   rio de janeiro|          RJ|
|c0f3eea2e14555b6f...|                  4195|        sao paulo|          SP|
|51a04a8a6bdcb23de...|                 12914|braganca paulista|          SP|
+--------------------+----------------------+-----------------+------------+
only showing top 5 rows

root
 |-- seller_id: string (nullable = true)
 |-- seller_zip_code_prefix: long (nullable = true)
 |-- seller_city: string (nullable = true)
 |-- seller_state: string (nullable = true)



3095

In [14]:
df.select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in df.columns]).show(vertical=True)


-RECORD 0---------------------
 seller_id              | 0   
 seller_zip_code_prefix | 0   
 seller_city            | 0   
 seller_state           | 0   



In [15]:
df.groupBy("seller_id").count().filter("count > 1").show()


+---------+-----+
|seller_id|count|
+---------+-----+
+---------+-----+



In [16]:
df.groupBy("seller_city").count().orderBy(F.desc("count")).show(20, truncate=False)


+---------------------+-----+
|seller_city          |count|
+---------------------+-----+
|sao paulo            |694  |
|curitiba             |127  |
|rio de janeiro       |96   |
|belo horizonte       |68   |
|ribeirao preto       |52   |
|guarulhos            |50   |
|ibitinga             |49   |
|santo andre          |45   |
|campinas             |41   |
|maringa              |40   |
|sao jose do rio preto|33   |
|sao bernardo do campo|32   |
|osasco               |32   |
|sorocaba             |32   |
|porto alegre         |28   |
|brasilia             |28   |
|londrina             |26   |
|goiania              |23   |
|joinville            |22   |
|blumenau             |21   |
+---------------------+-----+
only showing top 20 rows



In [17]:
df.select("seller_state").distinct().show()


+------------+
|seller_state|
+------------+
|          SC|
|          RO|
|          PI|
|          AM|
|          GO|
|          MT|
|          SP|
|          PB|
|          ES|
|          RS|
|          MS|
|          MG|
|          PA|
|          BA|
|          SE|
|          PE|
|          CE|
|          RN|
|          RJ|
|          MA|
+------------+
only showing top 20 rows



In [18]:
df_silver = (
    df
    .withColumn("seller_city", F.lower(F.trim("seller_city")))
    .withColumn("seller_state", F.upper("seller_state"))
    .withColumn("seller_zip_code_prefix", F.format_string("%05d", F.col("seller_zip_code_prefix")))
)


In [19]:
replacements = {
    "á": "a", "à": "a", "ã": "a", "â": "a",
    "é": "e", "ê": "e",
    "í": "i",
    "ó": "o", "ô": "o", "õ": "o",
    "ú": "u",
    "ç": "c"
}

for accented, plain in replacements.items():
    df_silver = df_silver.withColumn(
        "seller_city",
        F.regexp_replace("seller_city", accented, plain)
    )


In [20]:
df_silver.select("seller_city", "seller_state", "seller_zip_code_prefix").show(20, truncate=False)
df_silver.printSchema()


+-----------------+------------+----------------------+
|seller_city      |seller_state|seller_zip_code_prefix|
+-----------------+------------+----------------------+
|campinas         |SP          |13023                 |
|mogi guacu       |SP          |13844                 |
|rio de janeiro   |RJ          |20031                 |
|sao paulo        |SP          |04195                 |
|braganca paulista|SP          |12914                 |
|rio de janeiro   |RJ          |20920                 |
|brejao           |PE          |55325                 |
|penapolis        |SP          |16304                 |
|sao paulo        |SP          |01529                 |
|curitiba         |PR          |80310                 |
|anapolis         |GO          |75110                 |
|itirapina        |SP          |13530                 |
|sao paulo        |SP          |01222                 |
|sao paulo        |SP          |05372                 |
|tubarao          |SC          |88705           

In [21]:
df_silver = df_silver.dropDuplicates(["seller_id"])


In [22]:
df_silver = df_silver.withColumn("audit_timestamp", F.current_timestamp())


In [24]:
df_silver = df_silver.select(
    "seller_id",
    "seller_zip_code_prefix",
    "seller_city",
    "seller_state",
    "audit_timestamp"
)
df_silver.show(20, truncate=False)
df_silver.printSchema()
df_silver.count()


+--------------------------------+----------------------+---------------------+------------+--------------------------+
|seller_id                       |seller_zip_code_prefix|seller_city          |seller_state|audit_timestamp           |
+--------------------------------+----------------------+---------------------+------------+--------------------------+
|0015a82c2db000af6aaaf3ae2ecb0532|09080                 |santo andre          |SP          |2025-12-12 10:03:56.264514|
|001cca7ae9ae17fb1caed9dfb1094831|29156                 |cariacica            |ES          |2025-12-12 10:03:56.264514|
|001e6ad469a905060d959994f1b41e4f|24754                 |sao goncalo          |RJ          |2025-12-12 10:03:56.264514|
|002100f778ceb8431b7a1020ff7ab48f|14405                 |franca               |SP          |2025-12-12 10:03:56.264514|
|003554e2dce176b5555353e4f3555ac8|74565                 |goiania              |GO          |2025-12-12 10:03:56.264514|
|004c9cd9d87a3c30c522c48c4fc07416|14940 

3095