DATA ANALYSIS CATEGORY BRONZE --> SILVER

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Hadoop S3A options applied to the session: filesystem implementation class,
# AWS credentials provider class, and path-style bucket addressing.
_s3a_options = {
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    "spark.hadoop.fs.s3a.path.style.access": "true",
}

# Apply the options one by one, in the order listed above.
_builder = SparkSession.builder.appName("S3Test")
for _option_key, _option_value in _s3a_options.items():
    _builder = _builder.config(_option_key, _option_value)

# Session handle used by every later cell in this notebook.
spark = _builder.getOrCreate()



In [3]:
# Location of the bronze-layer category data, read through the s3a connector.
bronze_category_path = "s3a://pedro-datalake-project/bronze/category/"
df = spark.read.format("parquet").load(bronze_category_path)

# Quick profile of the raw frame: a 5-row sample, the schema, then the row count
# (the count is the cell's displayed value).
df.show(n=5)
df.printSchema()
df.count()

+--------------------+--------------------+
|                col0|                col1|
+--------------------+--------------------+
|product_category_...|product_category_...|
|        beleza_saude|       health_beauty|
|informatica_acess...|computers_accesso...|
|          automotivo|                auto|
|     cama_mesa_banho|      bed_bath_table|
+--------------------+--------------------+
only showing top 5 rows

root
 |-- col0: string (nullable = true)
 |-- col1: string (nullable = true)



72

In [4]:
# Build one aggregate per column: isNull() cast to 0/1 and summed yields the
# number of NULLs in that column; the result keeps the original column name.
null_count_exprs = []
for column_name in df.columns:
    is_null_flag = F.col(column_name).isNull().cast("int")
    null_count_exprs.append(F.sum(is_null_flag).alias(column_name))

# Single-row result, printed vertically (one line per column).
df.select(null_count_exprs).show(vertical=True)


-RECORD 0---
 col0 | 0   
 col1 | 0   



In [8]:
# Rows per distinct col1 value; keeping only counts above 1 lists the
# duplicated values (an empty result means col1 has no repeats).
col1_counts = df.groupBy("col1").count()
dupes = col1_counts.filter(F.col("count") > 1)
dupes.show()


+----+-----+
|col1|count|
+----+-----+
+----+-----+



In [11]:
# Same duplicate check as for col1, this time on col0: count rows per value
# and keep only values that appear more than once.
col0_counts = df.groupBy("col0").count()
dupes2 = col0_counts.filter(F.col("count") > 1)
dupes2.show()


+----+-----+
|col0|count|
+----+-----+
+----+-----+



In [17]:
# The bronze data carries the source header line as an ordinary row
# (col0 = "product_category_name", col1 = "product_category_name_english").
# A row is treated as header-like when either column holds its header text.
is_header_row = (
    (F.col("col0") == "product_category_name")
    | (F.col("col1") == "product_category_name_english")
)

# Keep everything that is not header-like. Rows where the comparison evaluates
# to NULL (a NULL in a compared column) are not kept by filter().
df_clean = df.filter(~is_header_row)

In [18]:
# Keep only col1 and give it a descriptive name.
# NOTE(review): col0 is dropped here, so the result no longer carries the
# value that was paired with each English name — confirm this is intended.
df_sel = (
    df_clean
    .select("col1")
    .withColumnRenamed("col1", "category_name_english")
)

df_sel.show(n=10)

+---------------------+
|category_name_english|
+---------------------+
|        health_beauty|
| computers_accesso...|
|                 auto|
|       bed_bath_table|
|      furniture_decor|
|       sports_leisure|
|            perfumery|
|           housewares|
|            telephony|
|        watches_gifts|
+---------------------+
only showing top 10 rows



In [19]:
# Normalise the category label in a single expression: lowercase first, then
# remove every character that is not a lowercase letter, digit or underscore.
normalised_name = F.regexp_replace(
    F.lower(F.col("category_name_english")),
    "[^a-z0-9_]",
    "",
)
df_pad = df_sel.withColumn("category_name_english", normalised_name)

df_pad.show(n=10, truncate=False)


+---------------------+
|category_name_english|
+---------------------+
|health_beauty        |
|computers_accessories|
|auto                 |
|bed_bath_table       |
|furniture_decor      |
|sports_leisure       |
|perfumery            |
|housewares           |
|telephony            |
|watches_gifts        |
+---------------------+
only showing top 10 rows



In [20]:
# List the distinct normalised category names (at most 50 rows, full width).
distinct_categories = df_pad.select("category_name_english").distinct()
distinct_categories.show(n=50, truncate=False)


+---------------------------------------+
|category_name_english                  |
+---------------------------------------+
|art                                    |
|flowers                                |
|home_construction                      |
|fashion_male_clothing                  |
|kitchen_dining_laundry_garden_furniture|
|small_appliances                       |
|la_cuisine                             |
|bed_bath_table                         |
|signaling_and_security                 |
|office_furniture                       |
|computers                              |
|watches_gifts                          |
|auto                                   |
|fashion_bags_accessories               |
|construction_tools_lights              |
|cool_stuff                             |
|cds_dvds_musicals                      |
|food                                   |
|computers_accessories                  |
|perfumery                              |
|pet_shop                         

In [21]:
# Add an audit column holding the timestamp at which the query is evaluated.
audit_ts = F.current_timestamp()
df_silver_test = df_pad.withColumn("audit_timestamp", audit_ts)

df_silver_test.show(truncate=False)

+------------------------+--------------------------+
|category_name_english   |audit_timestamp           |
+------------------------+--------------------------+
|health_beauty           |2025-12-12 09:16:31.245819|
|computers_accessories   |2025-12-12 09:16:31.245819|
|auto                    |2025-12-12 09:16:31.245819|
|bed_bath_table          |2025-12-12 09:16:31.245819|
|furniture_decor         |2025-12-12 09:16:31.245819|
|sports_leisure          |2025-12-12 09:16:31.245819|
|perfumery               |2025-12-12 09:16:31.245819|
|housewares              |2025-12-12 09:16:31.245819|
|telephony               |2025-12-12 09:16:31.245819|
|watches_gifts           |2025-12-12 09:16:31.245819|
|food_drink              |2025-12-12 09:16:31.245819|
|baby                    |2025-12-12 09:16:31.245819|
|stationery              |2025-12-12 09:16:31.245819|
|tablets_printing_image  |2025-12-12 09:16:31.245819|
|toys                    |2025-12-12 09:16:31.245819|
|fixed_telephony         |20