DATA ANALYSIS PRODUCTS BRONZE --> SILVER

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Spark session shared by every cell below. The three options select the Hadoop
# S3A filesystem implementation, the AWS default credentials provider chain
# (so no keys are hard-coded here), and path-style bucket addressing.
s3a_options = {
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    "spark.hadoop.fs.s3a.path.style.access": "true",
}

session_builder = SparkSession.builder.appName("S3Test")
for option_name, option_value in s3a_options.items():
    session_builder = session_builder.config(option_name, option_value)

# getOrCreate() hands back the already-running session when the kernel has one.
spark = session_builder.getOrCreate()



In [2]:
# Load the bronze-layer products table and take a first look at it.
bronze_products_path = "s3a://pedro-datalake-project/bronze/products/"
df = spark.read.parquet(bronze_products_path)

df.show(n=5)      # preview the first five rows
df.printSchema()  # column names and types as stored in the parquet files
df.count()        # last expression of the cell, so the row count is displayed

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+
|1e9e8ef04dbcff454...|           perfumaria|                 40|                       287|                 1|             225|               16|               10|              14|
|3aa071139cb16b67c...|                artes|                 44|                       276|                 1|            1000|               30|               18|              20|
|96bd76ec8810374ed...|        esporte_lazer|                 46|                       250|    

32951

In [3]:
# Null profile: for each column, isNull() gives a boolean that is cast to 0/1
# and summed, producing the number of NULL values in that column.
null_count_exprs = [
    F.sum(F.col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in df.columns
]

# vertical=True prints one "column | value" line per field instead of a wide row.
df.select(null_count_exprs).show(vertical=True)


-RECORD 0-------------------------
 product_id                 | 0   
 product_category_name      | 0   
 product_name_lenght        | 610 
 product_description_lenght | 610 
 product_photos_qty         | 610 
 product_weight_g           | 2   
 product_length_cm          | 2   
 product_height_cm          | 2   
 product_width_cm           | 2   



In [4]:
# Key-uniqueness check: list every product_id that appears on more than one row.
# An empty result means product_id is unique in the bronze table.
rows_per_product_id = df.groupBy("product_id").count()
duplicated_product_ids = rows_per_product_id.filter(F.col("count") > 1)
duplicated_product_ids.show()


+----------+-----+
|product_id|count|
+----------+-----+
+----------+-----+



In [5]:
# Inspect the raw category labels (up to 50 of them, printed in full) to see
# which spellings and formats need normalising.
distinct_categories = df.select("product_category_name").distinct()
distinct_categories.show(n=50, truncate=False)


+----------------------------------------------+
|product_category_name                         |
+----------------------------------------------+
|pcs                                           |
|bebes                                         |
|artes                                         |
|cine_foto                                     |
|moveis_decoracao                              |
|pc_gamer                                      |
|construcao_ferramentas_construcao             |
|tablets_impressao_imagem                      |
|fashion_roupa_masculina                       |
|artigos_de_festas                             |
|artigos_de_natal                              |
|la_cuisine                                    |
|flores                                        |
|livros_tecnicos                               |
|telefonia_fixa                                |
|construcao_ferramentas_seguranca              |
|cool_stuff                                    |
|eletrodomesticos   

In [6]:
# Descriptive statistics (count, mean, stddev, min, quartiles, max) for the
# physical measurement columns, to spot missing or implausible values.
measurement_columns = [
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
]
df.select(*measurement_columns).summary().show()


+-------+------------------+------------------+------------------+------------------+
|summary|  product_weight_g| product_length_cm| product_height_cm|  product_width_cm|
+-------+------------------+------------------+------------------+------------------+
|  count|             32949|             32949|             32949|             32949|
|   mean|2276.4724877841513| 30.81507784758263|16.937661234028347|23.196728277034204|
| stddev| 4282.038730977024|16.914458054065953|13.637554061749569|12.079047453227794|
|    min|                 0|                 7|                 2|                 6|
|    25%|               300|                18|                 8|                15|
|    50%|               700|                25|                13|                20|
|    75%|              1900|                38|                21|                30|
|    max|             40425|               105|               105|               118|
+-------+------------------+------------------+-------

In [11]:
# Start the silver frame from the bronze data: category labels have surrounding
# spaces trimmed and are lower-cased. Every other column is left unchanged.
category_normalised = F.lower(F.trim(F.col("product_category_name")))
df_silver = df.withColumn("product_category_name", category_normalised)

In [12]:
# Accented character -> plain ASCII character used to normalise category names.
# Only lower-case forms are listed; the category column is lower-cased in the
# cell that first builds df_silver.
replacements = {
    "á": "a", "à": "a", "ã": "a", "â": "a",
    "é": "e", "ê": "e",
    "í": "i",
    "ó": "o", "ô": "o", "õ": "o",
    "ú": "u",
    "ç": "c"
}

# translate() pairs the i-th character of `matching` with the i-th character of
# `replace`, so every key and value must be exactly one character long;
# otherwise the two strings would fall out of alignment.
misaligned = {
    accented: plain
    for accented, plain in replacements.items()
    if len(accented) != 1 or len(plain) != 1
}
if misaligned:
    raise ValueError(
        f"replacements must map single characters to single characters: {misaligned}"
    )

accented_chars = "".join(replacements.keys())
plain_chars = "".join(replacements.values())

# A single translate() call replaces the former loop of one
# withColumn/regexp_replace per character: same result, but one projection and
# one pass over the string instead of twelve of each.
df_silver = df_silver.withColumn(
    "product_category_name",
    F.translate("product_category_name", accented_chars, plain_chars),
)

In [13]:
# Final silver layout: the nine product columns in a fixed order, followed by
# an audit_timestamp column populated with current_timestamp().
silver_product_columns = [
    "product_id",
    "product_category_name",
    "product_name_lenght",
    "product_description_lenght",
    "product_photos_qty",
    "product_weight_g",
    "product_length_cm",
    "product_height_cm",
    "product_width_cm",
]

df_silver = df_silver.select(
    *silver_product_columns,
    F.current_timestamp().alias("audit_timestamp"),
)

In [15]:
# Sanity check of the silver frame: preview rows (20 rows with long values
# truncated, i.e. the defaults of show()), then print the resulting schema.
df_silver.show(n=20, truncate=True)
df_silver.printSchema()

+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+--------------------+
|          product_id|product_category_name|product_name_lenght|product_description_lenght|product_photos_qty|product_weight_g|product_length_cm|product_height_cm|product_width_cm|     audit_timestamp|
+--------------------+---------------------+-------------------+--------------------------+------------------+----------------+-----------------+-----------------+----------------+--------------------+
|1e9e8ef04dbcff454...|           perfumaria|                 40|                       287|                 1|             225|               16|               10|              14|2025-12-12 09:59:...|
|3aa071139cb16b67c...|                artes|                 44|                       276|                 1|            1000|               30|               18|              20|2025-12-12 0