DATA ANALYSIS ITEMS BRONZE --> SILVER

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Hadoop S3A options: filesystem implementation, credentials provider chain,
# and path-style bucket access.
s3a_options = (
    ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
    ("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain"),
    ("spark.hadoop.fs.s3a.path.style.access", "true"),
)

# Apply each option to the builder, then obtain (or reuse) the session.
session_builder = SparkSession.builder.appName("S3Test")
for option_key, option_value in s3a_options:
    session_builder = session_builder.config(option_key, option_value)

spark = session_builder.getOrCreate()



In [2]:
# Load the bronze "items" dataset from S3.
bronze_items_path = "s3a://pedro-datalake-project/bronze/items/"
df = spark.read.parquet(bronze_items_path)

# Quick look: first rows, schema, and total row count (displayed as the cell result).
df.show(n=5)
df.printSchema()
df.count()

+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|00010242fe8c5a6d1...|            1|4244733e06e7ecb49...|48436dade18ac8b2b...|2017-09-19 09:45:35| 58.9|        13.29|
|00018f77f2f0320c5...|            1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|2017-05-03 11:05:13|239.9|        19.93|
|000229ec398224ef6...|            1|c777355d18b72b67a...|5b51032eddd242adc...|2018-01-18 14:48:30|199.0|        17.87|
|00024acbcdf0a6daa...|            1|7634da152a4610f15...|9d7a1d34a50524090...|2018-08-15 10:10:18|12.99|        12.79|
|00042b26cf59d7ce6...|            1|ac6c3623068f30de0...|df560393f3a51e745...|2017-02-13 13:57:51|199.9|        18.14|
+--------------------+-------------+------------

112650

In [3]:
# Per-column null count: cast each isNull flag to 0/1 and sum it.
null_counts = [
    F.sum(F.col(column_name).isNull().cast("int")).alias(column_name)
    for column_name in df.columns
]
df.select(null_counts).show(vertical=True)

-RECORD 0------------------
 order_id            | 0   
 order_item_id       | 0   
 product_id          | 0   
 seller_id           | 0   
 shipping_limit_date | 0   
 price               | 0   
 freight_value       | 0   



In [4]:
# Look for (order_id, order_item_id) pairs that occur more than once.
duplicate_keys = (
    df
    .groupBy("order_id", "order_item_id")
    .count()
    .where("count > 1")
)
duplicate_keys.show(20, truncate=False)


+--------+-------------+-----+
|order_id|order_item_id|count|
+--------+-------------+-----+
+--------+-------------+-----+



In [6]:
# Build the silver frame: keep only rows that carry all three identifying keys,
# then cast the remaining columns to their target types.
df_silver = (
    df
    # Rows missing any of these identifiers are dropped.
    .where(F.col("order_id").isNotNull())
    .where(F.col("product_id").isNotNull())
    .where(F.col("seller_id").isNotNull())
    # Type normalisation.
    .withColumn("order_item_id", F.col("order_item_id").cast("int"))
    .withColumn("shipping_limit_date", F.to_timestamp("shipping_limit_date"))
    .withColumn("price", F.col("price").cast("double"))
    .withColumn("freight_value", F.col("freight_value").cast("double"))
)


In [7]:
# Print the schema of df_silver to confirm the column types after the casts.
df_silver.printSchema()


root
 |-- order_id: string (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)



In [8]:
# Pin the silver column set and its order explicitly.
silver_columns = [
    "order_id",
    "order_item_id",
    "product_id",
    "seller_id",
    "shipping_limit_date",
    "price",
    "freight_value",
]
df_silver = df_silver.select(*silver_columns)


In [9]:
# Add an audit_timestamp column populated by F.current_timestamp().
# NOTE(review): the DataFrame is lazy, so this value is presumably re-evaluated
# on every action (show, write, ...) — confirm before treating it as a fixed load time.
df_silver = df_silver.withColumn("audit_timestamp", F.current_timestamp())


In [10]:
# Preview the first 20 rows with untruncated values, then print the resulting schema.
df_silver.show(20, truncate=False)
df_silver.printSchema()


+--------------------------------+-------------+--------------------------------+--------------------------------+-------------------+------+-------------+--------------------------+
|order_id                        |order_item_id|product_id                      |seller_id                       |shipping_limit_date|price |freight_value|audit_timestamp           |
+--------------------------------+-------------+--------------------------------+--------------------------------+-------------------+------+-------------+--------------------------+
|00010242fe8c5a6d1ba2dd792cb16214|1            |4244733e06e7ecb4970a6e2683c13e61|48436dade18ac8b2bce089ec2a041202|2017-09-19 09:45:35|58.9  |13.29        |2025-12-12 09:33:08.527525|
|00018f77f2f0320c557190d7a144bdd3|1            |e5f2d52b802189ee658865ca93d83a8f|dd7ddc04e1b6c2c614352b383efe2d36|2017-05-03 11:05:13|239.9 |19.93        |2025-12-12 09:33:08.527525|
|000229ec398224ef6ca0657da4fc703e|1            |c777355d18b72b67abbeef9df44fd0fd|5b51