DATA ANALYSIS PAYMENTS BRONZE --> SILVER

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Create (or reuse) the Spark session used by the cells below, set up to talk
# to S3 through the s3a connector.
spark = (
    SparkSession.builder.appName("S3Test")
    # Filesystem implementation used for the s3a connector.
    .config(
        "spark.hadoop.fs.s3a.impl",
        "org.apache.hadoop.fs.s3a.S3AFileSystem",
    )
    # Credentials are resolved by the AWS default provider chain.
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    )
    # Path-style access is switched on.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)



In [2]:
# Load the bronze payments parquet data, then take a first look:
# a 5-row sample, the schema, and the total row count.
df = spark.read.parquet(
    "s3a://pedro-datalake-project/bronze/payments/"
)
df.show(n=5)
df.printSchema()
df.count()

+--------------------+------------------+------------+--------------------+-------------+
|            order_id|payment_sequential|payment_type|payment_installments|payment_value|
+--------------------+------------------+------------+--------------------+-------------+
|b81ef226f3fe1789b...|                 1| credit_card|                   8|        99.33|
|a9810da82917af2d9...|                 1| credit_card|                   1|        24.39|
|25e8ea4e93396b6fa...|                 1| credit_card|                   1|        65.71|
|ba78997921bbcdc13...|                 1| credit_card|                   8|       107.78|
|42fdf880ba16b47b5...|                 1| credit_card|                   2|       128.45|
+--------------------+------------------+------------+--------------------+-------------+
only showing top 5 rows

root
 |-- order_id: string (nullable = true)
 |-- payment_sequential: long (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: l

103886

In [3]:
# Count NULLs per column: each column becomes a 0/1 "is null" flag that is
# summed, and the result is printed vertically (one line per column).
df.select(
    *(
        F.sum(F.col(name).isNull().cast("int")).alias(name)
        for name in df.columns
    )
).show(vertical=True)


-RECORD 0-------------------
 order_id             | 0   
 payment_sequential   | 0   
 payment_type         | 0   
 payment_installments | 0   
 payment_value        | 0   



In [6]:
# List every (order_id, payment_sequential) pair that occurs more than once;
# an empty table means no pair is repeated.
(
    df.groupBy("order_id", "payment_sequential")
    .count()
    .where(F.col("count") > 1)
    .show()
)


+--------+------------------+-----+
|order_id|payment_sequential|count|
+--------+------------------+-----+
+--------+------------------+-----+



In [7]:
# Row count for each distinct payment_type value.
(
    df
    .groupBy("payment_type")
    .count()
    .show()
)


+------------+-----+
|payment_type|count|
+------------+-----+
|      boleto|19784|
| not_defined|    3|
| credit_card|76795|
|     voucher| 5775|
|  debit_card| 1529|
+------------+-----+



In [8]:
# Silver-layer cleanup of the bronze frame:
#   - drop rows that have no order_id,
#   - normalise payment_type (trim surrounding whitespace, lower-case),
#   - cast payment_installments to int and payment_value to double.
df_silver = (
    df.where(F.col("order_id").isNotNull())
    .withColumn(
        "payment_type",
        F.lower(F.trim(F.col("payment_type"))),
    )
    .withColumn(
        "payment_installments",
        F.col("payment_installments").cast("int"),
    )
    .withColumn(
        "payment_value",
        F.col("payment_value").cast("double"),
    )
)


In [9]:
# Keep a single row for each (order_id, payment_sequential) pair.
df_silver = df_silver.drop_duplicates(
    subset=["order_id", "payment_sequential"],
)


In [10]:
# Add an audit column holding Spark's current timestamp.
df_silver = df_silver.withColumn(
    colName="audit_timestamp",
    col=F.current_timestamp(),
)


In [11]:
# Pin the final column set and order of the silver frame, then preview
# 20 untruncated rows and the resulting schema.
df_silver = df_silver.select(
    [
        "order_id",
        "payment_sequential",
        "payment_type",
        "payment_installments",
        "payment_value",
        "audit_timestamp",
    ]
)
df_silver.show(n=20, truncate=False)
df_silver.printSchema()


+--------------------------------+------------------+------------+--------------------+-------------+--------------------------+
|order_id                        |payment_sequential|payment_type|payment_installments|payment_value|audit_timestamp           |
+--------------------------------+------------------+------------+--------------------+-------------+--------------------------+
|00010242fe8c5a6d1ba2dd792cb16214|1                 |credit_card |2                   |72.19        |2025-12-12 10:09:19.070926|
|000aed2e25dbad2f9ddb70584c5a2ded|1                 |credit_card |1                   |152.77       |2025-12-12 10:09:19.070926|
|000e562887b1f2006d75e0be9558292e|1                 |credit_card |4                   |41.11        |2025-12-12 10:09:19.070926|
|000f25f4d72195062c040b12dce9a18a|1                 |credit_card |1                   |164.39       |2025-12-12 10:09:19.070926|
|001021efaa8636c29475e7734483457d|1                 |credit_card |3                   |64.1      