VALIDATED FACT-ORDERS

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Hadoop S3A settings applied to the session:
#   - fs.s3a.impl: filesystem class used for s3a:// URIs
#   - fs.s3a.aws.credentials.provider: credentials come from the AWS default
#     provider chain, so no keys are written in this notebook
#   - fs.s3a.path.style.access: path-style bucket addressing is switched on
s3a_options = {
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    "spark.hadoop.fs.s3a.path.style.access": "true",
}

session_builder = SparkSession.builder.appName("S3Test")
for option_key, option_value in s3a_options.items():
    session_builder = session_builder.config(option_key, option_value)

# getOrCreate() hands back an already-running session when one exists.
spark = session_builder.getOrCreate()



In [7]:
# Load the fact_orders Parquet dataset from the gold prefix of the bucket
# and print the schema Spark reads from it.
fact_orders_path = "s3a://pedro-datalake-project/gold/fact_orders/"
df = spark.read.parquet(fact_orders_path)
df.printSchema()


root
 |-- customer_id: string (nullable = true)
 |-- seller_id: string (nullable = true)
 |-- order_id: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- order_purchase_timestamp: timestamp (nullable = true)
 |-- order_approved_at: timestamp (nullable = true)
 |-- order_delivered_carrier_date: timestamp (nullable = true)
 |-- order_delivered_customer_date: timestamp (nullable = true)
 |-- order_estimated_delivery_date: timestamp (nullable = true)
 |-- order_item_id: integer (nullable = true)
 |-- product_id: string (nullable = true)
 |-- shipping_limit_date: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- freight_value: double (nullable = true)
 |-- total_paid: double (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- payment_installments: integer (nullable = true)
 |-- seller_zip_code_prefix: string (nullable = true)
 |-- seller_city: string (nullable = true)
 |-- seller_state: string (nullable = true)
 |-- customer_sk: int

In [8]:
# Print the first 10 rows without shortening cell values, then leave the
# total row count as the cell's trailing expression so it is displayed.
df.show(n=10, truncate=False)
df.count()

+--------------------------------+--------------------------------+--------------------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+-------------+--------------------------------+-------------------+------+-------------+----------+------------+--------------------+----------------------+---------------------+------------+-----------+-------------+----------+--------------------------+
|customer_id                     |seller_id                       |order_id                        |order_status|order_purchase_timestamp|order_approved_at  |order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|order_item_id|product_id                      |shipping_limit_date|price |freight_value|total_paid|payment_type|payment_installments|seller_zip_code_prefix|seller_city          |seller_state|customer_sk|delivery_days|is_delayed|created_at            

113425

In [3]:
# Number of distinct order_id values in the table.
distinct_order_ids = df.select("order_id").distinct()
distinct_order_ids.count()


99441

In [4]:
# Minimum, maximum and mean of total_paid over every row of the table.
paid = F.col("total_paid")
paid_summary = df.select(
    F.min(paid).alias("min_paid"),
    F.max(paid).alias("max_paid"),
    F.avg(paid).alias("avg_paid"),
)
paid_summary.show()


+--------+--------+-----------------+
|min_paid|max_paid|         avg_paid|
+--------+--------+-----------------+
|     0.0|13664.08|180.4828574703261|
+--------+--------+-----------------+



In [5]:
# Smallest and largest delivery_days values in the table.
delivery_range = df.select(
    F.min("delivery_days").alias("min_days"),
    F.max("delivery_days").alias("max_days"),
)
delivery_range.show()

# Row count for each value of is_delayed.
delay_counts = df.groupBy("is_delayed").count()
delay_counts.show()


+--------+--------+
|min_days|max_days|
+--------+--------+
|       0|     210|
+--------+--------+

+----------+------+
|is_delayed| count|
+----------+------+
|      NULL|  3229|
|      true|  8715|
|     false|101481|
+----------+------+



Some rows (3,229 in the output above) are missing the delivery date or the estimated delivery date.
For these rows, the `is_delayed` flag is null, indicating that the delay
status is not applicable (e.g., canceled or undelivered orders).


In [6]:
# Count rows whose customer_sk is null. The when() expression yields 1 for
# those rows and null otherwise, and count() only tallies non-null values.
missing_sk_marker = F.when(F.col("customer_sk").isNull(), 1)
null_sk_summary = df.select(F.count(missing_sk_marker).alias("null_customer_sk"))
null_sk_summary.show()


+----------------+
|null_customer_sk|
+----------------+
|               0|
+----------------+

