DATA ANALYSIS ORDERS BRONZE --> SILVER

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = (
    SparkSession.builder
    .appName("S3Test")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)



In [2]:
df = spark.read.parquet("s3a://pedro-datalake-project/bronze/orders/")
df.show(5)
df.printSchema()
df.count()

+--------------------+--------------------+------------+--------------------+-------------------+--------------------+--------------------+--------------------+
|                col0|                col1|        col2|                col3|               col4|                col5|                col6|                col7|
+--------------------+--------------------+------------+--------------------+-------------------+--------------------+--------------------+--------------------+
|            order_id|         customer_id|order_status|order_purchase_ti...|  order_approved_at|order_delivered_c...|order_delivered_c...|order_estimated_d...|
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered| 2017-10-02 10:56:33|2017-10-02 11:07:15| 2017-10-04 19:55:00| 2017-10-10 21:25:13| 2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered| 2018-07-24 20:41:37|2018-07-26 03:24:27| 2018-07-26 14:31:00| 2018-08-07 15:27:45| 2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b0

99442

In [3]:
df.select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in df.columns]).show(vertical=True)


-RECORD 0---
 col0 | 0   
 col1 | 0   
 col2 | 0   
 col3 | 0   
 col4 | 0   
 col5 | 0   
 col6 | 0   
 col7 | 0   



In [4]:
df.groupBy("col0").count().filter("count > 1").show()


+----+-----+
|col0|count|
+----+-----+
+----+-----+



In [5]:
df_clean = df.filter(df["col0"] != "order_id")
df_clean.show(5)


+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+-------------------+
|                col0|                col1|     col2|               col3|               col4|               col5|               col6|               col7|
+--------------------+--------------------+---------+-------------------+-------------------+-------------------+-------------------+-------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|delivered|2017-10-02 10:56:33|2017-10-02 11:07:15|2017-10-04 19:55:00|2017-10-10 21:25:13|2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|delivered|2018-07-24 20:41:37|2018-07-26 03:24:27|2018-07-26 14:31:00|2018-08-07 15:27:45|2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|delivered|2018-08-08 08:38:49|2018-08-08 08:55:23|2018-08-08 13:50:00|2018-08-17 18:06:29|2018-09-04 00:00:00|
|949d5b44dbf5de918...|f88197465ea7920ad...|delivered|2017-11-18 19:28:06|201

In [7]:
df_silver = (
    df_clean
    .select(
        F.col("col0").alias("order_id"),
        F.col("col1").alias("customer_id"),
        F.col("col2").alias("order_status"),
        F.to_timestamp("col3").alias("order_purchase_timestamp"),
        F.to_timestamp("col4").alias("order_approved_at"),
        F.to_timestamp("col5").alias("order_delivered_carrier_date"),
        F.to_timestamp("col6").alias("order_delivered_customer_date"),
        F.to_timestamp("col7").alias("order_estimated_delivery_date"),
    )
    .withColumn("order_status", F.lower("order_status"))
    .filter(F.col("order_id").isNotNull())
)
df_silver.show(5)
df_silver.printSchema()

+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|            order_id|         customer_id|order_status|order_purchase_timestamp|  order_approved_at|order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|
+--------------------+--------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+
|e481f51cbdc54678b...|9ef432eb625129730...|   delivered|     2017-10-02 10:56:33|2017-10-02 11:07:15|         2017-10-04 19:55:00|          2017-10-10 21:25:13|          2017-10-18 00:00:00|
|53cdb2fc8bc7dce0b...|b0830fb4747a6c6d2...|   delivered|     2018-07-24 20:41:37|2018-07-26 03:24:27|         2018-07-26 14:31:00|          2018-08-07 15:27:45|          2018-08-13 00:00:00|
|47770eb9100c2d0c4...|41ce2a54c0b03bf34...|  

In [9]:
df_silver = df_silver.withColumn("audit_timestamp", F.current_timestamp())


In [10]:
df_silver.show(20, truncate=False)
df_silver.printSchema()


+--------------------------------+--------------------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+--------------------------+
|order_id                        |customer_id                     |order_status|order_purchase_timestamp|order_approved_at  |order_delivered_carrier_date|order_delivered_customer_date|order_estimated_delivery_date|audit_timestamp           |
+--------------------------------+--------------------------------+------------+------------------------+-------------------+----------------------------+-----------------------------+-----------------------------+--------------------------+
|e481f51cbdc54678b7cc49136f2d6af7|9ef432eb6251297304e76186b10a928d|delivered   |2017-10-02 10:56:33     |2017-10-02 11:07:15|2017-10-04 19:55:00         |2017-10-10 21:25:13          |2017-10-18 00:00:00          |2025-12-12 09:38:07.791117|
|53cdb2fc8bc7dce0b6741e215027345