VALIDATED DIM CUSTOMER

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Spark session used by every cell below to read the data lake through the
# S3A connector. The options select the S3A filesystem implementation, the
# AWS default credentials provider chain (so no keys are written in the
# notebook), and path-style bucket access.
s3a_options = {
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    "spark.hadoop.fs.s3a.path.style.access": "true",
}

session_builder = SparkSession.builder.appName("S3Test")
for option_key, option_value in s3a_options.items():
    # Each .config() call returns the builder, so the options accumulate.
    session_builder = session_builder.config(option_key, option_value)

spark = session_builder.getOrCreate()



In [2]:
# Load the gold-layer dim_customer Parquet dataset, then print its schema and
# a 10-row untruncated preview as a first sanity check.
dim_customer_path = "s3a://pedro-datalake-project/gold/dim_customer/"
df = spark.read.parquet(dim_customer_path)

df.printSchema()
df.show(n=10, truncate=False)


root
 |-- customer_sk: integer (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_unique_id: string (nullable = true)
 |-- customer_zip_code_prefix: string (nullable = true)
 |-- customer_city: string (nullable = true)
 |-- customer_state: string (nullable = true)
 |-- first_order_date: timestamp (nullable = true)
 |-- last_order_date: timestamp (nullable = true)
 |-- total_orders: long (nullable = true)
 |-- recency_days: integer (nullable = true)
 |-- region: string (nullable = true)
 |-- created_at: timestamp (nullable = true)

+-----------+--------------------------------+--------------------------------+------------------------+--------------+--------------+-------------------+-------------------+------------+------------+-------+--------------------------+
|customer_sk|customer_id                     |customer_unique_id              |customer_zip_code_prefix|customer_city |customer_state|first_order_date   |last_order_date    |total_orders|recency_days|reg

In [3]:
# Surrogate-key uniqueness check: the number of distinct customer_sk values
# must equal the total row count. The assertion makes a Restart & Run All
# fail loudly on duplicate keys instead of relying on someone comparing the
# two numbers by eye.
distinct_sk_count = df.select("customer_sk").distinct().count()
row_count = df.count()

assert distinct_sk_count == row_count, (
    f"customer_sk is not unique: {distinct_sk_count} distinct keys "
    f"for {row_count} rows"
)

# Keep the tuple as the cell's last expression so the displayed result is
# still (distinct keys, total rows).
distinct_sk_count, row_count


(99441, 99441)

In [4]:
# Null audit on the identifying columns: one output column per checked input
# column, holding the number of rows where that column is NULL.
audited_columns = [
    "customer_sk",
    "customer_id",
    "customer_unique_id",
    "customer_state",
]

null_count_exprs = []
for column_name in audited_columns:
    is_missing = F.col(column_name).isNull()
    # when() without otherwise() yields NULL for non-matching rows, and
    # count() skips NULLs, so only the rows with a missing value are counted.
    null_count_exprs.append(
        F.count(F.when(is_missing, column_name)).alias(column_name)
    )

df.select(null_count_exprs).show()


+-----------+-----------+------------------+--------------+
|customer_sk|customer_id|customer_unique_id|customer_state|
+-----------+-----------+------------------+--------------+
|          0|          0|                 0|             0|
+-----------+-----------+------------------+--------------+



In [5]:
# Summary of total_orders: minimum, maximum and mean across the dimension.
# NOTE(review): the recorded output shows min = max = 1, i.e. every row has
# total_orders == 1, so the column currently carries no information.
# Presumably it is aggregated per customer_id rather than per
# customer_unique_id - confirm against the upstream gold job.
order_aggregates = {
    "min_orders": F.min,
    "max_orders": F.max,
    "avg_orders": F.avg,
}

df.select(
    *(
        aggregate("total_orders").alias(alias)
        for alias, aggregate in order_aggregates.items()
    )
).show()


+----------+----------+----------+
|min_orders|max_orders|avg_orders|
+----------+----------+----------+
|         1|         1|       1.0|
+----------+----------+----------+



In [6]:
# Range check on recency_days: show the smallest and largest value present.
recency_bounds = [
    F.min("recency_days").alias("min_recency"),
    F.max("recency_days").alias("max_recency"),
]

df.select(*recency_bounds).show()


+-----------+-----------+
|min_recency|max_recency|
+-----------+-----------+
|          0|        773|
+-----------+-----------+



In [7]:
# Customer count per region, largest region first.
customers_per_region = df.groupBy("region").count()
customers_per_region.orderBy("count", ascending=False).show()


+------------+-----+
|      region|count|
+------------+-----+
|     Sudeste|68266|
|         Sul|14148|
|    Nordeste| 9394|
|Centro-Oeste| 5782|
|       Norte| 1851|
+------------+-----+

