CREATE DIM CUSTOMER: build the customer dimension (dim_customer) from the silver customers and orders tables

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Spark session used by every cell below. S3 is reached through the s3a
# connector with path-style access enabled; the credentials provider is set to
# the AWS default provider chain, so no keys are written in the notebook.
s3a_options = {
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    "spark.hadoop.fs.s3a.path.style.access": "true",
}

session_builder = SparkSession.builder.appName("S3Test")
for option_name, option_value in s3a_options.items():
    session_builder = session_builder.config(option_name, option_value)

# getOrCreate() reuses an already-running session if the kernel has one.
spark = session_builder.getOrCreate()



In [7]:
# Silver-layer inputs, both read as Parquet from the project data lake bucket.
customers_path = "s3a://pedro-datalake-project/silver/customers/"
orders_path = "s3a://pedro-datalake-project/silver/orders/"

silver_customer = spark.read.parquet(customers_path)
silver_orders = spark.read.parquet(orders_path)


In [8]:
# One row per customer_id: earliest purchase, latest purchase, and the number
# of order rows found for that customer.
per_customer_aggregates = [
    F.min("order_purchase_timestamp").alias("first_order_date"),
    F.max("order_purchase_timestamp").alias("last_order_date"),
    F.count("*").alias("total_orders"),
]

orders_by_customer = silver_orders.groupBy("customer_id")
order_metrics = orders_by_customer.agg(*per_customer_aggregates)


In [9]:
# Reference date for recency: the latest purchase timestamp present in the
# orders data (not the wall-clock date). This runs a Spark job and brings the
# single resulting value back to the driver.
latest_purchase_row = silver_orders.select(
    F.max("order_purchase_timestamp")
).first()
max_date = latest_purchase_row[0]

# Whole days between each customer's last order and the reference date.
days_since_last_order = F.datediff(F.lit(max_date), F.col("last_order_date"))
order_metrics = order_metrics.withColumn("recency_days", days_since_last_order)


In [10]:
# Surrogate key: consecutive integers (1, 2, 3, ...) assigned in customer_id
# order. The window has an ORDER BY but no PARTITION BY, so Spark gathers all
# rows into a single partition to number them; acceptable for a dimension of
# this size, but it will not scale to very large inputs.
window_spec = Window.orderBy("c.customer_id")

dim_customer = (
    silver_customer.alias("c")
    # Left join so customers without any order are still kept in the dimension.
    .join(
        order_metrics.alias("m"),
        F.col("c.customer_id") == F.col("m.customer_id"),
        "left"
    )
    .select(
        F.row_number().over(window_spec).alias("customer_sk"),
        F.col("c.customer_id"),
        F.col("c.customer_unique_id"),
        F.col("c.customer_zip_code_prefix"),
        F.col("c.customer_city"),
        F.col("c.customer_state"),
        # Date and recency metrics stay NULL for customers with no orders:
        # there is no meaningful value to substitute.
        F.col("m.first_order_date"),
        F.col("m.last_order_date"),
        # The left join yields NULL here for customers with no matching row in
        # order_metrics; a count of "no orders" is 0, not unknown.
        F.coalesce(F.col("m.total_orders"), F.lit(0)).alias("total_orders"),
        F.col("m.recency_days")
    )
)


In [11]:
# Two-letter state codes grouped by region (Brazil's five macro-regions).
states_by_region = {
    "Norte": ["AC", "AM", "RO", "RR", "PA", "AP", "TO"],
    "Nordeste": ["MA", "PI", "CE", "RN", "PB", "PE", "AL", "SE", "BA"],
    "Sudeste": ["SP", "RJ", "MG", "ES"],
    "Sul": ["PR", "SC", "RS"],
    "Centro-Oeste": ["MT", "MS", "GO", "DF"],
}

# Flattened lookup: state code -> region name.
regions = {
    state_code: region_name
    for region_name, state_codes in states_by_region.items()
    for state_code in state_codes
}

# create_map expects alternating key/value columns: key1, value1, key2, value2, ...
map_entries = [
    F.lit(token)
    for state_code, region_name in regions.items()
    for token in (state_code, region_name)
]
region_expr = F.create_map(map_entries)

# Look up each row's customer_state in the literal map to derive its region.
dim_customer = dim_customer.withColumn("region", region_expr[F.col("customer_state")])


In [12]:
# Audit column with the load time. The expression is lazy: it is evaluated
# when an action runs the query, so each action (show, count, a later write)
# stamps its own timestamp rather than the time this cell was executed.
load_timestamp = F.current_timestamp()
dim_customer = dim_customer.withColumn("created_at", load_timestamp)


In [13]:
# Quick inspection of the finished dimension: a 20-row sample with untruncated
# values, the schema, and the total row count as the cell's displayed result.
# Each of show() and count() is a separate action that re-runs the full plan.
dim_customer.show(n=20, truncate=False)
dim_customer.printSchema()
dim_customer.count()


+-----------+--------------------------------+--------------------------------+------------------------+--------------------+--------------+-------------------+-------------------+------------+------------+--------+--------------------------+
|customer_sk|customer_id                     |customer_unique_id              |customer_zip_code_prefix|customer_city       |customer_state|first_order_date   |last_order_date    |total_orders|recency_days|region  |created_at                |
+-----------+--------------------------------+--------------------------------+------------------------+--------------------+--------------+-------------------+-------------------+------------+------------+--------+--------------------------+
|1          |00012a2ce6f8dcda20d059ce98491703|248ffe10d632bebe4f7267f1f44844c9|06273                   |Osasco              |SP            |2017-11-14 16:08:26|2017-11-14 16:08:26|1           |337         |Sudeste |2025-12-12 11:27:28.643222|
|2          |000161a058600d5

99441