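Validation of the gold-layer dim_geolocation table: schema, key uniqueness, null counts, state distribution, and created_at range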

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Hadoop S3A settings applied to the session so that "s3a://" paths can be read:
#   - fs.s3a.impl: the filesystem implementation class used for the s3a scheme
#   - aws.credentials.provider: resolve credentials through the AWS default
#     provider chain (nothing is hardcoded in this notebook)
#   - path.style.access: enable path-style bucket addressing
s3a_options = {
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    "spark.hadoop.fs.s3a.path.style.access": "true",
}

# Start from a named builder and apply each option in declaration order.
session_builder = SparkSession.builder.appName("S3Test")
for option_name, option_value in s3a_options.items():
    session_builder = session_builder.config(option_name, option_value)

# Reuses an already-running session if one exists, otherwise creates it.
spark = session_builder.getOrCreate()



In [2]:
# Location of the gold-layer geolocation dimension (Parquet files on S3).
DIM_GEOLOCATION_PATH = "s3a://pedro-datalake-project/gold/dim_geolocation/"

# Load the dimension; `df` is the frame every later validation cell works on.
df = spark.read.parquet(DIM_GEOLOCATION_PATH)

# First look: column names/types, then the first 10 rows with values
# printed in full (no truncation of long cells).
df.printSchema()
df.show(n=10, truncate=False)


root
 |-- geolocation_zip_code_prefix: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- created_at: timestamp (nullable = true)

+---------------------------+---------+-----+-------------------------+
|geolocation_zip_code_prefix|city     |state|created_at               |
+---------------------------+---------+-----+-------------------------+
|01001                      |sao paulo|SP   |2025-12-09 11:08:36.29614|
|01002                      |sao paulo|SP   |2025-12-09 11:08:36.29614|
|01003                      |sao paulo|SP   |2025-12-09 11:08:36.29614|
|01004                      |sao paulo|SP   |2025-12-09 11:08:36.29614|
|01005                      |sao paulo|SP   |2025-12-09 11:08:36.29614|
|01006                      |sao paulo|SP   |2025-12-09 11:08:36.29614|
|01007                      |sao paulo|SP   |2025-12-09 11:08:36.29614|
|01008                      |sao paulo|SP   |2025-12-09 11:08:36.29614|
|01009                   

In [3]:
# Key-uniqueness check: the number of distinct zip code prefixes must equal
# the number of rows, otherwise the dimension contains duplicate keys.
distinct_zip_prefixes = df.select("geolocation_zip_code_prefix").distinct().count()
total_rows = df.count()

# Enforce the invariant instead of relying on a visual comparison of the
# printed tuple, so a full re-run fails loudly when duplicates appear.
assert distinct_zip_prefixes == total_rows, (
    f"dim_geolocation key is not unique: {distinct_zip_prefixes} distinct "
    f"zip code prefixes vs {total_rows} rows"
)

# Cell output, unchanged: (distinct key count, total row count).
distinct_zip_prefixes, total_rows


(19015, 19015)

In [4]:
# Columns whose NULL counts are inspected.
checked_columns = ["geolocation_zip_code_prefix", "city", "state"]

# One aggregate per column: `when` produces a non-null value only for rows
# where the column is NULL, and `count` counts non-null values, so each
# result is the number of NULLs in that column (reported under its own name).
null_count_exprs = [
    F.count(F.when(F.col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in checked_columns
]

# Single-row summary: NULL count per checked column.
df.select(null_count_exprs).show()


+---------------------------+----+-----+
|geolocation_zip_code_prefix|city|state|
+---------------------------+----+-----+
|                          0|   0|    0|
+---------------------------+----+-----+



In [5]:
# Row count per state, largest first.
rows_per_state = df.groupBy("state").count()
rows_per_state_desc = rows_per_state.orderBy(F.col("count").desc())

# show() with default arguments prints only the first 20 rows.
rows_per_state_desc.show()


+-----+-----+
|state|count|
+-----+-----+
|   SP| 6349|
|   MG| 1868|
|   RJ| 1390|
|   RS| 1131|
|   PR| 1046|
|   BA|  992|
|   GO|  773|
|   SC|  619|
|   PE|  596|
|   CE|  548|
|   DF|  515|
|   PB|  324|
|   ES|  315|
|   MA|  313|
|   PA|  309|
|   PI|  307|
|   RN|  279|
|   MT|  254|
|   MS|  242|
|   TO|  184|
+-----+-----+
only showing top 20 rows



In [6]:
# Earliest and latest created_at values across the whole table, as one row.
earliest_created_at = F.min("created_at").alias("min_created_at")
latest_created_at = F.max("created_at").alias("max_created_at")

# Global aggregation (no grouping keys) over the full DataFrame.
df.agg(earliest_created_at, latest_created_at).show()


+--------------------+--------------------+
|      min_created_at|      max_created_at|
+--------------------+--------------------+
|2025-12-09 11:08:...|2025-12-09 11:08:...|
+--------------------+--------------------+

