DATA ANALYSIS GEOLOCATION BRONZE --> SILVER

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = (
    SparkSession.builder
    .appName("S3Test")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)



In [2]:
df = spark.read.parquet("s3a://pedro-datalake-project/bronze/geolocation/")
df.show(5)
df.printSchema()
df.count()

+---------------------------+-------------------+------------------+----------------+-----------------+
|geolocation_zip_code_prefix|    geolocation_lat|   geolocation_lng|geolocation_city|geolocation_state|
+---------------------------+-------------------+------------------+----------------+-----------------+
|                       1037| -23.54562128115268|-46.63929204800168|       sao paulo|               SP|
|                       1046|-23.546081127035535|-46.64482029837157|       sao paulo|               SP|
|                       1046| -23.54612896641469|-46.64295148361138|       sao paulo|               SP|
|                       1041|  -23.5443921648681|-46.63949930627844|       sao paulo|               SP|
|                       1035|-23.541577961711493|-46.64160722329613|       sao paulo|               SP|
+---------------------------+-------------------+------------------+----------------+-----------------+
only showing top 5 rows

root
 |-- geolocation_zip_code_prefix: 

1000163

In [4]:
df.select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in df.columns]).show(vertical=True)

-RECORD 0--------------------------
 geolocation_zip_code_prefix | 0   
 geolocation_lat             | 0   
 geolocation_lng             | 0   
 geolocation_city            | 0   
 geolocation_state           | 0   



In [5]:
df.groupBy("geolocation_zip_code_prefix").count().orderBy(F.col("count").desc()).show(20)


+---------------------------+-----+
|geolocation_zip_code_prefix|count|
+---------------------------+-----+
|                      24220| 1146|
|                      24230| 1102|
|                      38400|  965|
|                      35500|  907|
|                      11680|  879|
|                      22631|  832|
|                      30140|  810|
|                      11740|  788|
|                      38408|  773|
|                      28970|  743|
|                      36400|  733|
|                      39400|  724|
|                      37701|  714|
|                      35162|  713|
|                      35900|  709|
|                      37200|  696|
|                      88330|  694|
|                      22790|  687|
|                      35700|  678|
|                      36570|  667|
+---------------------------+-----+
only showing top 20 rows



In [8]:
df.groupBy("geolocation_city").count().orderBy(F.col("count").desc()).show(20, truncate=False)


+---------------------+------+
|geolocation_city     |count |
+---------------------+------+
|sao paulo            |135800|
|rio de janeiro       |62151 |
|belo horizonte       |27805 |
|são paulo            |24918 |
|curitiba             |16593 |
|porto alegre         |13521 |
|salvador             |11865 |
|guarulhos            |11340 |
|brasilia             |10470 |
|sao bernardo do campo|8112  |
|osasco               |7658  |
|santo andre          |6863  |
|niteroi              |6534  |
|recife               |6168  |
|goiania              |5661  |
|fortaleza            |5538  |
|campinas             |5479  |
|sorocaba             |5361  |
|santos               |5000  |
|barueri              |4971  |
+---------------------+------+
only showing top 20 rows



In [9]:
df.select("geolocation_state").distinct().show()


+-----------------+
|geolocation_state|
+-----------------+
|               SC|
|               RO|
|               PI|
|               AM|
|               RR|
|               GO|
|               TO|
|               MT|
|               SP|
|               ES|
|               PB|
|               RS|
|               MS|
|               AL|
|               MG|
|               PA|
|               BA|
|               SE|
|               PE|
|               CE|
+-----------------+
only showing top 20 rows



In [10]:
df_silver = df.withColumn(
    "geolocation_city",
    F.lower(F.trim(F.col("geolocation_city")))
)


In [11]:
df_silver.select("geolocation_city").show(20, truncate=False)


+----------------+
|geolocation_city|
+----------------+
|sao paulo       |
|sao paulo       |
|sao paulo       |
|sao paulo       |
|sao paulo       |
|são paulo       |
|sao paulo       |
|sao paulo       |
|sao paulo       |
|sao paulo       |
|sao paulo       |
|sao paulo       |
|sao paulo       |
|sao paulo       |
|são paulo       |
|sao paulo       |
|sao paulo       |
|são paulo       |
|sao paulo       |
|sao paulo       |
+----------------+
only showing top 20 rows



In [12]:
replacements = {
    "á": "a", "à": "a", "ã": "a", "â": "a",
    "é": "e", "ê": "e",
    "í": "i",
    "ó": "o", "ô": "o", "õ": "o",
    "ú": "u",
    "ç": "c"
}

for accented, plain in replacements.items():
    df_silver = df_silver.withColumn(
        "geolocation_city",
        F.regexp_replace("geolocation_city", accented, plain)
    )


In [13]:
df_silver.select("geolocation_city").distinct().show(50, truncate=False)


+-----------------------+
|geolocation_city       |
+-----------------------+
|redencao da serra      |
|aguas de sao pedro     |
|iepe                   |
|bacaxa                 |
|divino de sao lourenco |
|divino das laranjeiras |
|vermelho               |
|cacaratiba             |
|pote                   |
|camacari               |
|itaberaba              |
|gloria                 |
|arapiraca              |
|jijoca de jericoacoara |
|jangada                |
|barracao               |
|igrejinha              |
|dilermando de aguiar   |
|itanhaem               |
|cachoeira paulista     |
|buritama               |
|uchoa                  |
|guaranta               |
|sagres                 |
|martinopolis           |
|sao joao da barra      |
|perdigao               |
|leandro ferreira       |
|astolfo dutra          |
|caparao                |
|fama                   |
|itumirim               |
|sao romao              |
|coracao de jesus       |
|sao jose da safira     |
|cachoeira  

In [14]:
df_silver = df_silver.withColumn(
    "geolocation_state",
    F.upper(F.col("geolocation_state"))
)


In [15]:
df_silver = df_silver.withColumn(
    "geolocation_zip_code_prefix",
    F.format_string("%05d", F.col("geolocation_zip_code_prefix"))
)


In [16]:
df_silver = df_silver.drop("geolocation_lat", "geolocation_lng")


In [17]:
df_silver.count()
df_silver.show(20, truncate=False)


+---------------------------+----------------+-----------------+
|geolocation_zip_code_prefix|geolocation_city|geolocation_state|
+---------------------------+----------------+-----------------+
|01037                      |sao paulo       |SP               |
|01046                      |sao paulo       |SP               |
|01046                      |sao paulo       |SP               |
|01041                      |sao paulo       |SP               |
|01035                      |sao paulo       |SP               |
|01012                      |sao paulo       |SP               |
|01047                      |sao paulo       |SP               |
|01013                      |sao paulo       |SP               |
|01029                      |sao paulo       |SP               |
|01011                      |sao paulo       |SP               |
|01013                      |sao paulo       |SP               |
|01032                      |sao paulo       |SP               |
|01014                   

In [18]:
df_silver = df_silver.withColumn("audit_timestamp", F.current_timestamp())


In [19]:
df_silver.printSchema()
df_silver.show(20, truncate=False)


root
 |-- geolocation_zip_code_prefix: string (nullable = false)
 |-- geolocation_city: string (nullable = true)
 |-- geolocation_state: string (nullable = true)
 |-- audit_timestamp: timestamp (nullable = false)

+---------------------------+----------------+-----------------+--------------------------+
|geolocation_zip_code_prefix|geolocation_city|geolocation_state|audit_timestamp           |
+---------------------------+----------------+-----------------+--------------------------+
|01037                      |sao paulo       |SP               |2025-12-12 09:25:05.772249|
|01046                      |sao paulo       |SP               |2025-12-12 09:25:05.772249|
|01046                      |sao paulo       |SP               |2025-12-12 09:25:05.772249|
|01041                      |sao paulo       |SP               |2025-12-12 09:25:05.772249|
|01035                      |sao paulo       |SP               |2025-12-12 09:25:05.772249|
|01012                      |sao paulo       |SP  