## Postcode Data Validation

In [56]:
# Import libraries
import re
import pandas as pd

In [57]:
# Load the raw São Paulo address extract into a DataFrame
postcodes = pd.read_csv(filepath_or_buffer='datasets/sp_addresses.csv')

In [58]:
# Check original data shape, types and missing entries:
# prints the row count plus each column's dtype and non-null count
postcodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57049 entries, 0 to 57048
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       57049 non-null  int64  
 1   tipo_logr        57049 non-null  object 
 2   logr_nome        57049 non-null  object 
 3   logr_compl       57049 non-null  object 
 4   numero           41 non-null     object 
 5   bairro           57049 non-null  object 
 6   nome_localidade  57049 non-null  object 
 7   sigla_uf         57049 non-null  object 
 8   cep              57049 non-null  float64
 9   latitude         57002 non-null  float64
 10  longitude        57002 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 4.8+ MB


In [59]:
# Preview the first rows of the dataset to sanity-check how the columns were parsed
postcodes.head()

Unnamed: 0.1,Unnamed: 0,tipo_logr,logr_nome,logr_compl,numero,bairro,nome_localidade,sigla_uf,cep,latitude,longitude
0,543013,Rua,George Dantu,Rua George Dantu,,Chácara Maria Trindade,São Paulo,SP,5275051.0,-23.422697,-46.812013
1,543014,Rua,Maria Augusta Fiske,Rua Maria Augusta Fiske,,Chácara Maria Trindade,São Paulo,SP,5275055.0,-23.424251,-46.812687
2,543015,Rua,Doutor Nilo Cairo,Rua Doutor Nilo Cairo,,Chácara Maria Trindade,São Paulo,SP,5275060.0,-23.411648,-46.815519
3,543016,Rua,Leonel Martiniano,Rua Leonel Martiniano,,Chácara Maria Trindade,São Paulo,SP,5275065.0,-23.422471,-46.823359
4,543017,Rua,André Polak,Rua André Polak,,Chácara Maria Trindade,São Paulo,SP,5275070.0,-23.422164,-46.807962


In [60]:
# Build the 'rua' column: street type ('tipo_logr') and street name ('logr_nome')
# joined by a space, then title-cased and trimmed
postcodes['rua'] = (
    (postcodes['tipo_logr'] + ' ' + postcodes['logr_nome'])
    .str.title()
    .str.strip()
)

In [61]:
# Create function to remove house numbers to prevent home identification and preserve private information
def cleanup_address_number(address):
    """Strip digits and commas from a street address and normalise its spacing.

    Every run of ASCII digits is removed (not only a trailing house number), so
    numerals that belong to a street name are dropped as well. Whitespace left
    behind by a removed number is collapsed to a single space, so an input such
    as 'Rua 25 De Março' does not end up with a double space.

    Parameters
    ----------
    address : str
        Address text, e.g. 'Rua Belmiro Valverde, 218'.

    Returns
    -------
    str
        The address without digits or commas, single-spaced and trimmed.
        Non-string values (such as NaN or None for a missing entry) are
        returned unchanged instead of raising a TypeError.
    """
    if not isinstance(address, str):
        # Missing entries are not strings; pass them through untouched.
        return address
    without_numbers = re.sub(r'[0-9]+', '', address).replace(',', '')
    # split()/join collapses the gap left by a removed number and trims both ends.
    return ' '.join(without_numbers.split())
     
# Run the house-number cleanup over every value of the 'rua' column
postcodes['rua'] = postcodes['rua'].map(cleanup_address_number)

# Display the frame to check the cleaned addresses
postcodes

Unnamed: 0.1,Unnamed: 0,tipo_logr,logr_nome,logr_compl,numero,bairro,nome_localidade,sigla_uf,cep,latitude,longitude,rua
0,543013,Rua,George Dantu,Rua George Dantu,,Chácara Maria Trindade,São Paulo,SP,5275051.0,-23.422697,-46.812013,Rua George Dantu
1,543014,Rua,Maria Augusta Fiske,Rua Maria Augusta Fiske,,Chácara Maria Trindade,São Paulo,SP,5275055.0,-23.424251,-46.812687,Rua Maria Augusta Fiske
2,543015,Rua,Doutor Nilo Cairo,Rua Doutor Nilo Cairo,,Chácara Maria Trindade,São Paulo,SP,5275060.0,-23.411648,-46.815519,Rua Doutor Nilo Cairo
3,543016,Rua,Leonel Martiniano,Rua Leonel Martiniano,,Chácara Maria Trindade,São Paulo,SP,5275065.0,-23.422471,-46.823359,Rua Leonel Martiniano
4,543017,Rua,André Polak,Rua André Polak,,Chácara Maria Trindade,São Paulo,SP,5275070.0,-23.422164,-46.807962,Rua André Polak
...,...,...,...,...,...,...,...,...,...,...,...,...
57044,1104300,Rua,"Belmiro Valverde, 218","Rua Belmiro Valverde, 218",,Lajeado,São Paulo,SP,8450959.0,-23.541529,-46.410636,Rua Belmiro Valverde
57045,1104333,Travessa,Marlucia Augusta fagundes,Travessa Marlucia Augusta fagundes,,Jardim São Paulo(Zona Leste),São Paulo,SP,8461375.0,-23.560021,-46.405634,Travessa Marlucia Augusta Fagundes
57046,1104335,Rua,Flor Bonita,Rua Flor Bonita,,Cidade Popular,São Paulo,SP,8461137.0,-23.559476,-46.588920,Rua Flor Bonita
57047,1104338,Rua,Dona Faustina Leonardo Januário,Rua Dona Faustina Leonardo Januário,,Conjunto Habitacional Juscelino Kubitschek,São Paulo,SP,8465055.0,-23.555771,-46.639557,Rua Dona Faustina Leonardo Januário


In [62]:
# Validate column 'nome_localidade': count occurrences of each distinct value
# to see how many different localities the dataset contains
postcodes['nome_localidade'].value_counts()

São Paulo    57049
Name: nome_localidade, dtype: int64

In [63]:
# Keep relevant columns (explicit copy so the assignment below works on an independent frame)
postcodes = postcodes[['rua', 'bairro', 'cep', 'latitude', 'longitude']].copy()

# 'cep' is parsed as float64, which drops the leading zero and renders as e.g. 5275051.0;
# a CEP is an 8-digit identifier, so store it as a zero-padded string instead of a number
postcodes['cep'] = postcodes['cep'].astype('int64').astype(str).str.zfill(8)

In [65]:
# Rename columns to English by name rather than by position, so a change in the
# column selection order cannot silently mislabel the data.
# 'latitude' and 'longitude' already have their final names.
postcodes = postcodes.rename(columns={
    'rua': 'address',
    'bairro': 'ward',
    'cep': 'postcode',
})

In [66]:
# Save validated dataset to csv; index=False keeps the row index out of the file,
# which would otherwise come back as an 'Unnamed: 0' column when the CSV is re-read
postcodes.to_csv('datasets/sp_postcodes.csv', index=False)