## Residential properties for sale in Sao Paulo

In [1]:
# Import library
import pandas as pd

# Read in dataset
properties = pd.read_csv('datasets/Sao_Paulo.csv')

# Drop duplicates from properties
properties.drop_duplicates()

# Print the total number of residential properties
print('\nTotal number of properties announced in Sao Paulo dataset = ', len(properties.index))

# Have a look at a random sample of 10 rows
properties.sample(10)


Total number of properties announced in Sao Paulo dataset =  10008


Unnamed: 0,Rua,Bairro,Cidade,Metragem,Quartos,Banheiros,Vagas,Valor
6797,,Cidade Monções,São Paulo,150,3,4,2,R$ 1.990.000
4120,,Santo Amaro,São Paulo,180,3,5,3,R$ 1.100.000
3550,Rua Ubaíra,Indianópolis,São Paulo,200,3,4,4,R$ 1.990.000
6679,Rua Carlos Petit,Vila Mariana,São Paulo,295,3,1,3,R$ 2.500.000
7382,,Cidade Jardim,São Paulo,605,4,4,4,R$ 15.000\n /Mês
291,Rua General Vitorino Monteiro,Vila Romana,São Paulo,450,5,5,4,R$ 2.100.000
9214,"Rua Pirajuia, 0",Jaguaré,São Paulo,230,4,4,4,R$ 700.000
3726,Rua Rolf Laube,Cambuci,São Paulo,170,4,4,2,R$ 900.000
6544,Rua Antônio de Macedo Soares,Campo Belo,São Paulo,380,3,4,5,R$ 2.819.000
3674,,Jardim Europa,São Paulo,705,4,4,5,R$ 9.500.000


### Data cleaning
By looking at a random sample of the dataset rows (from the above task), we observe that some entries in the columns like 'Rua' (address) and 'Bairro' (ward) may have null or inconsistent values (address with/without number), and that may impact the geographic generalization of the results. Also, the column 'Valor' (price) has special characters ('$', '.' , '/'), that may hinder future mathematical calculations.
Hence, the first step is to drop the rows without an address, and edit data into a consistent format, removing house numbers for identity protection. Then, we should confirm if any of the ward missing values can be filled based on entries with the same address values. Finally, we should remove the special characters in the 'Value' column.

In [2]:
# Keep rows with non-null addresses
prop_with_address = properties[properties['Rua'].notna()]

# Edit addresses into a consistant format
prop_with_address['Rua'] = prop_with_address['Rua'].str.replace('\d+', '').str.replace(',', '').str.strip()

# Lookup addresses with ward missing values
prop_ward_missing = prop_with_address[prop_with_address['Bairro'].isna()]

# List of addresses with ward missing values
addresses = prop_ward_missing['Rua'].tolist()

# Lookup if missing values appear somewhere in the dataset
same_address = prop_with_address[prop_with_address['Rua'].isin(addresses)]

# Dictionary of missing values
same_address = same_address[same_address['Bairro'].notna()]
same_address = same_address.sort_values(['Rua', 'Bairro'])
full_address = dict(zip(same_address.Rua, same_address.Bairro))
full_address['Rua Professor Lúcio Martins Rodrigues'] = 'Morumbi'   # As we could not find other properties at this address, we lookedup in google maps and added this entry manually
print(full_address)

# Fill in ward missing values based on dictionary references
prop_with_address['Bairro'] = prop_with_address['Bairro'].fillna(prop_with_address['Rua'].apply(lambda x: full_address.get(x)))

# Split column 'Valor'
prop_with_address[['Moeda', 'Valor_Anuncio', 'Tipo_Anuncio']] = prop_with_address['Valor'].str.split(expand=True)

# Filter properties for sale - rentals contain values 'per month/per year' in the column 'Tipo_Anuncio', therefore we will only keep null entries.
sale_properties = prop_with_address[prop_with_address['Tipo_Anuncio'].isna()]

# Convert 'Prices' format from string to float
sale_properties["Valor_BRL"] = sale_properties["Valor_Anuncio"].str.replace(".","").astype(float)

# Config display to suppress decimal cases and scientific notation of floats
pd.set_option("display.precision", 2)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Keep columns relevants to the project
sale_properties = sale_properties.drop(['Valor', 'Moeda', 'Valor_Anuncio', 'Tipo_Anuncio'], axis=1)

# Print a summary of the properties dataframe
sale_properties.info()

{'Avenida Comendador Adibo Ares': 'Morumbi', 'Rua Alvorada do Sul': 'Jardim Guedala', 'Rua Madalena de Morais': 'Jardim Leonor', 'Rua Pacobá': 'Jardim Panorama', 'Rua Professor Eduardo Monteiro': 'Jardim Leonor', 'Rua Santo Eufredo': 'Jardim Guedala', 'Rua Vergueiro': 'Vila Firmiano Pinto', 'Rua Professor Lúcio Martins Rodrigues': 'Morumbi'}
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6107 entries, 0 to 10007
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Rua        6107 non-null   object 
 1   Bairro     6107 non-null   object 
 2   Cidade     6107 non-null   object 
 3   Metragem   6107 non-null   int64  
 4   Quartos    6107 non-null   int64  
 5   Banheiros  6107 non-null   int64  
 6   Vagas      6107 non-null   int64  
 7   Valor_BRL  6107 non-null   float64
dtypes: float64(1), int64(4), object(3)
memory usage: 429.4+ KB


  prop_with_address['Rua'] = prop_with_address['Rua'].str.replace('\d+', '').str.replace(',', '').str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prop_with_address['Rua'] = prop_with_address['Rua'].str.replace('\d+', '').str.replace(',', '').str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prop_with_address['Bairro'] = prop_with_address['Bairro'].fillna(prop_with_address['Rua'].apply(lambda x: full_address.get(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead



### postcodes df


In [None]:
# Have a look at a random sample of 10 rows
print(postcodes.sample(10))

# Filter postcodes of Sao Paulo
postcodes_sp = postcodes.query("nome_localidade == 'São Paulo'")

# Drop duplicates
postcodes_sp.drop_duplicates()

# Format addresses to enable join
postcodes_sp['rua'] = postcodes_sp['tipo_logr'] + ' ' + postcodes_sp['logr_nome']
postcodes_sp['rua'] = postcodes_sp['rua'].str.title().str.strip()
postcodes_sp.head()

# Keep columns relevant to the project
postcodes_sp = postcodes_sp.drop(['Unnamed: 0', 'tipo_logr', 'logr_nome', 'logr_compl', 'numero', 'nome_localidade', 'sigla_uf'], axis=1)

# Rename columns to improve data legibility
postcodes_sp.columns = ['Bairro', 'CEP', 'Latitude', 'Longitude', 'Rua']
postcodes_sp.sample(10)

       Unnamed: 0 tipo_logr                    logr_nome  \
44608      595181     Praça      Vidal Antônio de Castro   
20782      571355       Rua            Francisco Tapajós   
2380       545393    Acesso            Andorinha Cristal   
18708      569281       Rua               Espírito Santo   
42866      593439   Avenida                   Taquandava   
36144      586717       Rua     Paulino Pacheco de Mello   
40125      590698       Rua                  Ruy Camargo   
26904      577477       Rua          José Barros Magaldi   
13566      564139       Rua          Conceição dos Ouros   
17957      568530       Rua  Elizabeth Constantino Horii   

                                          logr_compl numero  \
44608                  Praça Vidal Antônio de Castro    NaN   
20782            Rua Francisco Tapajós - até 449/450    NaN   
2380                        Acesso Andorinha Cristal    NaN   
18708                             Rua Espírito Santo    NaN   
42866                   

Unnamed: 0.1,Unnamed: 0,tipo_logr,logr_nome,logr_compl,numero,bairro,nome_localidade,sigla_uf,cep,latitude,longitude,rua
0,543013,Rua,George Dantu,Rua George Dantu,,Chácara Maria Trindade,São Paulo,SP,5275051.0,-23.42,-46.81,Rua George Dantu
1,543014,Rua,Maria Augusta Fiske,Rua Maria Augusta Fiske,,Chácara Maria Trindade,São Paulo,SP,5275055.0,-23.42,-46.81,Rua Maria Augusta Fiske
2,543015,Rua,Doutor Nilo Cairo,Rua Doutor Nilo Cairo,,Chácara Maria Trindade,São Paulo,SP,5275060.0,-23.41,-46.82,Rua Doutor Nilo Cairo
3,543016,Rua,Leonel Martiniano,Rua Leonel Martiniano,,Chácara Maria Trindade,São Paulo,SP,5275065.0,-23.42,-46.82,Rua Leonel Martiniano
4,543017,Rua,André Polak,Rua André Polak,,Chácara Maria Trindade,São Paulo,SP,5275070.0,-23.42,-46.81,Rua André Polak


### demographics df

In [None]:
# Have a look at a random sample of 10 rows
print(demographics.sample(10))

# Drop invalid entry column
demographics.dropna(how='all', axis=1, inplace=True)

# Rename columns to improve data legibility
demographics.columns = ['CD_SETOR', 'DISTRITO', 'DOMICILIOS', 'MORADORES_SETOR', 'MORADORES_DOMICILIO', 'RENDA_MENSAL']

demographics.sample(10)

demographics.to_csv('datasets/demographics.csv')

             Cod_setor   Nome_do_distrito   V001   V002  V003    V005
7647   355030838000397          JABAQUARA  74.00 271.00  3.66  696.18
11933  355030863000083           PIRITUBA 291.00 908.00  3.12 1590.11
2359   355030817000313        CAMPO LIMPO 134.00 437.00  3.26  683.18
2370   355030817000324        CAMPO LIMPO 116.00 353.00  3.04  898.02
7339   355030838000085          JABAQUARA 214.00 661.00  3.09 1356.54
13996  355030873000069         SÃO MATEUS 203.00 632.00  3.11 1202.81
16069  355030883000219       VILA ANDRADE 154.00 578.00  3.75  420.45
7011   355030837000039           ITAQUERA 182.00 619.00  3.40  950.47
15259  355030880000059            TATUAPÉ 321.00 919.00  2.86 2326.32
4061   355030825000023  CIDADE TIRADENTES 194.00 663.00  3.42  443.98


Unnamed: 0,CD_SETOR,DISTRITO,DOMICILIOS,MORADORES_SETOR,MORADORES_DOMICILIO,RENDA_MENSAL
8820,355030844000219,JARDIM HELENA,260.0,941.0,3.62,612.28
8041,355030842000096,JARAGUÁ,205.0,749.0,3.65,782.95
15981,355030883000129,VILA ANDRADE,88.0,193.0,2.19,4516.99
9438,355030847000019,JOSÉ BONIFÁCIO,222.0,645.0,2.91,1287.45
13880,355030872000142,SÃO LUCAS,194.0,635.0,3.27,1378.7
3769,355030823000149,CIDADE DUTRA,265.0,888.0,3.35,1688.74
4901,355030829000028,FREGUESIA DO Ó,278.0,907.0,3.26,1518.6
3004,355030820000004,CARRÃO,311.0,953.0,3.06,1352.26
10566,355030855000077,PARELHEIROS,167.0,591.0,3.54,1047.62
8182,355030842000273,JARAGUÁ,514.0,2136.0,4.16,530.94


### geoid df

In [None]:
# Merge properties and postcodes
property_codes = sale_properties.merge(postcodes_sp, on=['Rua', 'Bairro'])

# Drop duplicates from properties
property_codes.drop_duplicates()

# Merged dataset info
property_codes.info()

# Have a look at a random sample of 10 rows
print(property_codes.sample(10))

# Save merged dataframe as csv
property_codes.to_csv('./datasets/property_addresses.csv')

# Merge demographics and geoids
geo_stats = geoid_sp.merge(demographics, on = 'CD_SETOR')

# Merged dataset info
geo_stats.info()

# Have a look at a random sample of 10 rows
print(geo_stats.sample(10))

# Save merged dataframe as csv
geo_stats.to_csv('./datasets/geo_stats.csv')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6627 entries, 0 to 6626
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Rua        6627 non-null   object 
 1   Bairro     6627 non-null   object 
 2   Cidade     6627 non-null   object 
 3   Metragem   6627 non-null   int64  
 4   Quartos    6627 non-null   int64  
 5   Banheiros  6627 non-null   int64  
 6   Vagas      6627 non-null   int64  
 7   Valor_BRL  6627 non-null   float64
 8   CEP        6627 non-null   float64
 9   Latitude   6627 non-null   float64
 10  Longitude  6627 non-null   float64
dtypes: float64(4), int64(4), object(3)
memory usage: 621.3+ KB
                                  Rua               Bairro     Cidade  \
4302           Rua Giovanni Carnovali       Vila Caraguatá  São Paulo   
489               Rua Cristiano Viana      Cerqueira César  São Paulo   
6095  Rua Doutor Gentil Leite Martins  Vila Nova Caledônia  São Paulo   
4615             