In [1]:
import pandas as pd
from nominatim import coordinates_dataframe

# Importing

In [2]:
# Read data
df = pd.read_csv("apartments_data_scraped.csv")

# Copy for future comparison 
df_original = df.copy()

## Disclaimer on API calls

In [3]:
# Call function to get the coordinates of each street via API
'''
!!! ATTENTION !!!

The nominatim API has a limit rate of one call per second, so the result of the function is saved in a file called apartments_data.csv.

I already did that, so the function will be commented down bellow, if you need to rerun this file, uncomment the line. 

'''
# coordinates_dataframe(df)

'\n!!! ATTENTION !!!\n\nThe nominatim API has a limit rate of one call per second, so the result of the function is saved in a file called apartments_data.csv.\n\nI already did that, so the function will be commented down bellow, if you need to rerun this file, uncomment the line. \n\n'

In [4]:
# Get dataframe with coords
df = pd.read_csv('apartments_data.csv')

# Initial exploration

In [5]:
# First insights on data
print(df.head(10))
df.info()

   Unnamed: 0.2  Unnamed: 0.1  Unnamed: 0                    neighborhood  \
0             0             0           0                   Monte Castelo   
1             1             1           1                   Monte Castelo   
2             2             2           2                        Vila Ema   
3             3             3           3     Parque Residencial Aquarius   
4             4             4           4                   Monte Castelo   
5             5             5           5  Condomínio Recanto da Baronesa   
6             6             6           6                 Jardim Satélite   
7             7             7           7             Jardim Oswaldo Cruz   
8             8             8           8             Jardim Oswaldo Cruz   
9             9             9           9                 Jardim Satélite   

                   city                                 street  \
0   São José dos Campos                Rua Vinte e Um de Abril   
1   São José dos Cam

In [6]:
# Drop unnamed columns
df = df.drop(columns=['Unnamed: 0.1',  'Unnamed: 0', 'Unnamed: 0.2',])
print(df.head())

                  neighborhood                  city  \
0                Monte Castelo   São José dos Campos   
1                Monte Castelo   São José dos Campos   
2                     Vila Ema   São José dos Campos   
3  Parque Residencial Aquarius   São José dos Campos   
4                Monte Castelo   São José dos Campos   

                                  street  \
0                Rua Vinte e Um de Abril   
1              Rua Capitão Raul Fagundes   
2                      Rua Padre Rodolfo   
3  Rua Doutor Jorge de Oliveira Coutinho   
4                         Rua Guararapes   

                                         description     area rooms parking  \
0  LINDO APARTAMENTO\n54 M² COM VARANDA \n-2 QUAR...       54     2       1   
1  O Edifício Grand Castelo foi projetado para qu...  63 - 65     2       1   
2  Apartamento - Jardim Apolo - Residencial Allur...      120     3       2   
3  Apartamento de luxo com 4 suítes à venda, 360m...      360     4       4   
4  

## Getting columns for PCA

There're 06 columns that we expected to be numerical, and so we can use in our PCA analysis:
- Area
- Rooms
- Parking
- Bathrooms
- Lat
- Lon

Only lat and lon are presented as numerical in cel #19. Let's see what's happening in the other columns

In [7]:
# Get the numerical variables for a PCA test
df = df[['area', 'rooms', 'parking', 'bathrooms', 'lat', 'lon']]

## Evaluating Area

In [8]:
# Create a "is_numeric" column for each column that suposed to be numeric but isn't
columns = ['area', 'rooms', 'parking', 'bathrooms']

for column in columns:
    
    column_name = column + '_is_numeric'
    df[column_name] = pd.to_numeric(df[column], errors='coerce')

print(df.head(10))

      area  rooms parking bathrooms        lat        lon  area_is_numeric  \
0       54      2       1         1 -23.141757 -45.782579             54.0   
1  63 - 65      2       1         2 -23.188497 -45.874192              NaN   
2      120      3       2         3 -23.203881 -45.902105            120.0   
3      360      4       4         5        NaN        NaN            360.0   
4       54      2       1         1 -23.183992 -45.876211             54.0   
5       55      2       1         2        NaN        NaN             55.0   
6      165      3       3         1 -23.216123 -45.888546            165.0   
7      104      3       2         3 -23.201250 -45.883484            104.0   
8  56 - 73  2 - 3   1 - 2         2 -23.200832 -45.883038              NaN   
9       88      3       2         2 -23.228164 -45.879398             88.0   

   rooms_is_numeric  parking_is_numeric  bathrooms_is_numeric  
0               2.0                 1.0                   1.0  
1            

In [9]:
# Analyzing area column
non_numeric_area = df[df['area_is_numeric'].isna()]
non_numeric_area = non_numeric_area[['area', 'area_is_numeric']]
number_non_numeric_area = len(non_numeric_area)
print('Number of non-numeric areas: ', number_non_numeric_area)
print(non_numeric_area.head(10))

Number of non-numeric areas:  24
             area  area_is_numeric
1         63 - 65              NaN
8         56 - 73              NaN
16        50 - 51              NaN
23        56 - 73              NaN
31        56 - 73              NaN
46        50 - 51              NaN
96        52 - 53              NaN
114  70 - 71 - 75              NaN
209       56 - 57              NaN
228       77 - 80              NaN


24 apartments have more than one area separated by "-", this means the price should vary between each area. Since we only have the price for one area, and we don't know wich one it represents, we will exclude these apartments from our analysis  


In [10]:
mask = ~df['area_is_numeric'].isna()
df = df[mask]
print(df.head(10))

   area rooms parking bathrooms        lat        lon  area_is_numeric  \
0    54     2       1         1 -23.141757 -45.782579             54.0   
2   120     3       2         3 -23.203881 -45.902105            120.0   
3   360     4       4         5        NaN        NaN            360.0   
4    54     2       1         1 -23.183992 -45.876211             54.0   
5    55     2       1         2        NaN        NaN             55.0   
6   165     3       3         1 -23.216123 -45.888546            165.0   
7   104     3       2         3 -23.201250 -45.883484            104.0   
9    88     3       2         2 -23.228164 -45.879398             88.0   
10   85     2       2         2 -23.184158 -45.954734             85.0   
11   64     2       1         1 -23.172986 -45.894570             64.0   

    rooms_is_numeric  parking_is_numeric  bathrooms_is_numeric  
0                2.0                 1.0                   1.0  
2                3.0                 2.0               

## Evaluating Rooms

In [11]:
# Analyzing rooms column
non_numeric_rooms = df[df['rooms_is_numeric'].isna()]
non_numeric_rooms = non_numeric_rooms[['rooms', 'rooms_is_numeric']]
number_non_numeric_rooms = len(non_numeric_rooms)
print('Number of non-numeric rooms: ', number_non_numeric_rooms)
print(non_numeric_rooms.head(10))

Number of non-numeric rooms:  4
     rooms  rooms_is_numeric
393  2 - 3               NaN
415  2 - 3               NaN
532  2 - 3               NaN
983  1 - 2               NaN


There're 4 rows with rooms that vary and we don't know wich one represents the price of the apartment. Thus, we will also exclude those datas from our analysis

In [12]:
mask = ~df['rooms_is_numeric'].isna()
df = df[mask]
print(df.head(10))

   area rooms parking bathrooms        lat        lon  area_is_numeric  \
0    54     2       1         1 -23.141757 -45.782579             54.0   
2   120     3       2         3 -23.203881 -45.902105            120.0   
3   360     4       4         5        NaN        NaN            360.0   
4    54     2       1         1 -23.183992 -45.876211             54.0   
5    55     2       1         2        NaN        NaN             55.0   
6   165     3       3         1 -23.216123 -45.888546            165.0   
7   104     3       2         3 -23.201250 -45.883484            104.0   
9    88     3       2         2 -23.228164 -45.879398             88.0   
10   85     2       2         2 -23.184158 -45.954734             85.0   
11   64     2       1         1 -23.172986 -45.894570             64.0   

    rooms_is_numeric  parking_is_numeric  bathrooms_is_numeric  
0                2.0                 1.0                   1.0  
2                3.0                 2.0               

## Evaluating Parking

In [13]:
# Analyzing parking column
non_numeric_parking = df[df['parking_is_numeric'].isna()]
non_numeric_parking = non_numeric_parking[['parking', 'parking_is_numeric']]
number_non_numeric_parking = len(non_numeric_parking)
print('Number of non-numeric parking: ', number_non_numeric_parking)
print(non_numeric_parking.head(10))

Number of non-numeric parking:  49
    parking  parking_is_numeric
94    0 - 1                 NaN
115     NaN                 NaN
172   2 - 4                 NaN
222     NaN                 NaN
270     NaN                 NaN
277     NaN                 NaN
294   0 - 2                 NaN
303     NaN                 NaN
328     NaN                 NaN
330     NaN                 NaN


49 apartments either have no information about parking or do not have any information at all. For that, we will also exclude them from our analisys

In [14]:
mask = ~df['parking_is_numeric'].isna()
df = df[mask]
print(df.head(10))

   area rooms parking bathrooms        lat        lon  area_is_numeric  \
0    54     2       1         1 -23.141757 -45.782579             54.0   
2   120     3       2         3 -23.203881 -45.902105            120.0   
3   360     4       4         5        NaN        NaN            360.0   
4    54     2       1         1 -23.183992 -45.876211             54.0   
5    55     2       1         2        NaN        NaN             55.0   
6   165     3       3         1 -23.216123 -45.888546            165.0   
7   104     3       2         3 -23.201250 -45.883484            104.0   
9    88     3       2         2 -23.228164 -45.879398             88.0   
10   85     2       2         2 -23.184158 -45.954734             85.0   
11   64     2       1         1 -23.172986 -45.894570             64.0   

    rooms_is_numeric  parking_is_numeric  bathrooms_is_numeric  
0                2.0                 1.0                   1.0  
2                3.0                 2.0               

## Evaluating Bathrooms

In [15]:
# Analyzing bathroom column
non_numeric_bathrooms = df[df['bathrooms_is_numeric'].isna()]
non_numeric_bathrooms = non_numeric_bathrooms[['bathrooms', 'bathrooms_is_numeric']]
number_non_numeric_bathrooms = len(non_numeric_bathrooms)
print('Number of non-numeric bathrooms: ', number_non_numeric_bathrooms)
print(non_numeric_bathrooms.head(10))

Number of non-numeric bathrooms:  33


AttributeError: 'int' object has no attribute 'head'

33 apartments have ambiguos infromation about number of bathrooms, we will exclude them from the analysis

In [None]:
mask = ~df['bathrooms_is_numeric'].isna()
df = df[mask]
print(df.head(10))

## Evaluating Lat and Lon

In [None]:
# Exclude apartments without coordinates
df = df[~(df['lat'].isna()) | ~(df['lon'].isna())]
number_non_numeric_coords = len(df[(df['lat'].isna()) | (df['lon'].isna())])
total_cleaned_observations = len(df)
print('Number of non-numeric coords: ', number_non_numeric_coords)

print('Complete observations : ', len(df))

## Conclusion