Authors: Luís Eduardo Anunciado Silva, Mayra Dantas de Azevedo



---



In [1]:
!pip install unidecode
!pip install geopandas



In [0]:
import requests
import json

import pandas as pd
import numpy as np

import geopandas as gpd
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
import unidecode

## Schools and hospitals

Using the [Overpass API](https://wiki.openstreetmap.org/wiki/Overpass_API), we get the schools and hospitals registered in Natal, RN.

In [0]:
# Overpass API endpoint and query: hospital and school nodes plus the
# admin_level=10 relations inside the search area 3600301091.
overpass_url = "http://overpass-api.de/api/interpreter"
overpass_query = """
[out:json][timeout:25]; area(3600301091)->.searchArea; ( node['amenity'='hospital'](area.searchArea); node['amenity'='school'](area.searchArea);relation['admin_level'='10'](area.searchArea); ); out body; >; out center;
"""
# The client-side timeout is slightly longer than the 25 s limit declared in
# the query, so a stalled connection cannot hang the notebook forever.
response = requests.get(overpass_url,
                        params={'data': overpass_query},
                        timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()
data = response.json()

In [0]:
# Tabulate the Overpass response: one row per entry of its 'elements' list.
df = pd.DataFrame(data=data['elements'])

In [5]:
df.head()

Unnamed: 0,center,id,lat,lon,members,nodes,tags,type
0,,501170977,-5.869024,-35.234268,,,{'amenity': 'school'},node
1,,501170997,-5.82361,-35.222611,,,"{'amenity': 'school', 'name': 'Centro de Atenç...",node
2,,501171016,-5.871019,-35.221614,,,"{'amenity': 'school', 'name': 'Piaget'}",node
3,,501619315,-5.816246,-35.204844,,,"{'amenity': 'hospital', 'name': 'Hospital da U...",node
4,,501784871,-5.812943,-35.210726,,,{'amenity': 'school'},node


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2770 entries, 0 to 2769
Data columns (total 8 columns):
center     407 non-null object
id         2770 non-null int64
lat        2327 non-null float64
lon        2327 non-null float64
members    36 non-null object
nodes      407 non-null object
tags       499 non-null object
type       2770 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 173.2+ KB


To process the amenities, we create one column that stores the category of each amenity (school or hospital) and another that stores its name.

In [0]:
def get_amenity(row):
  """Return the 'amenity' entry of a tags value.

  Args:
    row: a value from the ``tags`` column; a dict when the element has tags,
      otherwise a placeholder that cannot be looked up by key (e.g. NaN).

  Returns:
    The value stored under ``'amenity'``, or ``np.nan`` when the key is
    absent or ``row`` does not support lookup by key.
  """
  try:
    return row['amenity']
  except (KeyError, TypeError, IndexError):
    # Only lookup failures mean "no amenity". Anything else (including
    # KeyboardInterrupt, which a bare ``except`` would swallow) propagates.
    return np.nan
  
def get_name(row):
  """Return the 'name' entry of a tags value.

  Args:
    row: a value from the ``tags`` column; a dict when the element has tags,
      otherwise a placeholder that cannot be looked up by key (e.g. NaN).

  Returns:
    The value stored under ``'name'``, or ``np.nan`` when the key is absent
    or ``row`` does not support lookup by key.
  """
  try:
    return row['name']
  except (KeyError, TypeError, IndexError):
    # Only lookup failures mean "no name". Anything else (including
    # KeyboardInterrupt, which a bare ``except`` would swallow) propagates.
    return np.nan

In [0]:
# Extract the amenity category and the display name from each tags value.
df['amenity'] = df['tags'].map(get_amenity)
df['name'] = df['tags'].map(get_name)

Now, let's take a look at the data.

In [9]:
df['amenity'].value_counts()

school      38
hospital    21
bench        1
Name: amenity, dtype: int64

In [0]:
# Keep only the elements that carry an amenity tag. The filter has to be
# applied to the frame itself: calling `dropna(inplace=True)` on the extracted
# `df['amenity']` column would leave `df` unchanged.
# `.copy()` lets later cells add columns without chained-assignment warnings.
df = df.dropna(subset=['amenity']).copy()

In [11]:
df.head()

Unnamed: 0,center,id,lat,lon,members,nodes,tags,type,amenity,name
0,,501170977,-5.869024,-35.234268,,,{'amenity': 'school'},node,school,
1,,501170997,-5.82361,-35.222611,,,"{'amenity': 'school', 'name': 'Centro de Atenç...",node,school,Centro de Atenção Integral a Criança e ao Adol...
2,,501171016,-5.871019,-35.221614,,,"{'amenity': 'school', 'name': 'Piaget'}",node,school,Piaget
3,,501619315,-5.816246,-35.204844,,,"{'amenity': 'hospital', 'name': 'Hospital da U...",node,hospital,Hospital da UNIMED
4,,501784871,-5.812943,-35.210726,,,{'amenity': 'school'},node,school,


In [12]:
df.loc[1, 'tags']

{'amenity': 'school',
 'name': 'Centro de Atenção Integral a Criança e ao Adolescente',
 'short_name': 'CAIC'}

## Recovering data about indicators from the API

In this section, we use the [Sidra API](http://api.sidra.ibge.gov.br) to retrieve the neighborhoods, along with information about their average income and other indicators.

In [0]:
# Income indicators by neighbourhood in Natal (2010) from the Sidra API.
# The URL below queries table 3170 for period 2010 with the `v/allxp`
# variable selector, at territorial level N102 restricted to the area 2408102.
# Table description (presumably): http://api.sidra.ibge.gov.br/desctabapi.aspx?c=3170
headers = {
    'Content-Type': 'application/json;charset=UTF-8',
    'User-Agent': 'google-colab',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
    'Connection': 'keep-alive',
}


endnode = "http://api.sidra.ibge.gov.br/values/t/3170/p/2010/v/allxp/N102/in%20n6%202408102"

# A timeout keeps a stalled connection from hanging the notebook.
response = requests.get(endnode, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
response.raise_for_status()

# raw_data is a list of dicts; the first one only holds the column labels
raw_data = response.json()

In [14]:
raw_data[0]

{'D1C': 'Ano (Código)',
 'D1N': 'Ano',
 'D2C': 'Variável (Código)',
 'D2N': 'Variável',
 'D3C': 'Bairro (Código)',
 'D3N': 'Bairro',
 'D4C': 'Situação do domicílio (Código)',
 'D4N': 'Situação do domicílio',
 'D5C': 'Sexo (Código)',
 'D5N': 'Sexo',
 'D6C': 'Grupo de idade (Código)',
 'D6N': 'Grupo de idade',
 'MC': 'Unidade de Medida (Código)',
 'MN': 'Unidade de Medida',
 'V': 'Valor'}

In [0]:
# Raw view of the Sidra response; row 0 is the header record (column labels).
neighbourhood_df = pd.DataFrame(data=raw_data)

In [17]:
neighbourhood_df.head()

Unnamed: 0,D1C,D1N,D2C,D2N,D3C,D3N,D4C,D4N,D5C,D5N,D6C,D6N,MC,MN,V
0,Ano (Código),Ano,Variável (Código),Variável,Bairro (Código),Bairro,Situação do domicílio (Código),Situação do domicílio,Sexo (Código),Sexo,Grupo de idade (Código),Grupo de idade,Unidade de Medida (Código),Unidade de Medida,Valor
1,2010,2010,841,"Pessoas de 10 anos ou mais de idade, com rendi...",2408102001,Santos Reis - Natal - RN,6795,Total,6794,Total,95253,Total,45,Pessoas,2989
2,2010,2010,841,"Pessoas de 10 anos ou mais de idade, com rendi...",2408102002,Praia do Meio - Natal - RN,6795,Total,6794,Total,95253,Total,45,Pessoas,2810
3,2010,2010,841,"Pessoas de 10 anos ou mais de idade, com rendi...",2408102003,Rocas - Natal - RN,6795,Total,6794,Total,95253,Total,45,Pessoas,5806
4,2010,2010,841,"Pessoas de 10 anos ou mais de idade, com rendi...",2408102004,Ribeira - Natal - RN,6795,Total,6794,Total,95253,Total,45,Pessoas,1453


In [18]:
# Sidra variable code (field "D2C") whose value is the mean of the income
MEAN_INCOME_VARIABLE = 842

# The first record only carries the column labels, so it is skipped.
# Comprehensions are used so that no loop variable leaks into the notebook
# namespace and overwrites `data` (the Overpass response loaded earlier).
income_records = [
    record for record in raw_data[1:]
    if int(record["D2C"]) == MEAN_INCOME_VARIABLE
]

neigh_id = [int(record["D3C"]) for record in income_records]
# e.g. "Santos Reis - Natal - RN" -> "Santos Reis"
neigh_name = [record["D3N"].split(" -")[0] for record in income_records]
# despite its name, this list stores the mean income of each neighbourhood
neigh_house = [float(record["V"]) for record in income_records]

neigh_df = pd.DataFrame.from_dict({"id": neigh_id,
                                   "name": neigh_name,
                                   "income": neigh_house})
neigh_df.head()

Unnamed: 0,id,name,income
0,2408102001,Santos Reis,984.31
1,2408102002,Praia do Meio,1658.16
2,2408102003,Rocas,969.39
3,2408102004,Ribeira,2825.82
4,2408102005,Petrópolis,4736.63


In [0]:
# Strip every diacritic (not only "ú" and "ó") using the same transliteration
# that is applied to the amenities' `neighbourhood` column, so the normalised
# names in both tables are spelled consistently.
neigh_df['name_norm'] = neigh_df['name'].map(unidecode.unidecode)

In [20]:
neigh_df.query("name == 'Pitimbú' ")

Unnamed: 0,id,name,income,name_norm
22,2408102023,Pitimbú,2014.2,Pitimbu


## Placing the points at neighborhoods

In this section, we use the geojson from Natal to discover where each amenity from the first dataframe is placed.

In [0]:
# Download the GeoJSON file with Natal's neighbourhood boundaries.
# A timeout keeps a stalled connection from hanging the notebook.
geojson_natal_file = requests.get('https://github.com/nymarya/data-science-one/blob/master/Lesson%2314/natal.geojson?raw=true',
                                  timeout=30)
# Stop on an HTTP error instead of failing later with an opaque JSON parsing error.
geojson_natal_file.raise_for_status()
geo_json_natal = geojson_natal_file.json()

Now, we will separate the geometry of each neighbourhood so we can check which neighbourhood each point falls within.

The [geopandas](http://geopandas.org/) module will help us recover the polygon that describes the boundaries of each area. With it, we are able to check whether a point (longitude, latitude) is inside an area.

In [0]:
# Write the downloaded GeoJSON to a local file so it can be read back from disk.
with open('natal.geojson', 'w') as geojson_file:
    geojson_file.write(json.dumps(geo_json_natal))

In [0]:
# Load the saved GeoJSON file into a GeoDataFrame.
geojson_df = gpd.read_file(filename="natal.geojson")

In [24]:
geojson_df.head()

Unnamed: 0,id,@id,admin_level,boundary,is_in,name,place,type,alt_name,wikidata,geometry
0,relation/388146,relation/388146,10,administrative,Natal,Pitimbu,suburb,boundary,,,"POLYGON ((-35.2437294 -5.8420332, -35.2437895 ..."
1,relation/388147,relation/388147,10,administrative,Natal,Planalto,suburb,boundary,,,"POLYGON ((-35.2538208 -5.8662863, -35.2536119 ..."
2,relation/397022,relation/397022,10,administrative,Natal,Ponta Negra,suburb,boundary,,,"POLYGON ((-35.1890225 -5.8907943, -35.1889493 ..."
3,relation/1230018,relation/1230018,10,administrative,Natal,Neópolis,suburb,boundary,,,"POLYGON ((-35.2003077 -5.8734101, -35.1999644 ..."
4,relation/1230020,relation/1230020,10,administrative,Natal,Capim Macio,suburb,boundary,,,"POLYGON ((-35.1894621 -5.8724223, -35.1890659 ..."


In [0]:
# Start with an all-missing text column. The explicit object dtype avoids the
# warning raised by a dtype-less `pd.Series()` and lets the column hold names.
df['neighbourhood'] = pd.Series(index=df.index, dtype=object)

# Build every point once instead of once per neighbourhood. Rows without
# coordinates cannot be located, so they are left out of the search.
located = df.dropna(subset=['lon', 'lat'])
points = [
    (label, Point(float(lon), float(lat)))
    for label, lon, lat in zip(located.index, located['lon'], located['lat'])
]

for _, area in geojson_df.iterrows():
  # The geometry column already holds shapely geometries, so the value is
  # used as is (which also works if a boundary is not a plain Polygon).
  boundary = area['geometry']
  if boundary is None:
    continue
  inside = [label for label, point in points if boundary.contains(point)]
  if inside:
    # Save the accent-free name for every amenity inside this boundary
    df.loc[inside, 'neighbourhood'] = unidecode.unidecode(area['name'])

In [26]:
df.head()

Unnamed: 0,center,id,lat,lon,members,nodes,tags,type,amenity,name,neighbourhood
0,,501170977,-5.869024,-35.234268,,,{'amenity': 'school'},node,school,,Pitimbu
1,,501170997,-5.82361,-35.222611,,,"{'amenity': 'school', 'name': 'Centro de Atenç...",node,school,Centro de Atenção Integral a Criança e ao Adol...,Lagoa Nova
2,,501171016,-5.871019,-35.221614,,,"{'amenity': 'school', 'name': 'Piaget'}",node,school,Piaget,Pitimbu
3,,501619315,-5.816246,-35.204844,,,"{'amenity': 'hospital', 'name': 'Hospital da U...",node,hospital,Hospital da UNIMED,Lagoa Nova
4,,501784871,-5.812943,-35.210726,,,{'amenity': 'school'},node,school,,Lagoa Nova


### Saving the datasets

In [0]:
# Save the neighbourhood indicators; the file name needs a dot before the
# extension ('neighbourhoods.csv', not 'neighbourhoods,csv').
neigh_df.to_csv('neighbourhoods.csv')

In [0]:
# Export the amenities table (index included) to CSV.
df.to_csv(path_or_buf='amenities.csv')