# coding: utf-8
# # First Study on Brazilian Cities Transparency Portal
# In this dataset we have a population projection for each Brazilian city for the year 2013.
#
#
# In[1]:
import pandas as pd
import numpy as np
# First we load the data with the population estimates;
# we can use it later for comparisons
cities = pd.read_excel('../data/Cidades - estimativa 2013.xlsx',
                       converters={'COD. UF': str, 'COD. MUNIC': str},
                       sheet_name=None, header=0)
# In[2]:
data = pd.DataFrame()
for key in cities.keys():
    data = pd.concat([data, cities[key]])
data = data.reset_index(drop=True)
data.shape
# We should see 5570 rows, because that's the number of cities IBGE says Brazil has. The different number of rows leads me to believe there is metadata from the `.xlsx` mixed into our data.
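# A minimal sketch of how those stray rows could be dropped, assuming the
# metadata rows lack a value in the 'COD. MUNIC' column (an assumption;
# inspect the spreadsheet to confirm which column is reliable):
# data = data.dropna(subset=['COD. MUNIC']).reset_index(drop=True)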
# ## Translating column names
# In[3]:
data.rename(columns={
    'UF': 'state',
    'COD. UF': 'state_id',
    'COD. MUNIC': 'city_id',
    'NOME DO MUNICÍPIO': 'city_name',
    'POPULAÇÃO ESTIMADA': 'population_projection'
}, inplace=True)
data.head()
# ## Formatting `city_id`
#
# Formatting `city_id` to conform with the ids displayed in the Brazilian census files
# In[4]:
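# zfill pads with leading zeros, e.g. '108' becomes '00108'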
data['city_id'] = data['city_id'].apply(lambda x: x.zfill(5))
# ## Checking out a `unique_id` for each city
# In[5]:
data[data['city_id'] == '00108']
# In[6]:
UNIQUE_IDS = data.loc[:, ['state_id', 'city_id']].copy()
# Concatenate state and city codes to build a candidate unique id per city
UNIQUE_IDS['ids'] = UNIQUE_IDS['state_id'] + UNIQUE_IDS['city_id']
UNIQUE_IDS.head()
# In[7]:
len(set(UNIQUE_IDS['ids']))
# In[8]:
UNIQUE_IDS.shape
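# If the count above equals the number of rows, every id is unique
# (equivalently: UNIQUE_IDS['ids'].is_unique)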
# In[9]:
brazilian_states = {'RO': 'rondonia',
                    'AC': 'acre',
                    'AM': 'amazonas',
                    'RR': 'roraima',
                    'PA': 'para',
                    'AP': 'amapa',
                    'TO': 'tocantins',
                    'MA': 'maranhao',
                    'PI': 'piaui',
                    'CE': 'ceara',
                    'RN': 'rio_grande_do_norte',
                    'PB': 'paraiba',
                    'PE': 'pernambuco',
                    'AL': 'alagoas',
                    'SE': 'sergipe',
                    'BA': 'bahia',
                    'MG': 'minas_gerais',
                    'ES': 'espirito_santo',
                    'RJ': 'rio_de_janeiro',
                    'SP': 'sao_paulo',
                    'PR': 'parana',
                    'SC': 'santa_catarina',
                    'RS': 'rio_grande_do_sul',
                    'MS': 'mato_grosso_do_sul',
                    'MT': 'mato_grosso',
                    'GO': 'goias',
                    'DF': 'distrito_federal'}
census_link = "ftp.ibge.gov.br/Censos/Censo_Demografico_2010/resultados/total_populacao_{}.zip"
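# For example (illustrative only; the IBGE FTP layout may have changed since 2017):
# census_link.format(brazilian_states['SP']) returns
# 'ftp.ibge.gov.br/Censos/Censo_Demografico_2010/resultados/total_populacao_sao_paulo.zip'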
# ## Gathering cities with @cuducos' Brazilian cities script
#
# @cuducos had already written a script that lists every Brazilian city with its associated code and state, available in [this repository](https://github.com/cuducos/brazilian-cities).
#
# We checked it, and it is the most reliable way to get the list of cities.
# In[10]:
from serenata_toolbox.datasets import fetch
fetch('2017-05-22-brazilian-cities.csv', '../data')
# In[11]:
brazilian_cities = pd.read_csv('../data/2017-05-22-brazilian-cities.csv')
brazilian_cities.head()
# In[12]:
brazilian_cities.shape
# ## Normalizing the names
#
# We need to normalize the information before we can use it, so we:
# - Lowercase all state abbreviations
# - Remove all accents and normalize city names
# - Remove spaces to generate the URL pattern we want
# In[13]:
brazilian_cities['state'] = brazilian_cities['state'].apply(str.lower)
# In[14]:
import unicodedata
def normalize_string(string):
    if isinstance(string, str):
        nfkd_form = unicodedata.normalize('NFKD', string.lower())
        return nfkd_form.encode('ASCII', 'ignore').decode('utf-8')
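# e.g. normalize_string('São Paulo') returns 'sao paulo'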
# In[15]:
brazilian_cities['normalized_name'] = brazilian_cities['name'].apply(normalize_string)
brazilian_cities['normalized_name'] = brazilian_cities['normalized_name'].apply(lambda x: x.replace(' ', ''))
# In[16]:
brazilian_cities.head()
# ## Getting all cities that are part of Transparency Portal
#
# We already know that some cities have a page with transparency and open data. The main objective here is to find out how many cities have one.
#
# Pattern: `{city}-{state}.portaltp.com.br`
# In[17]:
portal_url = 'https://{}-{}.portaltp.com.br/'
brazilian_cities['transparency_portal_url'] = brazilian_cities.apply(
    lambda row: portal_url.format(row['normalized_name'], row['state']), axis=1)
brazilian_cities.head(20)
# (Getting the status code for each city might take a while, so we added the prints just for feedback.)
# In[18]:
import requests
def get_status(url):
    # Issue a single HEAD request per URL; the original double call made two
    try:
        status = requests.head(url).status_code
    except requests.ConnectionError:
        status = 404
    print(status)
    return status
# In[19]:
get_ipython().run_cell_magic('time', '', "colatina = brazilian_cities[brazilian_cities['code'] == 320150]['transparency_portal_url'].values[0]\nstatusOK = get_status(colatina)\n\nabaete = brazilian_cities[brazilian_cities['code'] == 310020]['transparency_portal_url'].values[0]\nstatusNOK = get_status(abaete)")
# In[20]:
br_cities = brazilian_cities.loc[:10,:].copy()
# In[21]:
get_ipython().run_cell_magic('time', '', "br_cities.loc[:,'status_code'] = br_cities.apply(lambda x: get_status(x['transparency_portal_url']), axis=1)")
# In[22]:
br_cities
# This will take too long considering we have 5570 cities to address.
#
# Let's try using [grequests](https://pypi.python.org/pypi/grequests).
#
# I know that we can find two different status codes among the first 10 city URLs. So let's use those 10 to test grequests ;)
# In[23]:
import grequests
rs = (grequests.get(u) for u in list(br_cities['transparency_portal_url']))
# In[24]:
def exception_handler(request, exception):
    return 404
responses = grequests.map(rs, exception_handler=exception_handler)
# In[25]:
codes = [int(x) for x in br_cities['status_code'].values]
print(pd.unique(codes), pd.unique(responses))
# In[26]:
responses
# The result above got me wondering where the 200 status codes we had seen before went. I tested the code on the command line and they are there. After a little research I found that apparently it is not easy to run async tasks on a Jupyter notebook [ref](http://ipywidgets.readthedocs.io/en/latest/examples/Widget%20Asynchronous.html).
#
# With that in mind, we decided to write a script that generates the information we want: an open data URL for each Brazilian city. A sketch of one way to do that follows.
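# A minimal sketch of such a script (not part of the original study): plain
# threads via `concurrent.futures`, which sidestep the notebook's async
# event-loop problem entirely:
from concurrent.futures import ThreadPoolExecutor

def fetch_all_statuses(urls, workers=32):
    # Run get_status over many URLs in parallel threads and collect the codes
    with ThreadPoolExecutor(max_workers=workers) as executor:
        return list(executor.map(get_status, urls))

# e.g. fetch_all_statuses(br_cities['transparency_portal_url'])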
# In[27]:
data = br_cities[br_cities['status_code'] == 404].copy().reset_index(drop=True)
data
# Some cities that we already know have a page with transparency and open data follow a different pattern from the one above.
#
# Second Pattern: `cm{city}-{state}.portaltp.com.br`
# In[28]:
portal_url = 'https://cm{}-{}.portaltp.com.br/'
data['transparency_portal_url'] = data.apply(
    lambda row: portal_url.format(row['normalized_name'], row['state']), axis=1)
data
# We still need to update the status code column.
# In[29]:
get_ipython().run_cell_magic('time', '', "data.loc[:,'status_code'] = data.apply(lambda x: get_status(x['transparency_portal_url']), axis=1)")
# In[30]:
data
# In[31]:
# for study purposes, force one row to look like a working portal
data.loc[8, 'status_code'] = 200
data
# In[32]:
data.loc[data['status_code'] == 404, 'transparency_portal_url'] = None
data
# In[33]:
br_cities.loc[br_cities['status_code'] == 404, 'transparency_portal_url'] = None
br_cities
# In[34]:
unnecessary_columns = ['normalized_name', 'status_code']
br_cities = pd.merge(br_cities.drop(unnecessary_columns, axis=1),
                     data.drop(unnecessary_columns, axis=1),
                     on=['code', 'name', 'state'], how='left')
br_cities['transparency_portal_url'] = br_cities.apply(
    lambda row: row['transparency_portal_url_x'] or row['transparency_portal_url_y'], axis=1)
unnecessary_columns = ['transparency_portal_url_x', 'transparency_portal_url_y']
br_cities = br_cities.drop(unnecessary_columns, axis=1)
br_cities
# # Conclusions
#
# After all this study, we found that this transparency portal URL pattern already covers 279 cities, 19 of which return an Internal Server Error (status code 5XX).
#
# That is roughly 5% of all existing Brazilian cities!
#
# Below is a table with all the cities that have portals ;)
# In[35]:
with_tp_portal = pd.read_csv('../data/2017-05-30-cities_with_tp_portal.csv')
with_tp_portal.shape
# In[36]:
with_tp_portal