In [1]:
import time
import numpy as np
import pandas as pd
from datetime import timedelta

from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent='luizhemelo')

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

start = time.time()

In [2]:
# city location cache
cities = {'Poxeréu':[-15.8290891, -54.3925375]}

cols = ['NO_MUNICIPIO_PROVA']

def preprocess_dataset(df: pd.DataFrame, year: int):
    ctt = df['NO_MUNICIPIO_PROVA'].value_counts().index.tolist()
    occ = df['NO_MUNICIPIO_PROVA'].value_counts().values
    df = pd.DataFrame({'Ano':year, 'Cidade':ctt, 'Num_Candidatos':occ})
    
    latitude = []
    longitude = []
    
    for _,row in df.iterrows():
        if row['Cidade'] not in list(cities.keys()):
            location = geolocator.geocode(str(row['Cidade']+', Brazil'))
            cities[row['Cidade']] = [location.raw['lat'], location.raw['lon']]
            latitude.append(location.raw['lat'])
            longitude.append(location.raw['lon'])
        else:
            latitude.append(cities[row['Cidade']][0])
            longitude.append(cities[row['Cidade']][1])
        
    df['Latitude'] = latitude
    df['Longitude'] = longitude
    
    return df

In [3]:
# 01
enem13 = pd.read_csv("../../dados_enem/microdados_enem_2013/DADOS/MICRODADOS_ENEM_2013.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem13 = preprocess_dataset(enem13, 2013)
enem13

Unnamed: 0,Ano,Cidade,Num_Candidatos,Latitude,Longitude
0,2013,SAO PAULO,271845,-23.5506507,-46.6333824
1,2013,RIO DE JANEIRO,200306,-22.9110137,-43.2093727
2,2013,FORTALEZA,167533,-3.7304512,-38.5217989
3,2013,SALVADOR,141551,-12.9822499,-38.4812772
4,2013,BELO HORIZONTE,136953,-19.9227318,-43.9450948
...,...,...,...,...,...
1608,2013,JUTAI,153,-2.7471478,-66.7721255
1609,2013,ALVARAES,148,-3.2155787,-64.8148327
1610,2013,BONFIM,146,3.3613818,-59.8427241
1611,2013,CARMESIA,121,-19.0854535,-43.1392003


In [4]:
# 02
enem14 = pd.read_csv("../../dados_enem/microdados_enem_2014/DADOS/MICRODADOS_ENEM_2014.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem14 = preprocess_dataset(enem14, 2014)
enem14

Unnamed: 0,Ano,Cidade,Num_Candidatos,Latitude,Longitude
0,2014,SAO PAULO,324056,-23.5506507,-46.6333824
1,2014,RIO DE JANEIRO,242222,-22.9110137,-43.2093727
2,2014,FORTALEZA,188730,-3.7304512,-38.5217989
3,2014,SALVADOR,180008,-12.9822499,-38.4812772
4,2014,BELO HORIZONTE,164363,-19.9227318,-43.9450948
...,...,...,...,...,...
1697,2014,GENERAL CARNEIRO,174,-26.422982,-51.3146691
1698,2014,CARMESIA,171,-19.0854535,-43.1392003
1699,2014,JAPURA,168,-23.469273,-52.55572
1700,2014,UARINI,146,-2.9827061,-65.1578615


In [5]:
# 03
enem15 = pd.read_csv("../../dados_enem/microdados_enem_2015/DADOS/MICRODADOS_ENEM_2015.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem15 = preprocess_dataset(enem15, 2015)
enem15

Unnamed: 0,Ano,Cidade,Num_Candidatos,Latitude,Longitude
0,2015,São Paulo,318507,-23.5506507,-46.6333824
1,2015,Rio de Janeiro,215303,-22.9110137,-43.2093727
2,2015,Brasília,162206,-10.3333333,-53.2
3,2015,Fortaleza,161869,-3.7304512,-38.5217989
4,2015,Salvador,157824,-12.9822499,-38.4812772
...,...,...,...,...,...
1683,2015,Japurá,117,-23.469273,-52.55572
1684,2015,Canutama,113,-6.5330019,-64.3834966
1685,2015,Itamarati,89,-6.4385189,-68.2437385
1686,2015,Santa Rosa do Purus,85,-9.4354872,-70.4926109


In [6]:
print("Elapsed time (h:mm:ss):",str(timedelta(seconds=(time.time() - start))))

Elapsed time (h:mm:ss): 0:38:29.336500


In [7]:
# 04
enem16 = pd.read_csv("../../dados_enem/microdados_enem_2016/DADOS/MICRODADOS_ENEM_2016.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem16 = preprocess_dataset(enem16, 2016)
enem16

Unnamed: 0,Ano,Cidade,Num_Candidatos,Latitude,Longitude
0,2016,São Paulo,364747,-23.5506507,-46.6333824
1,2016,Rio de Janeiro,210604,-22.9110137,-43.2093727
2,2016,Fortaleza,174685,-3.7304512,-38.5217989
3,2016,Salvador,172706,-12.9822499,-38.4812772
4,2016,Brasília,167826,-10.3333333,-53.2
...,...,...,...,...,...
1687,2016,Uarini,153,-2.9827061,-65.1578615
1688,2016,Santa Rosa do Purus,141,-9.4354872,-70.4926109
1689,2016,Bonfim,140,3.3613818,-59.8427241
1690,2016,Itamarati,140,-6.4385189,-68.2437385


In [8]:
# 05
enem17 = pd.read_csv("../../dados_enem/microdados_enem_2017/DADOS/MICRODADOS_ENEM_2017.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem17 = preprocess_dataset(enem17, 2017)
enem17

Unnamed: 0,Ano,Cidade,Num_Candidatos,Latitude,Longitude
0,2017,São Paulo,302110,-23.5506507,-46.6333824
1,2017,Rio de Janeiro,171221,-22.9110137,-43.2093727
2,2017,Fortaleza,125744,-3.7304512,-38.5217989
3,2017,Brasília,125257,-10.3333333,-53.2
4,2017,Manaus,120793,-3.1316333,-59.9825041
...,...,...,...,...,...
1685,2017,Canutama,137,-6.5330019,-64.3834966
1686,2017,Uarini,126,-2.9827061,-65.1578615
1687,2017,Itamarati,111,-6.4385189,-68.2437385
1688,2017,Santa Rosa do Purus,94,-9.4354872,-70.4926109


In [9]:
# 06
enem18 = pd.read_csv("../../dados_enem/microdados_enem_2018/DADOS/MICRODADOS_ENEM_2018.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem18 = preprocess_dataset(enem18, 2018)
enem18

Unnamed: 0,Ano,Cidade,Num_Candidatos,Latitude,Longitude
0,2018,São Paulo,247581,-23.5506507,-46.6333824
1,2018,Rio de Janeiro,150866,-22.9110137,-43.2093727
2,2018,Fortaleza,111056,-3.7304512,-38.5217989
3,2018,Brasília,106304,-10.3333333,-53.2
4,2018,Belo Horizonte,98780,-19.9227318,-43.9450948
...,...,...,...,...,...
1685,2018,Japurá,84,-23.469273,-52.55572
1686,2018,Bonfim,77,3.3613818,-59.8427241
1687,2018,Jordão,65,-9.1905396,-71.9484803
1688,2018,Santa Rosa do Purus,54,-9.4354872,-70.4926109


In [10]:
# 07
enem19 = pd.read_csv("../../dados_enem/microdados_enem_2019/DADOS/MICRODADOS_ENEM_2019.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem19 = preprocess_dataset(enem19, 2019)
enem19

Unnamed: 0,Ano,Cidade,Num_Candidatos,Latitude,Longitude
0,2019,São Paulo,216016,-23.5506507,-46.6333824
1,2019,Rio de Janeiro,134778,-22.9110137,-43.2093727
2,2019,Brasília,95849,-10.3333333,-53.2
3,2019,Fortaleza,95089,-3.7304512,-38.5217989
4,2019,Salvador,87894,-12.9822499,-38.4812772
...,...,...,...,...,...
1687,2019,Maraã,90,-1.8535579,-65.5897749
1688,2019,Japurá,57,-23.469273,-52.55572
1689,2019,Bonfim,52,3.3613818,-59.8427241
1690,2019,Fernando de Noronha,49,-3.8545643,-32.37861623300567


In [11]:
# 08
enem20 = pd.read_csv("../../dados_enem/microdados_enem_2020/DADOS/MICRODADOS_ENEM_2020.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem20 = preprocess_dataset(enem20, 2020)
enem20

Unnamed: 0,Ano,Cidade,Num_Candidatos,Latitude,Longitude
0,2020,São Paulo,250475,-23.5506507,-46.6333824
1,2020,Rio de Janeiro,156727,-22.9110137,-43.2093727
2,2020,Brasília,116932,-10.3333333,-53.2
3,2020,Manaus,114037,-3.1316333,-59.9825041
4,2020,Fortaleza,108508,-3.7304512,-38.5217989
...,...,...,...,...,...
1707,2020,Japurá,129,-23.469273,-52.55572
1708,2020,Bonfim,122,3.3613818,-59.8427241
1709,2020,Jordão,115,-9.1905396,-71.9484803
1710,2020,Santa Rosa do Purus,77,-9.4354872,-70.4926109


In [12]:
# 09
enem21 = pd.read_csv("../../dados_enem/microdados_enem_2021/DADOS/MICRODADOS_ENEM_2021.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem21 = preprocess_dataset(enem21, 2021)
enem21

Unnamed: 0,Ano,Cidade,Num_Candidatos,Latitude,Longitude
0,2021,São Paulo,141840,-23.5506507,-46.6333824
1,2021,Rio de Janeiro,102152,-22.9110137,-43.2093727
2,2021,Fortaleza,72457,-3.7304512,-38.5217989
3,2021,Brasília,67501,-10.3333333,-53.2
4,2021,Salvador,58502,-12.9822499,-38.4812772
...,...,...,...,...,...
1707,2021,Barra do Turvo,53,-24.756421,-48.5056942
1708,2021,Fernando de Noronha,47,-3.8545643,-32.37861623300567
1709,2021,Jordão,44,-9.1905396,-71.9484803
1710,2021,Bonfim,43,3.3613818,-59.8427241


In [13]:
# 10
enem22 = pd.read_csv("../../dados_enem/microdados_enem_2022/DADOS/MICRODADOS_ENEM_2022.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem22 = preprocess_dataset(enem22, 2022)
enem22

Unnamed: 0,Ano,Cidade,Num_Candidatos,Latitude,Longitude
0,2022,São Paulo,147046,-23.5506507,-46.6333824
1,2022,Rio de Janeiro,105397,-22.9110137,-43.2093727
2,2022,Fortaleza,71714,-3.7304512,-38.5217989
3,2022,Brasília,65401,-10.3333333,-53.2
4,2022,Belém,59976,-1.45056,-48.4682453
...,...,...,...,...,...
1707,2022,Jordão,57,-9.1905396,-71.9484803
1708,2022,Brejinho de Nazaré,53,-11.0138265,-48.564626
1709,2022,Barra do Turvo,51,-24.756421,-48.5056942
1710,2022,Japurá,48,-23.469273,-52.55572


In [14]:
final = pd.concat([enem13,enem14,enem15,enem16,enem17,enem18,enem19,enem20,enem21,enem22]).reset_index(drop=True)
final[["Latitude", "Longitude"]] = final[["Latitude", "Longitude"]].apply(pd.to_numeric)
final.to_pickle("../data/vis01_enem_data.pickle")
final

Unnamed: 0,Ano,Cidade,Num_Candidatos,Latitude,Longitude
0,2013,SAO PAULO,271845,-23.550651,-46.633382
1,2013,RIO DE JANEIRO,200306,-22.911014,-43.209373
2,2013,FORTALEZA,167533,-3.730451,-38.521799
3,2013,SALVADOR,141551,-12.982250,-38.481277
4,2013,BELO HORIZONTE,136953,-19.922732,-43.945095
...,...,...,...,...,...
16898,2022,Jordão,57,-9.190540,-71.948480
16899,2022,Brejinho de Nazaré,53,-11.013827,-48.564626
16900,2022,Barra do Turvo,51,-24.756421,-48.505694
16901,2022,Japurá,48,-23.469273,-52.555720


In [15]:
print("Elapsed time (h:mm:ss):",str(timedelta(seconds=(time.time() - start))))

Elapsed time (h:mm:ss): 0:46:22.386909
