In [1]:
import time
start = time.time()

import re
import numpy as np
import pandas as pd
from datetime import timedelta
from unicodedata import normalize, combining

from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent='luizhemelo')

from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

PATH = "../../dados_enem"

## Functions

In [2]:
# city location cache
cities = {
    'poxereu':[-15.8290891, -54.3925375, "mato grosso"],
    'brasilia':[-10.3333333, -53.2, "distrito federal"]
}
cols = ['NO_MUNICIPIO_PROVA']

def format_name(name: str):
    """
    Format name to remove punctuation and spaces
    """
    name = re.sub("([^a-zà-üA-ZÀ-Ü0-9])", " ", name)
    nfkd_str = normalize("NFKD", name)
    name = "".join(
        [c for _, c in enumerate(nfkd_str) if not combining(c)]
    ).lower()
    return name

def preprocess_dataset(df: pd.DataFrame, year: int):
    ctt = df['NO_MUNICIPIO_PROVA'].value_counts().index.tolist()
    occ = df['NO_MUNICIPIO_PROVA'].value_counts().values
    df = pd.DataFrame({'Ano':year, 'Num_Candidatos':occ, 'Cidade':ctt})
    
    latitude = []
    longitude = []
    states = []
    
    for i,row in df.iterrows():
        city = format_name(row['Cidade'])
        df.loc[i,'Cidade'] = city
        
        if city not in list(cities.keys()):
            location = geolocator.geocode(str(city+',Brazil'))
            match = geolocator.reverse(location.raw['lat']+","+location.raw['lon'])
            address = match.raw['address']
            state = format_name(address.get('state',''))
            
            cities[city] = [location.raw['lat'], location.raw['lon'], state]
            
            latitude.append(location.raw['lat'])
            longitude.append(location.raw['lon'])
            states.append(state)
        else:
            latitude.append(cities[city][0])
            longitude.append(cities[city][1])
            states.append(cities[city][2])
    
    df['Estado'] = states
    df['Latitude'] = latitude
    df['Longitude'] = longitude
    
    return df

## Data loading

In [3]:
# 01
enem13 = pd.read_csv(f"{PATH}/microdados_enem_2013/DADOS/MICRODADOS_ENEM_2013.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem13 = preprocess_dataset(enem13, 2013)
enem13

Unnamed: 0,Ano,Num_Candidatos,Cidade,Estado,Latitude,Longitude
0,2013,271845,sao paulo,sao paulo,-23.5506507,-46.6333824
1,2013,200306,rio de janeiro,rio de janeiro,-22.9110137,-43.2093727
2,2013,167533,fortaleza,ceara,-3.7304512,-38.5217989
3,2013,141551,salvador,bahia,-12.9822499,-38.4812772
4,2013,136953,belo horizonte,minas gerais,-19.9227318,-43.9450948
...,...,...,...,...,...,...
1608,2013,153,jutai,amazonas,-1.72014,-66.8877467
1609,2013,148,alvaraes,amazonas,-3.2155787,-64.8148327
1610,2013,146,bonfim,roraima,3.3613818,-59.8427241
1611,2013,121,carmesia,minas gerais,-19.0854535,-43.1392003


In [4]:
# 02
enem14 = pd.read_csv(f"{PATH}/microdados_enem_2014/DADOS/MICRODADOS_ENEM_2014.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem14 = preprocess_dataset(enem14, 2014)
enem14

Unnamed: 0,Ano,Num_Candidatos,Cidade,Estado,Latitude,Longitude
0,2014,324056,sao paulo,sao paulo,-23.5506507,-46.6333824
1,2014,242222,rio de janeiro,rio de janeiro,-22.9110137,-43.2093727
2,2014,188730,fortaleza,ceara,-3.7304512,-38.5217989
3,2014,180008,salvador,bahia,-12.9822499,-38.4812772
4,2014,164363,belo horizonte,minas gerais,-19.9227318,-43.9450948
...,...,...,...,...,...,...
1697,2014,174,general carneiro,parana,-26.422982,-51.3146691
1698,2014,171,carmesia,minas gerais,-19.0854535,-43.1392003
1699,2014,168,japura,amazonas,-7.1781505,-71.8538963
1700,2014,146,uarini,amazonas,-2.9827061,-65.1578615


In [5]:
# 03
enem15 = pd.read_csv(f"{PATH}/microdados_enem_2015/DADOS/MICRODADOS_ENEM_2015.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem15 = preprocess_dataset(enem15, 2015)
enem15

Unnamed: 0,Ano,Num_Candidatos,Cidade,Estado,Latitude,Longitude
0,2015,318507,sao paulo,sao paulo,-23.5506507,-46.6333824
1,2015,215303,rio de janeiro,rio de janeiro,-22.9110137,-43.2093727
2,2015,162206,brasilia,distrito federal,-10.333333,-53.2
3,2015,161869,fortaleza,ceara,-3.7304512,-38.5217989
4,2015,157824,salvador,bahia,-12.9822499,-38.4812772
...,...,...,...,...,...,...
1683,2015,117,japura,amazonas,-7.1781505,-71.8538963
1684,2015,113,canutama,amazonas,-6.5330019,-64.3834966
1685,2015,89,itamarati,amazonas,-6.4385189,-68.2437385
1686,2015,85,santa rosa do purus,acre,-9.4354872,-70.4926109


In [6]:
# 04
enem16 = pd.read_csv(f"{PATH}/microdados_enem_2016/DADOS/MICRODADOS_ENEM_2016.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem16 = preprocess_dataset(enem16, 2016)
enem16

Unnamed: 0,Ano,Num_Candidatos,Cidade,Estado,Latitude,Longitude
0,2016,364747,sao paulo,sao paulo,-23.5506507,-46.6333824
1,2016,210604,rio de janeiro,rio de janeiro,-22.9110137,-43.2093727
2,2016,174685,fortaleza,ceara,-3.7304512,-38.5217989
3,2016,172706,salvador,bahia,-12.9822499,-38.4812772
4,2016,167826,brasilia,distrito federal,-10.333333,-53.2
...,...,...,...,...,...,...
1687,2016,153,uarini,amazonas,-2.9827061,-65.1578615
1688,2016,141,santa rosa do purus,acre,-9.4354872,-70.4926109
1689,2016,140,bonfim,roraima,3.3613818,-59.8427241
1690,2016,140,itamarati,amazonas,-6.4385189,-68.2437385


In [7]:
# 05
enem17 = pd.read_csv(f"{PATH}/microdados_enem_2017/DADOS/MICRODADOS_ENEM_2017.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem17 = preprocess_dataset(enem17, 2017)
enem17

Unnamed: 0,Ano,Num_Candidatos,Cidade,Estado,Latitude,Longitude
0,2017,302110,sao paulo,sao paulo,-23.5506507,-46.6333824
1,2017,171221,rio de janeiro,rio de janeiro,-22.9110137,-43.2093727
2,2017,125744,fortaleza,ceara,-3.7304512,-38.5217989
3,2017,125257,brasilia,distrito federal,-10.333333,-53.2
4,2017,120793,manaus,amazonas,-3.1316333,-59.9825041
...,...,...,...,...,...,...
1685,2017,137,canutama,amazonas,-6.5330019,-64.3834966
1686,2017,126,uarini,amazonas,-2.9827061,-65.1578615
1687,2017,111,itamarati,amazonas,-6.4385189,-68.2437385
1688,2017,94,santa rosa do purus,acre,-9.4354872,-70.4926109


In [8]:
# 06
enem18 = pd.read_csv(f"{PATH}/microdados_enem_2018/DADOS/MICRODADOS_ENEM_2018.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem18 = preprocess_dataset(enem18, 2018)
enem18

Unnamed: 0,Ano,Num_Candidatos,Cidade,Estado,Latitude,Longitude
0,2018,247581,sao paulo,sao paulo,-23.5506507,-46.6333824
1,2018,150866,rio de janeiro,rio de janeiro,-22.9110137,-43.2093727
2,2018,111056,fortaleza,ceara,-3.7304512,-38.5217989
3,2018,106304,brasilia,distrito federal,-10.333333,-53.2
4,2018,98780,belo horizonte,minas gerais,-19.9227318,-43.9450948
...,...,...,...,...,...,...
1685,2018,84,japura,amazonas,-7.1781505,-71.8538963
1686,2018,77,bonfim,roraima,3.3613818,-59.8427241
1687,2018,65,jordao,acre,-9.1905396,-71.9484803
1688,2018,54,santa rosa do purus,acre,-9.4354872,-70.4926109


In [9]:
# 07
enem19 = pd.read_csv(f"{PATH}/microdados_enem_2019/DADOS/MICRODADOS_ENEM_2019.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem19 = preprocess_dataset(enem19, 2019)
enem19

Unnamed: 0,Ano,Num_Candidatos,Cidade,Estado,Latitude,Longitude
0,2019,216016,sao paulo,sao paulo,-23.5506507,-46.6333824
1,2019,134778,rio de janeiro,rio de janeiro,-22.9110137,-43.2093727
2,2019,95849,brasilia,distrito federal,-10.333333,-53.2
3,2019,95089,fortaleza,ceara,-3.7304512,-38.5217989
4,2019,87894,salvador,bahia,-12.9822499,-38.4812772
...,...,...,...,...,...,...
1687,2019,90,maraa,amazonas,-1.8535579,-65.5897749
1688,2019,57,japura,amazonas,-7.1781505,-71.8538963
1689,2019,52,bonfim,roraima,3.3613818,-59.8427241
1690,2019,49,fernando de noronha,pernambuco,-3.8537498,-32.41980180663255


In [10]:
# 08
enem20 = pd.read_csv(f"{PATH}/microdados_enem_2020/DADOS/MICRODADOS_ENEM_2020.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem20 = preprocess_dataset(enem20, 2020)
enem20

Unnamed: 0,Ano,Num_Candidatos,Cidade,Estado,Latitude,Longitude
0,2020,250475,sao paulo,sao paulo,-23.5506507,-46.6333824
1,2020,156727,rio de janeiro,rio de janeiro,-22.9110137,-43.2093727
2,2020,116932,brasilia,distrito federal,-10.333333,-53.2
3,2020,114037,manaus,amazonas,-3.1316333,-59.9825041
4,2020,108508,fortaleza,ceara,-3.7304512,-38.5217989
...,...,...,...,...,...,...
1707,2020,129,japura,amazonas,-7.1781505,-71.8538963
1708,2020,122,bonfim,roraima,3.3613818,-59.8427241
1709,2020,115,jordao,acre,-9.1905396,-71.9484803
1710,2020,77,santa rosa do purus,acre,-9.4354872,-70.4926109


In [11]:
# 09
enem21 = pd.read_csv(f"{PATH}/microdados_enem_2021/DADOS/MICRODADOS_ENEM_2021.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem21 = preprocess_dataset(enem21, 2021)
enem21

Unnamed: 0,Ano,Num_Candidatos,Cidade,Estado,Latitude,Longitude
0,2021,141840,sao paulo,sao paulo,-23.5506507,-46.6333824
1,2021,102152,rio de janeiro,rio de janeiro,-22.9110137,-43.2093727
2,2021,72457,fortaleza,ceara,-3.7304512,-38.5217989
3,2021,67501,brasilia,distrito federal,-10.333333,-53.2
4,2021,58502,salvador,bahia,-12.9822499,-38.4812772
...,...,...,...,...,...,...
1707,2021,53,barra do turvo,sao paulo,-24.756421,-48.5056942
1708,2021,47,fernando de noronha,pernambuco,-3.8537498,-32.41980180663255
1709,2021,44,jordao,acre,-9.1905396,-71.9484803
1710,2021,43,bonfim,roraima,3.3613818,-59.8427241


In [12]:
# 10
enem22 = pd.read_csv(f"{PATH}/microdados_enem_2022/DADOS/MICRODADOS_ENEM_2022.csv", sep=';', encoding = "ISO-8859-1")[cols]
enem22 = preprocess_dataset(enem22, 2022)
enem22

Unnamed: 0,Ano,Num_Candidatos,Cidade,Estado,Latitude,Longitude
0,2022,147046,sao paulo,sao paulo,-23.5506507,-46.6333824
1,2022,105397,rio de janeiro,rio de janeiro,-22.9110137,-43.2093727
2,2022,71714,fortaleza,ceara,-3.7304512,-38.5217989
3,2022,65401,brasilia,distrito federal,-10.333333,-53.2
4,2022,59976,belem,para,-1.45056,-48.4682453
...,...,...,...,...,...,...
1707,2022,57,jordao,acre,-9.1905396,-71.9484803
1708,2022,53,brejinho de nazare,tocantins,-11.0138265,-48.564626
1709,2022,51,barra do turvo,sao paulo,-24.756421,-48.5056942
1710,2022,48,japura,amazonas,-7.1781505,-71.8538963


In [13]:
final = pd.concat([enem13,enem14,enem15,enem16,enem17,enem18,enem19,enem20,enem21,enem22]).reset_index(drop=True)
final[["Latitude", "Longitude"]] = final[["Latitude", "Longitude"]].apply(pd.to_numeric)
final.to_pickle(f"{PATH}/enem_data.pickle")
final.to_csv(f"{PATH}/enem_data.csv")
final

Unnamed: 0,Ano,Num_Candidatos,Cidade,Estado,Latitude,Longitude
0,2013,271845,sao paulo,sao paulo,-23.550651,-46.633382
1,2013,200306,rio de janeiro,rio de janeiro,-22.911014,-43.209373
2,2013,167533,fortaleza,ceara,-3.730451,-38.521799
3,2013,141551,salvador,bahia,-12.982250,-38.481277
4,2013,136953,belo horizonte,minas gerais,-19.922732,-43.945095
...,...,...,...,...,...,...
16898,2022,57,jordao,acre,-9.190540,-71.948480
16899,2022,53,brejinho de nazare,tocantins,-11.013827,-48.564626
16900,2022,51,barra do turvo,sao paulo,-24.756421,-48.505694
16901,2022,48,japura,amazonas,-7.178151,-71.853896


In [14]:
print("Elapsed time (h:mm:ss):",str(timedelta(seconds=(time.time() - start))))

Elapsed time (h:mm:ss): 0:51:12.714194
