In [1]:
import csv

# String keys for the data fields this notebook works with. In the cells
# visible here, all of them except NEIGHBORHOOD are looked up in the header
# row of the source CSV.

# Address-related fields.
CEP = "cep"
ADDRESS = "logradouro"
NEIGHBORHOOD = "bairro"
DISTRICT = "distrito"

# Geographic coordinates.
LATITUDE = "latitude"
LONGITUDE = "longitude"

# Topic and service fields of each record.
TOPIC = "assunto"
SERVICE = "servico"

def conv(val: str):
    """Normalise a raw CSV cell.

    Strips leading and trailing whitespace from ``val`` and returns the
    result, or ``None`` when nothing but whitespace (or nothing at all)
    was present.
    """
    stripped = val.strip()
    return stripped if stripped else None


In [2]:
# Scan the source CSV and, for every row whose topic is
# 'População ou pessoa em situação de rua', record the best location key
# available: the CEP if present, otherwise the street address, otherwise the
# (latitude, longitude) pair.
#
# The file is opened in a `with` block so the handle is closed as soon as the
# scan finishes, and with newline='' as the csv module requires for file
# objects passed to csv.reader.
with open('./data/final/Cidadania e assistência social.csv', 'r', encoding='utf8', newline='') as source_file:
    reader = csv.reader(source_file, delimiter=';')

    header = next(reader)

    # Resolve each column's position once; header.index raises ValueError if
    # an expected column is missing from the file.
    CEP_INDEX = header.index(CEP)
    DISTRICT_INDEX = header.index(DISTRICT)
    LATITUDE_INDEX = header.index(LATITUDE)
    ADDRESS_INDEX = header.index(ADDRESS)
    LONGITUDE_INDEX = header.index(LONGITUDE)
    TOPIC_INDEX = header.index(TOPIC)
    SERVICE_INDEX = header.index(SERVICE)

    cep_set = set()
    address_set = set()
    position_set = set()

    for row in reader:
        if row[TOPIC_INDEX] != 'População ou pessoa em situação de rua':
            continue

        cep = conv(row[CEP_INDEX])
        address = conv(row[ADDRESS_INDEX])

        latitude = conv(row[LATITUDE_INDEX])
        longitude = conv(row[LONGITUDE_INDEX])

        if cep is not None:
            cep_set.add(cep)
        elif address is not None:
            address_set.add(address)
        else:
            # NOTE: either coordinate may itself be None when the cell is
            # blank, so position_set can contain tuples with None entries.
            position_set.add((latitude, longitude))
    

In [3]:
import csv
from threading import Lock
from threading import Thread

import requests
from tqdm import tqdm

# Freeze the CEP set into a list so it can be sliced by index.
ceps = [*cep_set]
# Slice size when the list is divided eight ways (floor division, so up to
# seven CEPs are left over as a remainder).
num_ceps_per_thread = len(ceps) // 8

# Collects the worker threads as they are created.
threads = list()

def fetch_cep_data(cep, timeout=10):
    """Look up a single CEP on the OpenCEP HTTP API.

    Args:
        cep: CEP value, interpolated verbatim into the request URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A ``(cep, logradouro, bairro)`` tuple taken from the JSON response, or
        ``None`` when the request fails, the status code is not 200, the body
        is not valid JSON, or any of the three fields is missing.
    """
    try:
        response = requests.get(f"https://opencep.com/v1/{cep}.json", timeout=timeout)
    except requests.RequestException:
        # A network failure for one CEP is treated like any other failed
        # lookup, so the caller can simply skip it instead of crashing.
        return None

    if response.status_code != 200:
        return None

    try:
        data = response.json()
        return (data['cep'], data['logradouro'], data['bairro'])
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON; KeyError: a field is missing;
        # TypeError: the JSON payload is not an object.
        return None

# Keep the output file under its own name so it can be flushed and closed
# (addresses_file.close()) once every row has been written; an anonymous
# handle passed straight to csv.writer can never be closed explicitly.
# newline='' is required by the csv module and avoids blank rows on Windows.
addresses_file = open('enderecos.csv', 'w+', encoding='utf8', newline='')
addresses_csv = csv.writer(addresses_file)

# Header row of the output file.
addresses_csv.writerow(('cep', 'logradouro', 'bairro'))

# One tick per CEP processed, whether or not the lookup succeeded.
progress_bar = tqdm(total=len(ceps))

# Serialises writes to the shared CSV writer: process_ceps is used as a thread
# target, and text-mode file objects are not safe for concurrent writes.
addresses_csv_lock = Lock()


def process_ceps(start, end):
    """Fetch and store address data for the CEPs in ``ceps[start:end]``.

    Each CEP is looked up with ``fetch_cep_data``; the progress bar is
    advanced for every CEP, and successful lookups are appended as a row to
    ``addresses_csv``. Failed lookups (``None``) are skipped.

    Args:
        start: Index of the first CEP to process (inclusive).
        end: Index one past the last CEP to process (exclusive).
    """
    for cep in ceps[start:end]:
        cep_data = fetch_cep_data(cep)

        progress_bar.update()

        if cep_data is None:
            continue

        # Only one thread at a time may touch the shared writer, so rows from
        # different workers cannot interleave in the output file.
        with addresses_csv_lock:
            addresses_csv.writerow(cep_data)

  0%|          | 0/26013 [00:00<?, ?it/s]

In [4]:
# Start eight workers, each running process_ceps over one contiguous slice of
# `ceps`.
for i in range(8):
    start_index = num_ceps_per_thread * i

    # The last worker runs to the end of the list so that the remainder left
    # by the floor division is not dropped.
    if i == 7:
        end_index = len(ceps)
    else:
        end_index = start_index + num_ceps_per_thread

    thread = Thread(target=process_ceps, args=(start_index, end_index))
    threads.append(thread)
    thread.start()

# Wait for every worker to finish its slice.
for thread in threads:
    thread.join()

100%|█████████▉| 26011/26013 [05:26<00:00, 24.06it/s] 

100%|██████████| 26013/26013 [05:40<00:00, 24.06it/s]