# Getting the list of federal civil servants

In [1]:
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from threading import Thread
# Relative directory prefix that saveCSV prepends to the output file name.
base = 'csv/'

In [2]:
def requestHTMLPage(url, timeout=30, retryDelay=10):
    """Download ``url`` and return the raw response body once the server answers 200.

    The request is repeated indefinitely (this is the original best-effort
    behaviour) until a response with status code 200 arrives.

    Parameters
    ----------
    url : str
        Absolute URL to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up on one attempt.
        Without a timeout a stalled connection would block the calling
        thread forever.
    retryDelay : float, optional
        Seconds to sleep between two attempts.

    Returns
    -------
    bytes
        ``Response.content`` of the first successful (HTTP 200) response.
    """
    while True:
        try:
            r = requests.get(url, timeout=timeout)
        except (requests.ConnectionError, requests.Timeout):
            # Transient network failure: handle it like a non-200 answer so
            # that one dropped connection does not kill the worker thread.
            r = None
        if r is not None and r.status_code == 200:
            return r.content
        print("Loop em potencial")
        time.sleep(retryDelay)

In [3]:
# Base URL of the Portal da Transparência site; every request path used in
# this notebook is appended to it.
urlBaseSite = "http://www.portaldatransparencia.gov.br/"
# Relative path of the paginated employee listing (CodOrg=26243 is presumably
# the agency code -- confirm). getFunctionaryUrls appends the page number
# right after "Pagina=".
urlBaseOrgaoExercicio = "servidores/OrgaoExercicio-ListaServidores.asp?CodOrg=26243&Pagina="
# Last month number (inclusive, starting at 1) that getMoneyInfo queries when
# looking for a salary record.
currentMonth = 4

In [4]:
# Fetch the employee listing with no page number appended and parse it; only
# the pagination block(s) (div elements with id "paginacao") are kept.
content = requestHTMLPage(urlBaseSite + urlBaseOrgaoExercicio)
soup = BeautifulSoup(content, "html.parser")
pages = soup.find_all("div", id = "paginacao")

In [5]:
# Work out the highest page number of the employee listing.
# The second <a> inside the first pagination block carries an onclick
# attribute; the integer between its first "(" and first ")" is used as the
# last page (presumably a "go to last page" link -- confirm against the
# live markup).
maxPage = pages[0].select('a')[1].get('onclick')
maxPage = int(maxPage[maxPage.find("(")+1:maxPage.find(")")])
# Bare expression: shows the computed value as the notebook cell output.
maxPage

414

In [6]:
def getName(sumarry):
    """Return the employee name found in a summary element.

    The ``<td>`` cells of ``sumarry`` are scanned in order; the cell that
    follows a cell whose text contains ``'Nome:'`` is taken as the name.
    If several label cells exist, the last one wins.

    Parameters
    ----------
    sumarry : object exposing ``find_all('td')``
        Parsed element (e.g. a BeautifulSoup tag) holding the summary table.

    Returns
    -------
    str
        The stripped name, or ``""`` when no ``'Nome:'`` label (or no cell
        after it) is present. Previously this case raised UnboundLocalError;
        it now matches the empty-string convention of the other extractors.
    """
    cells = iter(sumarry.find_all('td'))
    name = ""
    for cell in cells:
        if 'Nome:' not in cell.getText():
            continue
        # The value lives in the cell right after the label; consume it so it
        # is not inspected as a label itself.
        valueCell = next(cells, None)
        if valueCell is None:
            break
        name = valueCell.getText().strip()
    return name

def getMoneyValue(soup):
    """Return the text of the cell that follows the gross basic salary label.

    All ``<td>`` cells of ``soup`` are walked in document order. Whenever a
    cell's text contains the label, the next cell is consumed and its
    stripped text remembered; the last such value is returned.

    Returns ``""`` when the label never appears or is the final cell.
    """
    cells = iter(soup.find_all('td'))
    amount = ""
    for cell in cells:
        if 'Remuneração básica bruta' not in cell.getText():
            continue
        valueCell = next(cells, None)
        if valueCell is None:
            # Label was the very last cell: nothing left to read.
            break
        amount = valueCell.getText().strip()
    return amount

def getMoneyInfo(moneyUrl):
    """Return the salary text from the first 2017 month that has a record.

    Months 1..currentMonth are requested in ascending order. A page whose
    stringified content contains 'Servidor sem ficha' is skipped; the first
    other page is parsed and handed to getMoneyValue.

    Returns the sentinel string "-1000" when every month was skipped.
    """
    for month in range(1, currentMonth + 1):
        moneyPage = requestHTMLPage(urlBaseSite + moneyUrl + "&Ano=2017&Mes=" + str(month))
        if 'Servidor sem ficha' not in str(moneyPage):
            return getMoneyValue(BeautifulSoup(moneyPage, "html.parser"))
    return "-1000"

def getSumarryInfo(soup):
    """Return ``(name, money)`` read from the first div with id 'resumo'.

    The name comes from getName applied to that div; the salary comes from
    getMoneyInfo applied to the href of the first link inside the div.
    """
    summaryDiv = soup.find_all('div', id='resumo')[0]
    salaryHref = summaryDiv.find_all('a')[0].get('href')
    return getName(summaryDiv), getMoneyInfo(salaryHref)

def getGeneralInfo(soup):
    """Scrape post, class, level and org unit from the ``<td>`` cells of a page.

    soup    : parsed page exposing ``find_all('td')``
    returns : tuple ``(post, class_, level, org)``; a field whose label is
              missing (or has no following cell) is the empty string
    """
    # Labels in the order they are tested; the index of a label is also the
    # position of its value in the returned tuple.
    labels = ('Cargo Emprego:', 'Classe:', 'Nível:', 'UORG:')
    values = [""] * len(labels)
    cells = iter(soup.find_all('td'))
    for cell in cells:
        text = cell.getText()
        slot = next((i for i, label in enumerate(labels) if label in text), None)
        if slot is None:
            continue
        # The value sits in the cell after the label and is consumed here.
        valueCell = next(cells, None)
        if valueCell is None:
            break
        values[slot] = valueCell.getText().strip()
    return tuple(values)

def getFunctionaryInfo(functionaryUrl, infos):
    """Scrape one employee page and append the resulting record to ``infos``.

    functionaryUrl : page path of the employee, relative to 'servidores/'
    infos          : list that receives the tuple
                     (name, money, post, class_, level, org)
    returns        : None -- the result is delivered through ``infos`` so the
                     function can be used as a thread target
    """
    page = requestHTMLPage(urlBaseSite + 'servidores/' + functionaryUrl)
    parsed = BeautifulSoup(page, "html.parser")
    # (name, money) followed by (post, class_, level, org)
    record = getSumarryInfo(parsed) + getGeneralInfo(parsed)
    infos.append(record)

In [7]:
def getFunctionaryUrls(pageNumber):
    """Return the employee page hrefs listed on one page of the listing.

    For every div with id 'listagem', the first table row is skipped and the
    href of the first link in each remaining row is collected, in document
    order.
    """
    listingPage = requestHTMLPage(urlBaseSite + urlBaseOrgaoExercicio + str(pageNumber))
    parsed = BeautifulSoup(listingPage, "html.parser")
    return [
        row.find_all('a')[0].get('href')
        for listing in parsed.find_all('div', id='listagem')
        for row in listing.find_all('tr')[1:]
    ]

In [8]:
# helpers that write the scraped records to a semicolon-separated CSV file
def toString(name, money, post, class_, level, org):
    """Format one record as a semicolon-separated line ending in a newline."""
    fields = (name, money, post, class_, level, org)
    return ';'.join(fields) + '\n'

def saveCSV(infos):
    """Write all records to ``base + 'servers.csv'`` as semicolon-separated text.

    Parameters
    ----------
    infos : iterable of 6-tuples
        Records in the order (name, money, post, class, level, org); each is
        expanded into toString.

    The file is overwritten. A header line is written first, followed by one
    line per record.
    """
    # The context manager closes the file even if a write fails. The
    # encoding is explicit because the scraped text contains accented
    # characters and the platform default is not UTF-8 everywhere.
    with open(base + 'servers.csv', 'w', encoding='utf-8') as csvFile:
        csvFile.write("name;money;post;class;level;org\n")
        for record in infos:
            csvFile.write(toString(*record))

In [13]:
def callGetFunctionaryInfo(urls, infos):
    """Scrape every URL in ``urls`` concurrently, one thread per URL.

    Each thread runs getFunctionaryInfo(url, infos). All threads are created
    first, then started, and the call returns only after every one of them
    has been joined.
    """
    workers = [Thread(target=getFunctionaryInfo, args=(url, infos)) for url in urls]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

# Number of queued employee URLs that triggers a download batch.
maxLen = 30
# Scraped records, appended to by the worker threads.
infos = []
# Employee page URLs collected from the listing but not yet downloaded.
urls = []
# Listing pages are requested as i + 1, i.e. pages 1..maxPage.
for i in range(0, maxPage):
    urls += getFunctionaryUrls(i + 1)
    # Flush once at least maxLen URLs are queued. One thread is started per
    # queued URL, so a batch can exceed maxLen when a page pushes the queue
    # past the threshold.
    if len(urls) >= maxLen:
        callGetFunctionaryInfo(urls, infos)
        urls = []
        print("Current page " + str(i + 1))
# Download whatever is still queued after the last listing page.
if len(urls) > 0:
    callGetFunctionaryInfo(urls, infos)
print("Finished")
saveCSV(infos)


