Skip to content

Commit

Permalink
Script to fetch amendments dataset from SICONV related to issue okfn-…
Browse files Browse the repository at this point in the history
…brasil#67

- fetching siconv_emenda.csv.zip and saving it as data/amendments.xz
- translating column names to English
- TODO: download/create column documentation and fetch beneficiaries' info in src/fetch_cnpj_info.py
  • Loading branch information
marcusrehm committed Dec 13, 2016
1 parent 7b49e59 commit aa7cb50
Show file tree
Hide file tree
Showing 2 changed files with 229 additions and 0 deletions.
185 changes: 185 additions & 0 deletions develop/2016-12-12-marcusrehm-amendments.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(80938, 10)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"data = pd.read_csv('../data/amendments.xz', dtype={'amendment_beneficiary': np.str})\n",
"data.shape"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>proposal_id</th>\n",
" <th>proponent_qualification</th>\n",
" <th>amendment_program_code</th>\n",
" <th>amendment_number</th>\n",
" <th>congressperson_name</th>\n",
" <th>amendment_beneficiary</th>\n",
" <th>tax_indicative</th>\n",
" <th>congressperson_type</th>\n",
" <th>amendment_proposal_tranfer_value</th>\n",
" <th>amendment_tranfer_value</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>588275</td>\n",
" <td>BENEFICIARIO_EMENDA_PARLAMENTAR</td>\n",
" <td>5100020120077</td>\n",
" <td>31250010</td>\n",
" <td>ANTONIO BALHMANN</td>\n",
" <td>7535446000160</td>\n",
" <td>NÃO</td>\n",
" <td>INDIVIDUAL</td>\n",
" <td>292500</td>\n",
" <td>292500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1066705</td>\n",
" <td>BENEFICIARIO_EMENDA_PARLAMENTAR</td>\n",
" <td>2200020160030</td>\n",
" <td>28650006</td>\n",
" <td>JOSE STEDILE</td>\n",
" <td>94436342000100</td>\n",
" <td>SIM</td>\n",
" <td>INDIVIDUAL</td>\n",
" <td>243750</td>\n",
" <td>243750</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>490954</td>\n",
" <td>BENEFICIARIO_EMENDA_PARLAMENTAR</td>\n",
" <td>2200020110005</td>\n",
" <td>10560008</td>\n",
" <td>SERGIO SOUZA</td>\n",
" <td>75771279000106</td>\n",
" <td>NÃO</td>\n",
" <td>INDIVIDUAL</td>\n",
" <td>97500</td>\n",
" <td>97500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>646294</td>\n",
" <td>BENEFICIARIO_EMENDA_PARLAMENTAR</td>\n",
" <td>5600020120020</td>\n",
" <td>27650005</td>\n",
" <td>RENZO BRAZ</td>\n",
" <td>66230384000147</td>\n",
" <td>NÃO</td>\n",
" <td>INDIVIDUAL</td>\n",
" <td>265630</td>\n",
" <td>264245,4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>586235</td>\n",
" <td>BENEFICIARIO_EMENDA_PARLAMENTAR</td>\n",
" <td>5200020120071</td>\n",
" <td>26850008</td>\n",
" <td>PADRE TON</td>\n",
" <td>4092714000128</td>\n",
" <td>NÃO</td>\n",
" <td>INDIVIDUAL</td>\n",
" <td>250000</td>\n",
" <td>250000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" proposal_id proponent_qualification amendment_program_code \\\n",
"0 588275 BENEFICIARIO_EMENDA_PARLAMENTAR 5100020120077 \n",
"1 1066705 BENEFICIARIO_EMENDA_PARLAMENTAR 2200020160030 \n",
"2 490954 BENEFICIARIO_EMENDA_PARLAMENTAR 2200020110005 \n",
"3 646294 BENEFICIARIO_EMENDA_PARLAMENTAR 5600020120020 \n",
"4 586235 BENEFICIARIO_EMENDA_PARLAMENTAR 5200020120071 \n",
"\n",
" amendment_number congressperson_name amendment_beneficiary tax_indicative \\\n",
"0 31250010 ANTONIO BALHMANN 7535446000160 NÃO \n",
"1 28650006 JOSE STEDILE 94436342000100 SIM \n",
"2 10560008 SERGIO SOUZA 75771279000106 NÃO \n",
"3 27650005 RENZO BRAZ 66230384000147 NÃO \n",
"4 26850008 PADRE TON 4092714000128 NÃO \n",
"\n",
" congressperson_type amendment_proposal_tranfer_value amendment_tranfer_value \n",
"0 INDIVIDUAL 292500 292500 \n",
"1 INDIVIDUAL 243750 243750 \n",
"2 INDIVIDUAL 97500 97500 \n",
"3 INDIVIDUAL 265630 264245,4 \n",
"4 INDIVIDUAL 250000 250000 "
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataset.head()\n",
"\n",
"# is_cnpj = dataset['amendment_beneficiary'].str.len() == 14.\n",
"# cnpj_list = set(dataset.loc[is_cnpj, 'amendment_beneficiary'])\n",
"# print(cnpj_list)\n",
"# already_fetched = set(dataset['amendment_beneficiary'].str.replace(r'[./-]', ''))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
44 changes: 44 additions & 0 deletions src/fetch_amendments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import os
import zipfile
import pandas as pd
import numpy as np
from urllib.request import urlretrieve

# Portuguese SICONV column names mapped to the English names written to the
# saved dataset. The 'tranfer' spelling is kept exactly as-is: these strings
# are the dataset's column names, so changing them would rename the columns.
_COLUMN_TRANSLATIONS = {
    'ID_PROPOSTA': 'proposal_id',
    'QUALIF_PROPONENTE': 'proponent_qualification',
    'COD_PROGRAMA_EMENDA': 'amendment_program_code',
    'NR_EMENDA': 'amendment_number',
    'NOME_PARLAMENTAR': 'congressperson_name',
    'BENEFICIARIO_EMENDA': 'amendment_beneficiary',
    'IND_IMPOSITIVO': 'tax_indicative',
    'TIPO_PARLAMENTAR': 'congressperson_type',
    'VALOR_REPASSE_PROPOSTA_EMENDA': 'amendment_proposal_tranfer_value',
    'VALOR_REPASSE_EMENDA': 'amendment_tranfer_value',
}


def download_source(datasets_urls=None, datasets_names=None, data_dir='data'):
    """Download zipped SICONV CSVs and save them as xz-compressed datasets.

    For each URL the zip archive is downloaded into ``data_dir``, extracted,
    read as a ``;``-separated CSV, its columns renamed to English, and the
    result written as a ``,``-separated, xz-compressed CSV. The downloaded
    archive and the extracted CSV are removed afterwards, even on failure.

    The archive is expected to contain a CSV named like the archive without
    its ``.zip`` suffix (e.g. ``siconv_emenda.csv.zip`` -> ``siconv_emenda.csv``).

    Args:
        datasets_urls: URLs of the zip archives to fetch. Defaults to the
            SICONV amendments archive.
        datasets_names: Output file names, one per URL. Defaults to
            ``['amendments.xz']``.
        data_dir: Directory used for temporary and output files. It is
            created if it does not exist.

    Raises:
        ValueError: If the number of URLs and output names differ.
    """
    if datasets_urls is None:
        datasets_urls = [
            'http://portal.convenios.gov.br/images/docs/CGSIS/csv/siconv_emenda.csv.zip']
    if datasets_names is None:
        datasets_names = ['amendments.xz']

    datasets_urls = list(datasets_urls)
    datasets_names = list(datasets_names)
    if len(datasets_urls) != len(datasets_names):
        raise ValueError('datasets_urls and datasets_names must have the same length')

    os.makedirs(data_dir, exist_ok=True)

    for url, dataset_name in zip(datasets_urls, datasets_names):
        filename = url.split('/')[-1]
        filepath = '%s/%s' % (data_dir, filename)
        # Derive the extracted CSV path from the file name only, so a
        # '.zip' substring elsewhere in the path cannot corrupt it.
        csv_name = filename[:-len('.zip')] if filename.endswith('.zip') else filename
        csv_path = '%s/%s' % (data_dir, csv_name)

        try:
            print('Downloading %s' % filename)
            urlretrieve(url, filepath)
            with zipfile.ZipFile(filepath, 'r') as zip_ref:
                zip_ref.extractall(data_dir)

            print('Renaming columns in: %s' % csv_path)
            # Beneficiary identifiers are read as text: numeric inference
            # would strip their leading zeros.
            data = pd.read_csv(filepath_or_buffer=csv_path, sep=';',
                               dtype={'BENEFICIARIO_EMENDA': str})
            data = data.rename(columns=_COLUMN_TRANSLATIONS)

            print('Saving %s dataset' % dataset_name)
            data.to_csv(path_or_buf='%s/%s' % (data_dir, dataset_name),
                        sep=',', compression='xz', index=False)
        finally:
            print('Removing temporary files')
            for temporary_path in (filepath, csv_path):
                # A failed download or extraction may leave one of them missing.
                if os.path.exists(temporary_path):
                    os.remove(temporary_path)

# Script entry point: runs the download whenever this file is executed.
# There is no `if __name__ == '__main__'` guard, so importing it does too.
download_source()

0 comments on commit aa7cb50

Please sign in to comment.