forked from okfn-brasil/serenata-de-amor
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Script to fetch amendments dataset from SICONV related to issue okfn-brasil#67
- fetching emendas.csv and saving as data/amendments.xz - translating column names to English - TODO: download/create column documentation and fetch beneficiary info in src/fetch_cnpj_info.py
- Loading branch information
1 parent
7b49e59
commit aa7cb50
Showing
2 changed files
with
229 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,185 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 12, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"(80938, 10)" | ||
] | ||
}, | ||
"execution_count": 12, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"import numpy as np\n", | ||
"\n", | ||
"data = pd.read_csv('../data/amendments.xz', dtype={'amendment_beneficiary': np.str})\n", | ||
"data.shape" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 13, | ||
"metadata": { | ||
"collapsed": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>proposal_id</th>\n", | ||
" <th>proponent_qualification</th>\n", | ||
" <th>amendment_program_code</th>\n", | ||
" <th>amendment_number</th>\n", | ||
" <th>congressperson_name</th>\n", | ||
" <th>amendment_beneficiary</th>\n", | ||
" <th>tax_indicative</th>\n", | ||
" <th>congressperson_type</th>\n", | ||
" <th>amendment_proposal_tranfer_value</th>\n", | ||
" <th>amendment_tranfer_value</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>588275</td>\n", | ||
" <td>BENEFICIARIO_EMENDA_PARLAMENTAR</td>\n", | ||
" <td>5100020120077</td>\n", | ||
" <td>31250010</td>\n", | ||
" <td>ANTONIO BALHMANN</td>\n", | ||
" <td>7535446000160</td>\n", | ||
" <td>NÃO</td>\n", | ||
" <td>INDIVIDUAL</td>\n", | ||
" <td>292500</td>\n", | ||
" <td>292500</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1</th>\n", | ||
" <td>1066705</td>\n", | ||
" <td>BENEFICIARIO_EMENDA_PARLAMENTAR</td>\n", | ||
" <td>2200020160030</td>\n", | ||
" <td>28650006</td>\n", | ||
" <td>JOSE STEDILE</td>\n", | ||
" <td>94436342000100</td>\n", | ||
" <td>SIM</td>\n", | ||
" <td>INDIVIDUAL</td>\n", | ||
" <td>243750</td>\n", | ||
" <td>243750</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>2</th>\n", | ||
" <td>490954</td>\n", | ||
" <td>BENEFICIARIO_EMENDA_PARLAMENTAR</td>\n", | ||
" <td>2200020110005</td>\n", | ||
" <td>10560008</td>\n", | ||
" <td>SERGIO SOUZA</td>\n", | ||
" <td>75771279000106</td>\n", | ||
" <td>NÃO</td>\n", | ||
" <td>INDIVIDUAL</td>\n", | ||
" <td>97500</td>\n", | ||
" <td>97500</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>3</th>\n", | ||
" <td>646294</td>\n", | ||
" <td>BENEFICIARIO_EMENDA_PARLAMENTAR</td>\n", | ||
" <td>5600020120020</td>\n", | ||
" <td>27650005</td>\n", | ||
" <td>RENZO BRAZ</td>\n", | ||
" <td>66230384000147</td>\n", | ||
" <td>NÃO</td>\n", | ||
" <td>INDIVIDUAL</td>\n", | ||
" <td>265630</td>\n", | ||
" <td>264245,4</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>4</th>\n", | ||
" <td>586235</td>\n", | ||
" <td>BENEFICIARIO_EMENDA_PARLAMENTAR</td>\n", | ||
" <td>5200020120071</td>\n", | ||
" <td>26850008</td>\n", | ||
" <td>PADRE TON</td>\n", | ||
" <td>4092714000128</td>\n", | ||
" <td>NÃO</td>\n", | ||
" <td>INDIVIDUAL</td>\n", | ||
" <td>250000</td>\n", | ||
" <td>250000</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" proposal_id proponent_qualification amendment_program_code \\\n", | ||
"0 588275 BENEFICIARIO_EMENDA_PARLAMENTAR 5100020120077 \n", | ||
"1 1066705 BENEFICIARIO_EMENDA_PARLAMENTAR 2200020160030 \n", | ||
"2 490954 BENEFICIARIO_EMENDA_PARLAMENTAR 2200020110005 \n", | ||
"3 646294 BENEFICIARIO_EMENDA_PARLAMENTAR 5600020120020 \n", | ||
"4 586235 BENEFICIARIO_EMENDA_PARLAMENTAR 5200020120071 \n", | ||
"\n", | ||
" amendment_number congressperson_name amendment_beneficiary tax_indicative \\\n", | ||
"0 31250010 ANTONIO BALHMANN 7535446000160 NÃO \n", | ||
"1 28650006 JOSE STEDILE 94436342000100 SIM \n", | ||
"2 10560008 SERGIO SOUZA 75771279000106 NÃO \n", | ||
"3 27650005 RENZO BRAZ 66230384000147 NÃO \n", | ||
"4 26850008 PADRE TON 4092714000128 NÃO \n", | ||
"\n", | ||
" congressperson_type amendment_proposal_tranfer_value amendment_tranfer_value \n", | ||
"0 INDIVIDUAL 292500 292500 \n", | ||
"1 INDIVIDUAL 243750 243750 \n", | ||
"2 INDIVIDUAL 97500 97500 \n", | ||
"3 INDIVIDUAL 265630 264245,4 \n", | ||
"4 INDIVIDUAL 250000 250000 " | ||
] | ||
}, | ||
"execution_count": 13, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"dataset.head()\n", | ||
"\n", | ||
"# is_cnpj = dataset['amendment_beneficiary'].str.len() == 14.\n", | ||
"# cnpj_list = set(dataset.loc[is_cnpj, 'amendment_beneficiary'])\n", | ||
"# print(cnpj_list)\n", | ||
"# already_fetched = set(dataset['amendment_beneficiary'].str.replace(r'[./-]', ''))" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.5.2" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import os | ||
import zipfile | ||
import pandas as pd | ||
import numpy as np | ||
from urllib.request import urlretrieve | ||
|
||
# Maps the SICONV (Portuguese) CSV headers to the English column names written
# to the saved dataset.
# NOTE(review): the 'tranfer' spelling is kept byte-for-byte so the output
# schema does not change; fixing it needs a coordinated change in consumers.
_AMENDMENTS_COLUMNS = {
    'ID_PROPOSTA': 'proposal_id',
    'QUALIF_PROPONENTE': 'proponent_qualification',
    'COD_PROGRAMA_EMENDA': 'amendment_program_code',
    'NR_EMENDA': 'amendment_number',
    'NOME_PARLAMENTAR': 'congressperson_name',
    'BENEFICIARIO_EMENDA': 'amendment_beneficiary',
    'IND_IMPOSITIVO': 'tax_indicative',
    'TIPO_PARLAMENTAR': 'congressperson_type',
    'VALOR_REPASSE_PROPOSTA_EMENDA': 'amendment_proposal_tranfer_value',
    'VALOR_REPASSE_EMENDA': 'amendment_tranfer_value',
}


def _strip_zip_suffix(path):
    """Return ``path`` without a trailing '.zip' (unchanged if it has none)."""
    suffix = '.zip'
    return path[:-len(suffix)] if path.endswith(suffix) else path


def download_source(datasets_urls=None, datasets_names=None, data_dir='data'):
    """Download zipped ';'-separated CSV files and save them as xz CSVs.

    For each URL the archive is downloaded into ``data_dir`` and extracted
    there. The extracted CSV is expected to be named like the archive minus
    its '.zip' suffix. Its columns are renamed to English and the result is
    written, comma-separated and xz-compressed, to ``data_dir/<dataset name>``.
    The downloaded archive and the extracted CSV are always removed afterwards,
    even when a step fails.

    Args:
        datasets_urls: iterable of archive URLs. Defaults to the SICONV
            amendments archive.
        datasets_names: iterable of output file names, one per URL.
            Defaults to ``['amendments.xz']``.
        data_dir: directory used for temporary and output files; created if
            it does not exist. Defaults to ``'data'``.

    Raises:
        ValueError: if the number of URLs and output names differ.
    """
    if datasets_urls is None:
        datasets_urls = [
            'http://portal.convenios.gov.br/images/docs/CGSIS/csv/siconv_emenda.csv.zip']
    if datasets_names is None:
        datasets_names = ['amendments.xz']
    datasets_urls = list(datasets_urls)
    datasets_names = list(datasets_names)
    if len(datasets_urls) != len(datasets_names):
        # zip() would silently drop the unmatched entries otherwise.
        raise ValueError(
            'datasets_urls and datasets_names must have the same length')

    os.makedirs(data_dir, exist_ok=True)

    for url, dataset_name in zip(datasets_urls, datasets_names):
        filename = url.split('/')[-1]
        filepath = os.path.join(data_dir, filename)
        # Only the suffix is stripped; str.replace would also rewrite a
        # '.zip' appearing elsewhere in the path.
        csv_path = _strip_zip_suffix(filepath)

        try:
            print('Downloading %s' % filename)
            urlretrieve(url, filepath)
            # The context manager closes the archive even if extraction fails.
            with zipfile.ZipFile(filepath, 'r') as archive:
                archive.extractall(data_dir)

            print('Renaming columns in: %s' % csv_path)
            # The beneficiary identifier is read as text: integer parsing
            # would drop any leading zeros from the document number.
            data = pd.read_csv(filepath_or_buffer=csv_path,
                               sep=';',
                               dtype={'BENEFICIARIO_EMENDA': str})
            data = data.rename(columns=_AMENDMENTS_COLUMNS)

            print('Saving %s dataset' % dataset_name)
            data.to_csv(path_or_buf=os.path.join(data_dir, dataset_name),
                        sep=',',
                        compression='xz',
                        index=False)
        finally:
            print('Removing temporary files')
            for temporary_path in (filepath, csv_path):
                # A failed step may have left one of the files uncreated.
                if os.path.exists(temporary_path):
                    os.remove(temporary_path)
|
||
# Script entry point: fetch the source archive and save the translated dataset.
download_source()