In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns



In [2]:
# Load the reimbursements dataset. Identifier-like columns are forced to
# strings instead of letting pandas infer a numeric dtype for them.
# The builtin `str` is used because the `np.str` alias was deprecated in
# NumPy 1.20 and removed in 1.24 (it was only ever an alias for `str`).
data = pd.read_csv('../data/2016-08-08-last-year.xz',
                   parse_dates=[16],  # positional index of the 'issue_date' column
                   dtype={'document_id': str,
                          'congressperson_id': str,
                          'congressperson_document': str,
                          'term_id': str,
                          'cnpj_cpf': str,
                          'reimbursement_number': str})

### Listing the subquota descriptions

In [7]:
# Show every column name of the loaded frame as a plain Python list.
data.columns.tolist()

['document_id',
 'congressperson_name',
 'congressperson_id',
 'congressperson_document',
 'term',
 'state',
 'party',
 'term_id',
 'subquota_number',
 'subquota_description',
 'subquota_group_id',
 'subquota_group_description',
 'supplier',
 'cnpj_cpf',
 'document_number',
 'document_type',
 'issue_date',
 'document_value',
 'remark_value',
 'net_value',
 'month',
 'year',
 'installment',
 'passenger',
 'leg_of_the_trip',
 'batch_number',
 'reimbursement_number',
 'reimbursement_value',
 'applicant_id']

In [11]:
# Distinct values found in the subquota description column.
subquota_list = data.subquota_description.unique()
print(subquota_list)

['Maintenance of office supporting parliamentary activity'
 'Fuels and lubricants' 'Consultancy, research and technical work'
 'Publicity of parliamentary activity'
 'Security service provided by specialized company' 'Flight tickets'
 'Telecommunication' 'Postal services' 'Congressperson meal'
 'Automotive vehicle renting or charter' 'Watercraft renting or charter'
 'Taxi, toll and parking' 'Flight ticket issue'
 'Lodging, except for congressperson from Distrito Federal'
 'Aircraft renting or charter of aircraft'
 'Terrestrial, maritime and fluvial tickets' 'Publication subscriptions'
 'Participation in course, talk or similar event']


In [13]:
# Number of distinct subquota descriptions (subquota_list is one-dimensional).
subquota_list.size

18

### End of subquota listings - WIP

### Checking net values from all the receipts

In [14]:
# Summary statistics (count, mean, std, quartiles, extremes) of the net value column.
data['net_value'].describe()

count    374484.000000
mean        570.566565
std        1993.167639
min       -9240.770000
25%          45.000000
50%         134.310000
75%         481.000000
max      189600.000000
Name: net_value, dtype: float64

In [15]:
# Group the receipts by the cnpj_cpf column; as_index=False keeps the key as
# a regular column in aggregated results.
grouped = data.groupby('cnpj_cpf', as_index=False)

# Compare the number of rows in the dataset with the number of distinct keys.
total_rows = len(data)
distinct_keys = grouped.ngroups
print('{} total cnpj/cpfs, {} are unique'.format(total_rows, distinct_keys))

374484 total cnpj/cpfs, 24530 are unique


### Creating a dataframe with the first supplier name for each cnpj_cpf:


In [16]:
# One row per cnpj_cpf, carrying the supplier name found on the first receipt
# for that key. This is a vectorised replacement for looping over every group:
#   * rows without a cnpj_cpf are dropped, matching groupby's default of
#     excluding missing keys;
#   * the first occurrence of each key is kept (the supplier value is taken
#     as-is, even if it is missing);
#   * rows are sorted by key, the same order groupby iteration yields.
names = (
    data[['cnpj_cpf', 'supplier']]
    .dropna(subset=['cnpj_cpf'])
    .drop_duplicates(subset='cnpj_cpf', keep='first')
    .rename(columns={'supplier': 'supplier_name'})
    .sort_values(by='cnpj_cpf')
    .reset_index(drop=True)
)
names.head()



Unnamed: 0,cnpj_cpf,supplier_name
0,1172000180,DIÁRIOS ASSOCIADOS
1,1388000226,DISTRIBUIDORA BRASILIA DE VEICULOS S/A
2,1388000307,DISTRIBUIDORA BRASÍLIA DE VEICULOS S/A
3,1388000579,DISTRIBUIDORA BRASÍLIA DE VEÍCULOS S/A
4,1974000190,POSTO 81 LTDA


### CNPJs/CPFs that received the most payments

In [20]:
# Total net value per cnpj_cpf, largest first. The string alias 'sum' is used
# instead of np.nansum: the groupby sum already skips NaN, and recent pandas
# versions emit a FutureWarning when NumPy reducers are passed to agg.
spent = grouped.agg({'net_value': 'sum'}).sort_values(by='net_value', ascending=False)

# Attach the supplier name recorded for each cnpj_cpf and show the top ten.
spent = pd.merge(spent, names, on='cnpj_cpf')
spent.head(10)

Unnamed: 0,cnpj_cpf,net_value,supplier_name
0,2012862000160,23639959.33,TAM LINHAS AÉREAS.
1,7575651000159,15170377.15,Cia Aérea - GOL
2,9296295000160,5763747.3,Cia Aérea - AZUL
3,2575829000148,4991633.14,Cia Aérea - AVIANCA
4,2558157000162,3365216.83,Telefonica Brasil S.A - VIVO
5,22005529000130,962800.01,DOUGLAS CUNHA DA SILVA ME
6,15193908000136,788299.96,JOSELY FERNANDA DO NASCIMENTO
7,17589509000114,679350.0,FATIMA FERREIRA DOS SANTOS ME
8,13230334000101,655780.0,InCine Video Ltda - ME
9,512777000135,513128.8,Cia Aérea - PASSAREDO


# Stopping now - starting investigation for each micro-enterprise (ME) listed