In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns





In [2]:
data = pd.read_csv('../data/2016-11-19-reimbursements.xz',
               parse_dates=[16],
               dtype={'document_id': np.str,
                      'congressperson_id': np.str,
                      'congressperson_document': np.str,
                      'term_id': np.str,
                      'cnpj_cpf': np.str,
                      'reimbursement_number': np.str})

  interactivity=interactivity, compiler=compiler, result=result)


In [75]:
data.loc[0]

year                                                       2009
applicant_id                                               1001
document_id                                             1564212
reimbursement_value_total                                   NaN
total_net_value                                             130
reimbursement_numbers                                      2888
congressperson_name                            DILCEU SPERAFICO
congressperson_id                                         73768
congressperson_document                                     444
term                                                       2015
state                                                        PR
party                                                        PP
term_id                                                      55
subquota_number                                               3
subquota_description                       Fuels and lubricants
subquota_group_id                       

In [3]:
subquota_list = data['subquota_description'].unique()

In [4]:
len(subquota_list)
print (subquota_list.item(4))

Publicity of parliamentary activity


### Checking net values from all the receipts

In [6]:
data.total_net_value.describe()

count    1.532491e+06
mean     6.722007e+02
std      2.420299e+03
min      1.000000e-02
25%      4.856000e+01
50%      1.029300e+02
75%      2.600000e+02
max      2.150000e+05
Name: total_net_value, dtype: float64

In [7]:
grouped = data.groupby('cnpj_cpf', as_index=False)

print('{} total cnpj/cpfs, {} are unique'.format(len(data), len(grouped)))

1532491 total cnpj/cpfs, 81304 are unique


### Creating a dataframe with the first supplier name for each cnpj_cpf:


In [8]:
cnpj_cpfs = []
names = []
for group in grouped:
    cnpj_cpfs.append(group[0])
    names.append(group[1].iloc[0].supplier)

names = pd.DataFrame({'cnpj_cpf': cnpj_cpfs, 'supplier_name': names})
names.head()



Unnamed: 0,cnpj_cpf,supplier_name
0,0,TAM LINHAS AREAS S/A
1,0,PREFEITURA MUNICIPAL DE FORTALEZA
2,42000122,AUTO POSTO GASOL LTDA
3,42000203,AUTO POSTO GASOL
4,42000394,AUTO POSTO GASOL LTDA


## CNPJs/CPFs that received most payments 

In [9]:
spent = grouped.agg({'total_net_value': np.nansum}).sort_values(by='total_net_value', ascending=False)

spent = pd.merge(spent, names, on='cnpj_cpf')
spent.head(10)

Unnamed: 0,cnpj_cpf,total_net_value,supplier_name
0,2558157000162,15973718.68,TELEFONICA
1,4206050005140,7947819.33,TIM CELULAR
2,33000118000179,4689511.81,TELEMAR NORTE LESTE S/A
3,66970229000167,4289580.65,NEXTEL TELECOMUNICAÇÕES LTDA
4,2012862000160,3238907.6,TAM
5,11842749000100,2779381.99,ARL BARROS SERVIÇOS EXECUTIVOS RENT A CAR - ME
6,76535764000143,2767959.6,BRASIL TELECOM - OI
7,2449992000164,2756811.98,VIVO
8,22005529000130,2697800.34,DOUGLAS CUNHA DA SILVA ME
9,4114985000137,2497489.92,TOP LINE TÁXI AÉREO


#### CNPJ/CPFs that received most payments divided per subquota

In [10]:
subquota = dict()
sub_spent = dict()
sub_visit = dict()
for x in range(0, 18):
    foo = data[data.subquota_description == subquota_list.item(x) ]
    grouped = foo.groupby('cnpj_cpf', as_index=False)
    print(subquota_list.item(x) + ' have ' + '{} total cnpj/cpfs, {} are unique'.format(len(foo), len(grouped)))

    cnpj_cpfs = []
    names = []
    for group in grouped:
        cnpj_cpfs.append(group[0])
        names.append(group[1].iloc[0].supplier)

    names = pd.DataFrame({'cnpj_cpf': cnpj_cpfs, 'supplier_name': names})
    subquota[x] = names.head(10)
    #listing the ones with most spent amount of money
    spent = grouped.agg({'total_net_value': np.nansum}).sort_values(by='total_net_value', ascending=False)
    spent = pd.merge(spent, names, on='cnpj_cpf')
    sub_spent[x] = spent.head(10)
    #show the list with enterprises who received most number of visits
    visits = grouped['cnpj_cpf'].agg({'visits': len}).sort_values(by='visits', ascending=False)
    visits = pd.merge(visits, names, on='cnpj_cpf')
    sub_visit[x] = visits.head(10)


    

Fuels and lubricants have 569729 total cnpj/cpfs, 27385 are unique
Locomotion, meal and lodging have 15818 total cnpj/cpfs, 3705 are unique
Maintenance of office supporting parliamentary activity have 163470 total cnpj/cpfs, 7608 are unique
Software purchase or renting; Postal services; Subscriptions have 2635 total cnpj/cpfs, 589 are unique
Publicity of parliamentary activity have 61226 total cnpj/cpfs, 8945 are unique
Purchase of office supplies have 1601 total cnpj/cpfs, 619 are unique
Security service provided by specialized company have 6738 total cnpj/cpfs, 375 are unique
Flight tickets have 17063 total cnpj/cpfs, 588 are unique
Congressperson meal have 191724 total cnpj/cpfs, 24878 are unique
Telecommunication have 176502 total cnpj/cpfs, 629 are unique
Lodging, except for congressperson from Distrito Federal have 43812 total cnpj/cpfs, 6691 are unique
Automotive vehicle renting or watercraft charter have 145809 total cnpj/cpfs, 3693 are unique
Postal services have 20921 total c

# Dictionary for subquota

In [11]:
for x in range(0,18):
    # print (x + ' = ' + subquota_list.item(x))
    print ( '{} for : '.format(x) + subquota_list.item(x))
print ('search using "subquota[your selected number]"')

0 for : Fuels and lubricants
1 for : Locomotion, meal and lodging
2 for : Maintenance of office supporting parliamentary activity
3 for : Software purchase or renting; Postal services; Subscriptions
4 for : Publicity of parliamentary activity
5 for : Purchase of office supplies
6 for : Security service provided by specialized company
7 for : Flight tickets
8 for : Congressperson meal
9 for : Telecommunication
10 for : Lodging, except for congressperson from Distrito Federal
11 for : Automotive vehicle renting or watercraft charter
12 for : Postal services
13 for : Consultancy, research and technical work
14 for : Publication subscriptions
15 for : Taxi, toll and parking
16 for : Automotive vehicle renting or charter
17 for : Terrestrial, maritime and fluvial tickets
search using "subquota[your selected number]"


### Use the cell below to search and understand each subquota

In [12]:
#function to return all the info
def subquota_info(x):
    #return sub_visit[x], sub_spent[x]
    from IPython.display import display
    display(sub_visit[x])
    display(sub_spent[x])

In [122]:
subquota_info(12)

Unnamed: 0,cnpj_cpf,visits,supplier_name
0,66354457000102,521,ECT
1,2012862000160,457,TAM
2,86641438000174,419,ECT EMP. BRAS. CORREIOS E TELÉGRAFOS
3,583615000198,316,CORREIOS
4,42947184000141,269,ECT
5,34028316291329,269,ECT
6,34028316450575,257,ECT.EMP.BRAS.DE CORREIOS E TELEGRÁFOS
7,1611770000133,249,ECT - EMP. BRAS. CORREIOS E TELEGRAFOS
8,11567785000103,249,Emp. Bras. de Correios e Telegrafos
9,467855000127,245,ECT - EMPRESA BRAS. DE CORREIOS E TELEG.


Unnamed: 0,cnpj_cpf,total_net_value,supplier_name
0,34028316577052,630517.17,ECT
1,66354457000102,395789.52,ECT
2,42943753000180,256700.99,CORREIOS ACF NIQUELINA LTDA
3,35905090000144,211529.57,DIREMADI MARKETING E SERVIÇOS LTDA
4,85309664000190,211348.05,ECT - CORREIOS E TELEGRAFOS
5,1670180000181,186500.0,A.M CAVALCANTI
6,11835000147,174498.85,ECT- EMPRES. BRAS. DE CORREIOS E TELÉGRAFOS
7,42775460000131,133226.39,CORREIOS
8,2605360000142,132949.63,ECT
9,38052841000115,129380.92,JRTV FRANQUIA POSTAL LTDA


In [101]:


def most_expensive_from_cnpj(cnpj):
    foobar = data[data['cnpj_cpf'] == '{}'.format(cnpj)]
    foobar = foobar.sort_values(by='total_net_value', ascending=False)
    return foobar.head(10)

In [120]:
most_expensive_from_cnpj('76755404000157')

Unnamed: 0,year,applicant_id,document_id,reimbursement_value_total,total_net_value,reimbursement_numbers,congressperson_name,congressperson_id,congressperson_document,term,...,issue_date,document_value,remark_value,net_values,month,installment,passenger,leg_of_the_trip,batch_number,reimbursement_values
387644,2011,1133,2120717,,6300.0,3695,NELSON MEURER,73781,458,2015.0,...,2011-07-01T00:00:00,6300.0,0.0,6300.0,6,0,,,549919,
387652,2011,1133,2126390,,6090.0,3705,NELSON MEURER,73781,458,2015.0,...,2011-08-01T00:00:00,6090.0,0.0,6090.0,7,0,,,555396,
387563,2011,1133,2089141,,6080.0,3665,NELSON MEURER,73781,458,2015.0,...,2011-06-01T00:00:00,6080.0,0.0,6080.0,5,0,,,543789,
387549,2011,1133,2088556,,5320.0,3665,NELSON MEURER,73781,458,2015.0,...,2011-05-02T00:00:00,5320.0,0.0,5320.0,4,0,,,543797,
387764,2011,1133,2152927,,5250.0,3729,NELSON MEURER,73781,458,2015.0,...,2011-09-01T00:00:00,5250.0,0.0,5250.0,8,0,,,562425,
387322,2011,1133,2017511,,4940.0,3579,NELSON MEURER,73781,458,2015.0,...,2011-03-01T00:00:00,4940.0,0.0,4940.0,2,0,,,522237,
387406,2011,1133,2040005,,4940.0,3612,NELSON MEURER,73781,458,2015.0,...,2011-04-04T00:00:00,4940.0,0.0,4940.0,3,0,,,529108,
185881,2010,1133,1995811,,4750.0,3530,NELSON MEURER,73781,458,2015.0,...,2011-01-05T00:00:00,4750.0,0.0,4750.0,12,0,,,515948,
185809,2010,1133,1971465,,4370.0,3481,NELSON MEURER,73781,458,2015.0,...,2010-12-01T00:00:00,4370.0,0.0,4370.0,11,0,,,508270,
185783,2010,1133,1956139,,4180.0,3465,NELSON MEURER,73781,458,2015.0,...,2010-11-02T00:00:00,4180.0,0.0,4180.0,10,0,,,501187,


In [121]:
data.loc[387644]

year                                                                       2011
applicant_id                                                               1133
document_id                                                             2120717
reimbursement_value_total                                                   NaN
total_net_value                                                            6300
reimbursement_numbers                                                      3695
congressperson_name                                               NELSON MEURER
congressperson_id                                                         73781
congressperson_document                                                     458
term                                                                       2015
state                                                                        PR
party                                                                        PP
term_id                                 