In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns



In [2]:
# Load the reimbursement receipts from the xz-compressed CSV.
# Identifier-like columns are forced to strings so pandas does not parse
# them as numbers (presumably to keep leading zeros -- confirm against the
# raw file). The built-in `str` is used because the `np.str` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
data = pd.read_csv('../data/2016-08-08-last-year.xz',
                   parse_dates=[16],  # date column is selected by position
                   dtype={'document_id': str,
                          'congressperson_id': str,
                          'congressperson_document': str,
                          'term_id': str,
                          'cnpj_cpf': str,
                          'reimbursement_number': str})

In [3]:
# Distinct subquota descriptions, in order of first appearance in `data`.
# The position of a description in this array is the integer index used
# by the per-subquota cells further down.
subquota_list = data.subquota_description.unique()

In [4]:
# Only the last expression of a cell is echoed, so this length is computed
# and discarded; the printed line below is the cell's only output.
len(subquota_list)
# Show one sample entry (position 4) of the distinct descriptions.
sample_description = subquota_list.item(4)
print(sample_description)

Security service provided by specialized company


### Checking net values from all the receipts

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the net
# value over every receipt.
data['net_value'].describe()

count    374484.000000
mean        570.566565
std        1993.167639
min       -9240.770000
25%          45.000000
50%         134.310000
75%         481.000000
max      189600.000000
Name: net_value, dtype: float64

In [6]:
# Group every receipt by the supplier identifier (CNPJ or CPF).
grouped = data.groupby('cnpj_cpf', as_index=False)

# len(data) counts receipts (rows); len(grouped) counts distinct cnpj_cpf keys.
receipt_count = len(data)
supplier_count = len(grouped)
print('{} total cnpj/cpfs, {} are unique'.format(receipt_count, supplier_count))

374484 total cnpj/cpfs, 24530 are unique


### Creating a dataframe with the first supplier name for each cnpj_cpf:


In [7]:
# One row per cnpj_cpf, carrying the supplier name found on that
# cnpj_cpf's first receipt. Built with vectorised operations instead of a
# Python loop over every group, and without reusing `names` for both a
# list and a DataFrame:
#   * rows without a cnpj_cpf are dropped (groupby skips missing keys too),
#   * drop_duplicates keeps the first receipt seen for each cnpj_cpf,
#   * sorting by cnpj_cpf reproduces the key order of a groupby iteration.
names = (
    data.dropna(subset=['cnpj_cpf'])
    .drop_duplicates(subset='cnpj_cpf')
    .sort_values(by='cnpj_cpf')
    .loc[:, ['cnpj_cpf', 'supplier']]
    .rename(columns={'supplier': 'supplier_name'})
    .reset_index(drop=True)
)
names.head()



Unnamed: 0,cnpj_cpf,supplier_name
0,1172000180,DIÁRIOS ASSOCIADOS
1,1388000226,DISTRIBUIDORA BRASILIA DE VEICULOS S/A
2,1388000307,DISTRIBUIDORA BRASÍLIA DE VEICULOS S/A
3,1388000579,DISTRIBUIDORA BRASÍLIA DE VEÍCULOS S/A
4,1974000190,POSTO 81 LTDA


## CNPJs/CPFs that received the largest total payments

In [8]:
# Total net value received by each cnpj_cpf, largest first, with the
# supplier name attached.
# The groupby is rebuilt from `data` here rather than read from the global
# `grouped`, because the per-subquota cell further down reassigns that name;
# re-running this cell afterwards would otherwise rank a single subquota.
# The aggregation is named by the string 'sum' (which skips missing values,
# like np.nansum) because passing the NumPy callable triggers a
# FutureWarning on pandas >= 2.1.
spent = (
    data.groupby('cnpj_cpf', as_index=False)
    .agg({'net_value': 'sum'})
    .sort_values(by='net_value', ascending=False)
    .merge(names, on='cnpj_cpf')
)
spent.head(10)

Unnamed: 0,cnpj_cpf,net_value,supplier_name
0,2012862000160,23639959.33,TAM LINHAS AÉREAS.
1,7575651000159,15170377.15,Cia Aérea - GOL
2,9296295000160,5763747.3,Cia Aérea - AZUL
3,2575829000148,4991633.14,Cia Aérea - AVIANCA
4,2558157000162,3365216.83,Telefonica Brasil S.A - VIVO
5,22005529000130,962800.01,DOUGLAS CUNHA DA SILVA ME
6,15193908000136,788299.96,JOSELY FERNANDA DO NASCIMENTO
7,17589509000114,679350.0,FATIMA FERREIRA DOS SANTOS ME
8,13230334000101,655780.0,InCine Video Ltda - ME
9,512777000135,513128.8,Cia Aérea - PASSAREDO


#### CNPJs/CPFs that received the most payments, broken down by subquota

In [29]:
# Per-subquota summary tables, keyed by the subquota's position in
# `subquota_list`:
#   subquota[x]  -> first 10 (cnpj_cpf, supplier_name) rows, ordered by cnpj_cpf
#   sub_spent[x] -> 10 suppliers with the largest summed net_value
#   sub_visit[x] -> 10 suppliers with the most receipts ("visits")
subquota = dict()
sub_spent = dict()
sub_visit = dict()

# Iterate over every distinct subquota instead of a hard-coded range(0, 18),
# so the cell keeps working if the dataset gains or loses a category.
# Loop-local names are used so the notebook-level `grouped`, `names` and
# `spent` from the earlier cells are not overwritten.
for x, description in enumerate(subquota_list):
    receipts = data[data.subquota_description == description]
    by_supplier = receipts.groupby('cnpj_cpf', as_index=False)
    print('{} have {} total cnpj/cpfs, {} are unique'.format(
        description, len(receipts), len(by_supplier)))

    # Supplier name taken from the first receipt of each cnpj_cpf, ordered
    # by cnpj_cpf (the same rows a loop over the groups would collect).
    supplier_names = (
        receipts.dropna(subset=['cnpj_cpf'])
        .drop_duplicates(subset='cnpj_cpf')
        .sort_values(by='cnpj_cpf')
        .loc[:, ['cnpj_cpf', 'supplier']]
        .rename(columns={'supplier': 'supplier_name'})
        .reset_index(drop=True)
    )
    subquota[x] = supplier_names.head(10)

    # Suppliers that received the most money in this subquota.
    totals = (
        by_supplier.agg({'net_value': 'sum'})
        .sort_values(by='net_value', ascending=False)
    )
    sub_spent[x] = totals.merge(supplier_names, on='cnpj_cpf').head(10)

    # Suppliers with the largest number of receipts in this subquota.
    # size() + reset_index(name=...) replaces agg({'visits': len}): renaming
    # through a dict on a single grouped column raises SpecificationError
    # on pandas >= 1.0.
    visit_counts = (
        receipts.groupby('cnpj_cpf')
        .size()
        .reset_index(name='visits')
        .sort_values(by='visits', ascending=False)
    )
    sub_visit[x] = visit_counts.merge(supplier_names, on='cnpj_cpf').head(10)


    

Maintenance of office supporting parliamentary activity have 21519 total cnpj/cpfs, 2239 are unique
Fuels and lubricants have 74690 total cnpj/cpfs, 9762 are unique
Consultancy, research and technical work have 3096 total cnpj/cpfs, 711 are unique
Publicity of parliamentary activity have 10208 total cnpj/cpfs, 2565 are unique
Security service provided by specialized company have 852 total cnpj/cpfs, 113 are unique
Flight tickets have 3182 total cnpj/cpfs, 98 are unique
Telecommunication have 37241 total cnpj/cpfs, 185 are unique
Postal services have 31919 total cnpj/cpfs, 373 are unique
Congressperson meal have 33540 total cnpj/cpfs, 5926 are unique
Automotive vehicle renting or charter have 5850 total cnpj/cpfs, 552 are unique
Watercraft renting or charter have 41 total cnpj/cpfs, 24 are unique
Taxi, toll and parking have 28077 total cnpj/cpfs, 1082 are unique
Flight ticket issue have 116993 total cnpj/cpfs, 5 are unique
Lodging, except for congressperson from Distrito Federal have 53

# Subquota lookup table (index → description)

In [23]:
# Print the integer index assigned to each subquota description. The indices
# come from enumerating `subquota_list` rather than a hard-coded
# range(0, 18), so the listing always matches the data that was loaded.
for x, description in enumerate(subquota_list):
    print('{} for : {}'.format(x, description))
print('search using "subquota[your selected number]"')

0 for : Maintenance of office supporting parliamentary activity
1 for : Fuels and lubricants
2 for : Consultancy, research and technical work
3 for : Publicity of parliamentary activity
4 for : Security service provided by specialized company
5 for : Flight tickets
6 for : Telecommunication
7 for : Postal services
8 for : Congressperson meal
9 for : Automotive vehicle renting or charter
10 for : Watercraft renting or charter
11 for : Taxi, toll and parking
12 for : Flight ticket issue
13 for : Lodging, except for congressperson from Distrito Federal
14 for : Aircraft renting or charter of aircraft
15 for : Terrestrial, maritime and fluvial tickets
16 for : Publication subscriptions
17 for : Participation in course, talk or similar event
search using "subquota[your selected number]"


### Use the cell below to search and understand each subquota

In [44]:
def subquota_info(x):
    """Display the stored summary tables for one subquota.

    Shows ``sub_visit[x]`` first and ``sub_spent[x]`` second, using
    IPython's rich display. Nothing is returned.

    Parameters
    ----------
    x : key into the ``sub_visit`` and ``sub_spent`` dictionaries
        (the subquota's integer index in this notebook).

    Raises
    ------
    KeyError
        If ``x`` is missing from either dictionary.
    """
    from IPython.display import display

    # Look up and render one table at a time, visits before spending.
    for tables in (sub_visit, sub_spent):
        display(tables[x])

In [49]:
# Index 13 is listed above as "Lodging, except for congressperson from
# Distrito Federal"; pick another index from that listing to inspect a
# different subquota.
subquota_info(x=13)

Unnamed: 0,cnpj_cpf,visits,supplier_name
0,762199000276,77,RCD EMPREENDIMENTOS LTDA
1,18287639000165,76,FLOR DO CERRADO HOSPEDAGEM
2,8928877000159,72,IBIS FLORIANÓPOLIS FLEX HOTELARIA
3,26418749000147,54,PAULO OCTAVIO HOTEIS E TURISMO LTDA
4,75817999000165,52,HOTEL BRÜGGEMANN DE TURISMO LTDA
5,76755404000157,47,PARANOA HOTEIS LTDA
6,4356463000141,35,BRISTOL HOTEL LTDA
7,366518000143,33,IRFATUR TUR E HOTEL SA ARACOARA
8,10532251000170,32,BMF HOTEL E TURISMO LTDA
9,2924895000186,30,ECONOTEL


Unnamed: 0,cnpj_cpf,net_value,supplier_name
0,76755404000157,66114.15,PARANOA HOTEIS LTDA
1,7721300000109,40529.59,LUZEIROS HOTÉIS S/A
2,16978175000108,34840.82,Adria viagens e turismo LTDA ME
3,26418749000147,33612.95,PAULO OCTAVIO HOTEIS E TURISMO LTDA
4,1982156000188,31582.0,PAIAGUAS HOTÉIS S/A
5,762199000276,31494.94,RCD EMPREENDIMENTOS LTDA
6,4443021000213,26650.0,EUROPA INVESTIMENTOS LTDA
7,18287639000165,21985.0,FLOR DO CERRADO HOSPEDAGEM
8,8928877000159,15941.05,IBIS FLORIANÓPOLIS FLEX HOTELARIA
9,4356463000141,15642.37,BRISTOL HOTEL LTDA
