# Piaui Herald - Exploratory Data Analysis
Finding interesting cases for Rosie's column

In [1]:
import numpy as np
import pandas as pd

# Load the reimbursements dump. Identifier-like columns are read as strings so
# they are not parsed as numbers; document_id is forced to an integer.
# The builtin `str` / `int` are used because the `np.str` / `np.int` aliases
# (which were just those builtins) were removed in NumPy 1.24.
dataset = pd.read_csv('../../../serenata-data/2017-03-15-reimbursements.xz',
                      dtype={'applicant_id': str,
                             'cnpj_cpf': str,
                             'congressperson_id': str,
                             'subquota_number': str,
                             'document_id': int},
                      low_memory=False)

## Luxury Hotel

In [2]:
# Keep only the lodging reimbursements, then group them so that each group is
# one (congressperson, supplier CNPJ/CPF, supplier name) combination.
is_lodging = dataset['subquota_description'] == 'Lodging, except for congressperson from Distrito Federal'
lodging = dataset.loc[is_lodging]
keys = ['congressperson_id','cnpj_cpf', 'supplier']
grouped = lodging.groupby(by=keys)

Number of distinct congressperson/supplier pairs with lodging expenses

In [3]:
# Size of the groupby object: one entry per distinct
# (congressperson_id, cnpj_cpf, supplier) combination, not per reimbursement.
len(grouped)

21063

In [4]:
# For every group, concatenate the subquota numbers of its reimbursements into
# one comma-separated string, then turn the group keys back into columns.
subquota_numbers = (
    grouped['subquota_number']
    .agg(lambda numbers: ','.join(numbers))
    .reset_index()
)
subquota_numbers.head()

Unnamed: 0,congressperson_id,cnpj_cpf,supplier,subquota_number
0,101309,2697172000191,HOTEL E RESTAURANTE CARAIVA,1414
1,101309,4652379000175,HOTEL ROYAL LTDA,1414141414
2,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,1414
3,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,14
4,101309,7881411000182,ITEM HOTELARIA TURISMO ESTACIONAMENTO P/ VEICU...,14


In [5]:
# Collect the document ids of each group into a tuple (one tuple per group),
# then turn the group keys back into columns.
document_ids = (
    grouped['document_id']
    .agg(lambda ids: tuple(ids))
    .reset_index()
)
document_ids.head()

Unnamed: 0,congressperson_id,cnpj_cpf,supplier,document_id
0,101309,2697172000191,HOTEL E RESTAURANTE CARAIVA,"(5769892, 5769895)"
1,101309,4652379000175,HOTEL ROYAL LTDA,"(5789888, 5942187, 5978852, 6005283, 6019174)"
2,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,"(5796458, 5922475)"
3,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,"(6041598,)"
4,101309,7881411000182,ITEM HOTELARIA TURISMO ESTACIONAMENTO P/ VEICU...,"(5622880,)"


In [6]:
# Total net value reimbursed per group. The dict-renaming form
# `.agg({'sum': np.sum})` on a single column was deprecated and raises
# SpecificationError since pandas 1.0, so sum directly and name the
# resulting column 'sum' explicitly.
net_values_sum = grouped['total_net_value'].sum().reset_index(name='sum')
net_values_sum.head()

Unnamed: 0,congressperson_id,cnpj_cpf,supplier,sum
0,101309,2697172000191,HOTEL E RESTAURANTE CARAIVA,128.5
1,101309,4652379000175,HOTEL ROYAL LTDA,1890.9
2,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,617.7
3,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,953.4
4,101309,7881411000182,ITEM HOTELARIA TURISMO ESTACIONAMENTO P/ VEICU...,194.0


In [7]:
# Join the three per-group summaries (subquota numbers, document ids, summed
# net value) on the grouping keys into a single frame.
aggregation = (
    subquota_numbers
    .merge(document_ids, on=keys)
    .merge(net_values_sum, on=keys)
)
aggregation.head()

Unnamed: 0,congressperson_id,cnpj_cpf,supplier,subquota_number,document_id,sum
0,101309,2697172000191,HOTEL E RESTAURANTE CARAIVA,1414,"(5769892, 5769895)",128.5
1,101309,4652379000175,HOTEL ROYAL LTDA,1414141414,"(5789888, 5942187, 5978852, 6005283, 6019174)",1890.9
2,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,1414,"(5796458, 5922475)",617.7
3,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,14,"(6041598,)",953.4
4,101309,7881411000182,ITEM HOTELARIA TURISMO ESTACIONAMENTO P/ VEICU...,14,"(5622880,)",194.0


Get the highest single net value, and the document it belongs to, for each row

In [8]:
def get_top_net_value(row):
    """Find the most expensive reimbursement among a row's documents.

    Looks up, in the module-level ``dataset`` frame, the ``total_net_value``
    of every id listed in ``row['document_id']`` and reports the largest one.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row['document_id']``, an iterable of document ids.

    Returns
    -------
    dict
        ``{'top_net_value': <float>, 'top_document': <document id>}``.
        On ties, the document that comes first in ``row['document_id']`` wins.

    Raises
    ------
    ValueError
        If none of the listed documents has a net value in ``dataset``
        (including when the list is empty).
    """
    document_ids = list(row['document_id'])

    # One boolean-mask pass for the whole row instead of one full scan of
    # `dataset` per document id.
    in_row = dataset['document_id'].isin(document_ids)

    # Collapse to one value per document id (the largest, should an id appear
    # on several rows) so the lookup below is never ambiguous.
    value_by_document = (dataset.loc[in_row]
                         .groupby('document_id')['total_net_value']
                         .max())

    # Put the values back in the order of the row's tuple so ties resolve to
    # the first listed document; ids without a match become NaN and are
    # ignored by max()/idxmax().
    net_values = value_by_document.reindex(document_ids)
    if net_values.isna().all():
        raise ValueError('no net value found for documents {!r}'.format(document_ids))

    return {'top_net_value': float(net_values.max()),
            'top_document': net_values.idxmax()}

In [9]:
# Run get_top_net_value on every row; the result is a Series holding one dict
# per row of `aggregation`.
top_things = aggregation.apply(get_top_net_value, axis=1)
# new columns, created as empty-string placeholders
for new_column in ('top_net_value', 'top_document'):
    aggregation[new_column] = ""

In [10]:
# Expand the per-row dicts in `top_things` into two real columns in a single
# vectorised step, instead of writing cell by cell with .loc inside a Python
# loop (slow, leaves object-dtype columns, and used `_` as the loop variable).
# The frame is built on top_things' index, so assignment aligns row by row.
top_columns = pd.DataFrame(top_things.tolist(),
                           index=top_things.index,
                           columns=['top_net_value', 'top_document'])
aggregation['top_net_value'] = top_columns['top_net_value']
aggregation['top_document'] = top_columns['top_document']

In [11]:
# Preview the first rows, which now include the top_net_value and
# top_document columns.
aggregation.head()

Unnamed: 0,congressperson_id,cnpj_cpf,supplier,subquota_number,document_id,sum,top_net_value,top_document
0,101309,2697172000191,HOTEL E RESTAURANTE CARAIVA,1414,"(5769892, 5769895)",128.5,79.0,5769892
1,101309,4652379000175,HOTEL ROYAL LTDA,1414141414,"(5789888, 5942187, 5978852, 6005283, 6019174)",1890.9,822.7,5978852
2,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,1414,"(5796458, 5922475)",617.7,430.7,5796458
3,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,14,"(6041598,)",953.4,953.4,6041598
4,101309,7881411000182,ITEM HOTELARIA TURISMO ESTACIONAMENTO P/ VEICU...,14,"(5622880,)",194.0,194.0,5622880


In [12]:
# Order the groups by their single most expensive reimbursement, highest
# first, and show the ten at the top.
aggregation = aggregation.sort_values(by=['top_net_value'], ascending=False)
aggregation.iloc[:10]

Unnamed: 0,congressperson_id,cnpj_cpf,supplier,subquota_number,document_id,sum,top_net_value,top_document
9102,160633,7959819000120,KAPITAL VIAG. E TUR. LTDA,141414141414,"(5877120, 5952491, 5952505, 5988139, 6025233, ...",24842.08,11090.1,5877120
17690,74319,4443021000213,EUROPA INVESTIMENTOS LTDA,"14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,1...","(5037717, 5054130, 5054141, 5054148, 5072351, ...",139915.0,10665.0,5322461
8366,160606,5668014000100,D.A.N HOTEL SP LTDA,141414141414141414141414141414,"(2301718, 2385930, 2414282, 2443857, 5058637, ...",53664.07,10170.0,5652142
6509,160549,40848905000130,WM TURISMO E VIAGENS LTDA.,14,"(2349567,)",9316.2,9316.2,2349567
13162,73437,58133323000133,VASCO DA GAMA TURISMO LTDA,1414,"(5168791, 5487203)",16000.0,9000.0,5487203
12678,178981,61223608000170,MEGTUR AGENCIA DE VIAGENS E TURISMO LTDA,14,"(5794903,)",8510.0,8510.0,5794903
17693,74319,4863492000108,RIO POTY HOTEL,"14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,1...","(1643367, 1643368, 1839328, 1839336, 1867172, ...",61400.0,8050.0,1955426
11387,178847,16978175000108,Adria viagens e turismo LTDA ME,141414141414141414141414,"(5677346, 5677362, 5781905, 5781908, 5803661, ...",34425.02,7904.84,5884760
17250,74207,4433548000186,EMILIANO EMPREENDIMENTOS E PARTICIPAÇÕES HOTEL...,141414141414,"(5150333, 5237164, 5346066, 5443587, 5531077, ...",32982.39,7697.22,5443587
17121,74177,34776559000120,NOVO HOTEL,14141414141414141414,"(5207833, 5349869, 5366314, 5410698, 5467753, ...",32144.0,7600.0,5497082
