# Piaui Herald - Exploratory Data Analysis
Finding interesting cases for Rosie's column

In [1]:
import numpy as np
import pandas as pd

# Load the reimbursements dump. Identifier-like columns are read as strings so
# pandas does not reinterpret them as numbers; document_id is read as an integer.
# The builtin `str` / `int` are used because the `np.str` / `np.int` aliases
# were removed from NumPy (1.24) and raise AttributeError there.
dataset = pd.read_csv('../../../serenata-data/2017-03-15-reimbursements.xz',
                      dtype={'applicant_id': str,
                             'cnpj_cpf': str,
                             'congressperson_id': str,
                             'subquota_number': str,
                             'document_id': int},
                      low_memory=False)

## Luxury Hotel

We are aiming to find suspicious hotel expenses: maybe someone spent the holidays in a luxury hotel and asked for reimbursement.

In [2]:
# Columns identifying one (congressperson, supplier) pair.
keys = ['congressperson_id', 'cnpj_cpf', 'supplier']

# Keep only the lodging subquota, then bucket its rows by the pair above.
lodging = dataset.loc[
    dataset['subquota_description'] == 'Lodging, except for congressperson from Distrito Federal'
]
grouped = lodging.groupby(by=keys)

Number of distinct congressperson–supplier groups with lodging expenses

In [3]:
# Number of groups in `grouped`, i.e. distinct key combinations, not individual rows.
len(grouped)

21063

In [4]:
# For every group, glue its subquota numbers together into one
# comma-separated string and flatten the group keys back into columns.
subquota_numbers = (
    grouped['subquota_number']
    .agg(','.join)
    .reset_index()
)
subquota_numbers.head()

Unnamed: 0,congressperson_id,cnpj_cpf,supplier,subquota_number
0,101309,2697172000191,HOTEL E RESTAURANTE CARAIVA,1414
1,101309,4652379000175,HOTEL ROYAL LTDA,1414141414
2,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,1414
3,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,14
4,101309,7881411000182,ITEM HOTELARIA TURISMO ESTACIONAMENTO P/ VEICU...,14


In [5]:
# Collect the document ids of each group into a tuple, one tuple per group.
document_ids = (
    grouped['document_id']
    .agg(tuple)
    .reset_index()
)
document_ids.head()

Unnamed: 0,congressperson_id,cnpj_cpf,supplier,document_id
0,101309,2697172000191,HOTEL E RESTAURANTE CARAIVA,"(5769892, 5769895)"
1,101309,4652379000175,HOTEL ROYAL LTDA,"(5789888, 5942187, 5978852, 6005283, 6019174)"
2,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,"(5796458, 5922475)"
3,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,"(6041598,)"
4,101309,7881411000182,ITEM HOTELARIA TURISMO ESTACIONAMENTO P/ VEICU...,"(5622880,)"


In [6]:
# Total net value reimbursed per group, stored in a column named 'sum'.
# `.agg({'sum': np.sum})` relied on dict-based renaming, which pandas removed
# (it raises SpecificationError since 1.0); naming the column while resetting
# the index yields the same frame on old and new pandas alike.
net_values_sum = grouped['total_net_value'].sum().reset_index(name='sum')
net_values_sum.head()

Unnamed: 0,congressperson_id,cnpj_cpf,supplier,sum
0,101309,2697172000191,HOTEL E RESTAURANTE CARAIVA,128.5
1,101309,4652379000175,HOTEL ROYAL LTDA,1890.9
2,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,617.7
3,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,953.4
4,101309,7881411000182,ITEM HOTELARIA TURISMO ESTACIONAMENTO P/ VEICU...,194.0


In [7]:
# Join the three per-group summaries side by side on the grouping keys.
aggregation = (
    subquota_numbers
    .merge(document_ids, on=keys)
    .merge(net_values_sum, on=keys)
)
aggregation.head()

Unnamed: 0,congressperson_id,cnpj_cpf,supplier,subquota_number,document_id,sum
0,101309,2697172000191,HOTEL E RESTAURANTE CARAIVA,1414,"(5769892, 5769895)",128.5
1,101309,4652379000175,HOTEL ROYAL LTDA,1414141414,"(5789888, 5942187, 5978852, 6005283, 6019174)",1890.9
2,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,1414,"(5796458, 5922475)",617.7
3,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,14,"(6041598,)",953.4
4,101309,7881411000182,ITEM HOTELARIA TURISMO ESTACIONAMENTO P/ VEICU...,14,"(5622880,)",194.0


Get the highest net value, and the document it belongs to, for each row

In [8]:
def get_top_net_value(row, reimbursements=None):
    """Find the most expensive reimbursement among the documents of a row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``row['document_id']``, an iterable of document ids.
    reimbursements : pandas.DataFrame, optional
        Frame with ``document_id`` and ``total_net_value`` columns in which
        each id is looked up. Defaults to the module-level ``dataset``, which
        is what the original single-argument version always used.

    Returns
    -------
    dict
        ``{'top_net_value': <float>, 'top_document': <document id>}``. On a
        tie, the document that comes first in ``row['document_id']`` wins.

    Raises
    ------
    ValueError
        If a document id does not match exactly one row of ``reimbursements``
        or if the row has no document ids at all.
    """
    if reimbursements is None:
        reimbursements = dataset
    document_ids = list(row['document_id'])
    values = []
    for reimbursement_id in document_ids:
        matches = reimbursements.loc[
            reimbursements['document_id'] == reimbursement_id, 'total_net_value']
        # `.item()` extracts the single matching value explicitly; calling
        # float() directly on a Series is deprecated in recent pandas.
        values.append(float(matches.item()))
    top_value = max(values)
    return {'top_net_value': top_value,
            'top_document': document_ids[values.index(top_value)]}

In [9]:
# Run the per-row lookup; each element of the result is the dict returned by
# get_top_net_value for that row.
top_things = aggregation.apply(get_top_net_value, axis=1)

# Create the two new columns with empty-string placeholders.
aggregation['top_net_value'] = ""
aggregation['top_document'] = ""

In [10]:
# Expand the per-row result dicts into a small frame sharing top_things' index,
# then assign whole columns at once. Assignment is aligned on the index, so it
# does not assume the labels are 0..n-1, it avoids one `.loc` write per row,
# and the columns get proper numeric dtypes instead of object.
top_columns = pd.DataFrame(top_things.tolist(), index=top_things.index)
aggregation['top_net_value'] = top_columns['top_net_value']
aggregation['top_document'] = top_columns['top_document']

In [11]:
# Peek at the first five rows, now including the two "top" columns.
aggregation.head(n=5)

Unnamed: 0,congressperson_id,cnpj_cpf,supplier,subquota_number,document_id,sum,top_net_value,top_document
0,101309,2697172000191,HOTEL E RESTAURANTE CARAIVA,1414,"(5769892, 5769895)",128.5,79.0,5769892
1,101309,4652379000175,HOTEL ROYAL LTDA,1414141414,"(5789888, 5942187, 5978852, 6005283, 6019174)",1890.9,822.7,5978852
2,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,1414,"(5796458, 5922475)",617.7,430.7,5796458
3,101309,7881411000182,ITEM HOTELARIA TURISMO E ESTACIONAMENTO PARA V...,14,"(6041598,)",953.4,953.4,6041598
4,101309,7881411000182,ITEM HOTELARIA TURISMO ESTACIONAMENTO P/ VEICU...,14,"(5622880,)",194.0,194.0,5622880


In [12]:
# Rank the groups by their single most expensive reimbursement, highest first,
# and show the ten at the top.
aggregation = aggregation.sort_values('top_net_value', ascending=False)
aggregation.head(n=10)

Unnamed: 0,congressperson_id,cnpj_cpf,supplier,subquota_number,document_id,sum,top_net_value,top_document
9102,160633,7959819000120,KAPITAL VIAG. E TUR. LTDA,141414141414,"(5877120, 5952491, 5952505, 5988139, 6025233, ...",24842.08,11090.1,5877120
17690,74319,4443021000213,EUROPA INVESTIMENTOS LTDA,"14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,1...","(5037717, 5054130, 5054141, 5054148, 5072351, ...",139915.0,10665.0,5322461
8366,160606,5668014000100,D.A.N HOTEL SP LTDA,141414141414141414141414141414,"(2301718, 2385930, 2414282, 2443857, 5058637, ...",53664.07,10170.0,5652142
6509,160549,40848905000130,WM TURISMO E VIAGENS LTDA.,14,"(2349567,)",9316.2,9316.2,2349567
13162,73437,58133323000133,VASCO DA GAMA TURISMO LTDA,1414,"(5168791, 5487203)",16000.0,9000.0,5487203
12678,178981,61223608000170,MEGTUR AGENCIA DE VIAGENS E TURISMO LTDA,14,"(5794903,)",8510.0,8510.0,5794903
17693,74319,4863492000108,RIO POTY HOTEL,"14,14,14,14,14,14,14,14,14,14,14,14,14,14,14,1...","(1643367, 1643368, 1839328, 1839336, 1867172, ...",61400.0,8050.0,1955426
11387,178847,16978175000108,Adria viagens e turismo LTDA ME,141414141414141414141414,"(5677346, 5677362, 5781905, 5781908, 5803661, ...",34425.02,7904.84,5884760
17250,74207,4433548000186,EMILIANO EMPREENDIMENTOS E PARTICIPAÇÕES HOTEL...,141414141414,"(5150333, 5237164, 5346066, 5443587, 5531077, ...",32982.39,7697.22,5443587
17121,74177,34776559000120,NOVO HOTEL,14141414141414141414,"(5207833, 5349869, 5366314, 5410698, 5467753, ...",32144.0,7600.0,5497082


## Top eaters

Which congresspeople ate the most in a single day, and when?

In [99]:
# Keep only meal reimbursements. reset_index() renumbers the rows from zero
# and keeps the previous labels in an 'index' column.
meals = (
    dataset
    .loc[dataset['subquota_description'] == 'Congressperson meal']
    .reset_index()
)

In [100]:
# One row per (congressperson, day): total spent, number of meal expenses and
# average value per expense.
# Named aggregation replaces the dict-renaming form of `agg`, which pandas
# removed (SpecificationError since 1.0). 'size' counts the rows of each group
# like `len` did and already returns integers, so the former cast through the
# removed `np.int` alias is no longer needed.
keys = ['congressperson_name', 'issue_date']
meals_aggregation = meals.groupby(keys)['total_net_value'].agg(
    sum='sum',
    expenses='size',
    mean='mean',
)

In [101]:
# Days with the most meal expenses first; ties broken by the larger total.
(
    meals_aggregation
    .sort_values(by=['expenses', 'sum'], ascending=False)
    .head(20)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,expenses,mean
congressperson_name,issue_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CELSO MALDANER,2011-09-05 00:00:00.0,750.28,13,57.713846
JOSÉ PAULO TÓFFANO,2010-04-27 00:00:00.0,558.07,13,42.928462
SANDRA ROSADO,2012-01-12 00:00:00.0,333.4,12,27.783333
SANDRA ROSADO,2012-01-17 00:00:00.0,287.43,12,23.9525
SANDRA ROSADO,2012-01-06 00:00:00.0,281.75,12,23.479167
LÉO VIVAS,2010-08-31 00:00:00.0,630.0,11,57.272727
SANDRA ROSADO,2012-01-11 00:00:00.0,541.56,11,49.232727
PAULO WAGNER,2011-07-21 00:00:00.0,537.66,11,48.878182
SANDRA ROSADO,2015-01-07 00:00:00.0,396.6,11,36.054545
SANDRA ROSADO,2012-01-15 00:00:00.0,295.58,11,26.870909


What was the highest meal reimbursement, where was it made, and by whom?

In [102]:
# The ten most expensive single meal reimbursements, with who claimed them,
# when, and at which supplier.
(
    meals
    .loc[:, ['document_id', 'issue_date', 'total_net_value', 'congressperson_name', 'supplier']]
    .sort_values(by='total_net_value', ascending=False)
    .head(n=10)
)

Unnamed: 0,document_id,issue_date,total_net_value,congressperson_name,supplier
151919,5857053,2015-11-25 00:00:00.0,6205.0,LIDERANÇA DO PT,CENTRAL PARK RESTAURANTE E EVENTOS LTDA ME
148474,5588112,2015-01-18 00:00:00.0,5880.0,MARLLOS SAMPAIO,VR COMERCIO DE VEICULOS LTDA-ME
1628,1689682,2009-09-01 00:00:00.0,5852.0,MANOEL SALVIANO,HOTEL NACIONAL S/A
84828,2383218,2012-08-22 00:00:00.0,5696.0,LIDERANÇA DO PT,PARRILLA FIGUEIRA DA VILLA
182594,5954206,2016-03-30T00:00:00,5142.25,LIDERANÇA DO PT,CENTRAL PARK RESTAURANTE E EVENTOS LTDA ME
151914,5834088,2015-11-03 00:00:00.0,5054.99,LIDERANÇA DO PT,CENTRAL PARK RESTAURANTE E EVENTOS LTDA ME
132370,5573885,2014-12-10 00:00:00.0,5050.0,LIDERANÇA DO PT,CENTRAL PARK RESTAURANTE E EVENTOS LTDA ME
109870,5056362,2013-03-18 00:00:00.0,4990.0,LIDERANÇA DO PSDB,Emporio Albamonte Comercio e Serviços Alimentí...
1631,1717750,2009-11-03 00:00:00.0,4840.0,MANOEL SALVIANO,HOTEL NACIONAL S/A
36007,2195568,2011-11-02 00:00:00.0,4510.0,SALVADOR ZIMBALDI,HOTEL NACIONAL SA


## Bode assado

In [103]:
from re import compile, search

In [104]:
# Row count of the meals frame.
meals.shape[0]

197237

In [112]:
# `from re import compile, search` does not bind the name `re`, so the module
# is imported here to keep this cell runnable on a fresh kernel.
import re

bode = re.compile('BODE')

# Boolean mask, one entry per row of `meals`: True where the supplier name
# contains the pattern (case-sensitive). The vectorized string method replaces
# a Python loop over every row; na=False treats a missing supplier as "no
# match" instead of raising.
bode_bool = meals['supplier'].str.contains(bode.pattern, regex=True, na=False)

In [144]:
# Grouping at document level: each key combination below identifies one
# reimbursement document of a congressperson at a supplier on a date.
keys = [
    'congressperson_id',
    'cnpj_cpf',
    'supplier',
    'issue_date',
    'document_id',
]

# Restrict the meals to the rows flagged by the mask, then group them.
bode_meals = meals.loc[bode_bool]
bode_meals_grouped = bode_meals.groupby(by=keys)

# head() on a GroupBy shows the first rows of every group.
bode_meals_grouped.head()

Unnamed: 0,index,year,applicant_id,document_id,reimbursement_value_total,total_net_value,reimbursement_numbers,congressperson_name,congressperson_id,congressperson_document,...,issue_date,document_value,remark_value,net_values,month,installment,passenger,leg_of_the_trip,batch_number,reimbursement_values
2031,27773,2009,137,1723164,,92.18,3129,GONZAGA PATRIOTA,74419,143.0,...,2009-11-15 00:00:00.0,99.22,7.04,92.18,11,0,,,437960,
2032,27774,2009,137,1724236,,7.04,3137,GONZAGA PATRIOTA,74419,143.0,...,2009-11-15 00:00:00.0,7.04,0.00,7.04,11,0,,,437960,
2208,36156,2009,1455,1660201,,14.00,3046,LUIS CARLOS HEINZE,73483,500.0,...,2009-07-09 00:00:00.0,14.00,0.00,14.00,7,0,,,419418,
2234,36236,2009,1455,1676470,,20.00,3079,LUIS CARLOS HEINZE,73483,500.0,...,2009-08-23 00:00:00.0,20.00,0.00,20.00,8,0,,,424313,
2241,36245,2009,1455,1676509,,14.00,3079,LUIS CARLOS HEINZE,73483,500.0,...,2009-08-31 00:00:00.0,14.00,0.00,14.00,8,0,,,424313,
2261,36452,2009,1455,1703745,,14.00,3109,LUIS CARLOS HEINZE,73483,500.0,...,2009-09-11 00:00:00.0,14.00,0.00,14.00,9,0,,,432250,
2275,36545,2009,1455,1722763,,14.00,3129,LUIS CARLOS HEINZE,73483,500.0,...,2009-10-08 00:00:00.0,14.00,0.00,14.00,10,0,,,437831,
2281,36684,2009,1455,1749643,,12.50,3168,LUIS CARLOS HEINZE,73483,500.0,...,2009-11-19 00:00:00.0,12.50,0.00,12.50,11,0,,,446024,
2619,41752,2009,1544,1645842,,39.00,3017,LUIZ BASSUMA,74059,182.0,...,2009-08-09 00:00:00.0,39.00,0.00,39.00,8,0,,,415238,
4213,69076,2009,1710,1667521,,44.50,3056,ZEZÉU RIBEIRO,74145,217.0,...,2009-09-07 00:00:00.0,44.50,0.00,44.50,9,0,,,421639,


In [145]:
# Total net value per group, in a column named 'expense', largest first.
# `.agg({'expense': np.sum})` used dict-based renaming, which pandas removed
# (SpecificationError since 1.0); `.sum()` plus a named reset_index gives the
# same frame.
# Note: this rebinds `bode_meals_grouped` from a GroupBy to a DataFrame, so the
# cell that builds the GroupBy must be re-run before running this one again.
bode_meals_grouped = bode_meals_grouped['total_net_value'].sum().reset_index(name='expense')
bode_meals_grouped = bode_meals_grouped.sort_values('expense', ascending=False)
bode_meals_grouped.head(20)

Unnamed: 0,congressperson_id,cnpj_cpf,supplier,issue_date,document_id,expense
250,74419,3487767000185,GERALDO BODE ASSADO,2011-02-13 00:00:00.0,2006450,1493.0
249,74419,3487767000185,GERALDO BODE ASSADO,2010-08-02 00:00:00.0,1905263,480.0
240,74419,3487767000185,GERALDO BODE ASSADO,2010-06-18 00:00:00.0,1882756,440.0
529,74419,16814703000193,BODEGA DA SERRA,2014-07-21 00:00:00.0,5476790,250.0
516,74419,3487767000185,GERALDO BODE ASSADO,2012-01-07 00:00:00.0,2249894,245.0
252,74419,3487767000185,GERALDO BODE ASSADO,2012-01-15 00:00:00.0,2249881,243.0
277,74419,3487767000185,GERALDO BODE ASSADO,2013-03-23 00:00:00.0,5061745,240.0
515,74419,3487767000185,GERALDO BODE ASSADO,2012-01-02 00:00:00.0,2249896,210.0
264,74419,3487767000185,GERALDO BODE ASSADO,2012-08-04 00:00:00.0,2385459,210.0
164,163322,3459794000144,ENTRE AMIGOS O BODE LTDA,2013-12-07 00:00:00.0,5303369,209.22
