# Generate statistics to be used on the new Serenata website

Now that the project is coming to an end, we need some statistics for the new website.


In [3]:
import numpy as np
import pandas as pd

# Load the reimbursements dataset. The identifier-like columns are read as
# strings so pandas does not infer a numeric dtype for them.
# The builtin `str` is used because `np.str` was only an alias for it and has
# been removed from recent NumPy releases.
dataset = pd.read_csv('../data/2016-11-19-reimbursements.xz',
                      dtype={'applicant_id': str,
                             'cnpj_cpf': str,
                             'congressperson_id': str,
                             'subquota_number': str},
                      low_memory=False)

We will need formatted data for the analysis down the road:

In [4]:
# Parse the issue date once (values that cannot be parsed become NaT) and
# derive the calendar parts with the vectorized `.dt` accessor instead of
# calling a Python lambda for every row.
issue_date = pd.to_datetime(dataset['issue_date'], errors='coerce')
dataset['issue_date'] = issue_date
dataset['issue_date_day'] = issue_date.dt.day
dataset['issue_date_month'] = issue_date.dt.month
dataset['issue_date_year'] = issue_date.dt.year
dataset['issue_date_weekday'] = issue_date.dt.weekday  # Monday == 0
# ISO week number (the same value `Timestamp.week` reports). `isocalendar()`
# returns a nullable integer column, so cast to float to keep missing dates as
# NaN, like the other derived columns.
dataset['issue_date_week'] = issue_date.dt.isocalendar().week.astype('float64')

## Total spent in one year

We want to see how much was spent in reimbursements in one year.
The dataset goes from 2009 to 2016.

In [3]:
# Years covered by the dataset.
years = [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016]

# Print the total net value reimbursed for receipts issued in each year.
# `Series.sum()` is vectorized and skips NaN, unlike the builtin `sum`, which
# iterates value by value in Python and lets a single NaN poison the total.
for year in years:
    issued_in_year = dataset['issue_date_year'] == year
    print(year, ': ', dataset.loc[issued_in_year, 'total_net_value'].sum())

2009 :  90246572.85
2010 :  120169686.2
2011 :  123799712.0
2012 :  122853241.79
2013 :  134535007.65
2014 :  143424839.27
2015 :  160374692.14
2016 :  134616207.06


And the average considering all eight years:

In [4]:
# Average yearly spending: the grand total divided by the number of years
# listed in `years` (instead of a hard-coded 8).
# NOTE(review): the grand total covers every row, including any whose
# issue_date could not be parsed or falls outside `years` -- confirm that is
# the intended numerator.
dataset['total_net_value'].sum() / len(years)

128767695.58984885

It would be nice to know how much, on average, is spent on reimbursements in one day by one congressperson:

In [5]:
# For each (congressperson, issue date) pair compute the total reimbursed, the
# number of expenses and the median expense value.
# Named aggregation replaces the dict-renaming form of `agg`, which current
# pandas rejects. 'size' counts every row in the group, like `len` did.
keys = ['congressperson_name', 'issue_date']
aggregation = (
    dataset
    .groupby(keys)['total_net_value']
    .agg(sum='sum', expenses='size', median='median')
)
# Builtin `int` instead of the removed `np.int` alias.
aggregation['expenses'] = aggregation['expenses'].astype(int)
# NOTE(review): this prints the median of the per-day *median expense*, not of
# the per-day total (`aggregation['sum']`) -- confirm which one the website
# statistic is meant to show.
print(aggregation['median'].median())

122.9


## Some subquotas
### Meals

In [7]:
# Keep only the rows filed under the meal sub-quota and report their shape.
is_meal = dataset['subquota_description'] == 'Congressperson meal'
meals_dataset = dataset.loc[is_meal]
meals_dataset.shape

(191724, 36)

In [8]:
# Preview the first five meal reimbursements.
meals_dataset.head(n=5)

Unnamed: 0,year,applicant_id,document_id,reimbursement_value_total,total_net_value,reimbursement_numbers,congressperson_name,congressperson_id,congressperson_document,term,...,installment,passenger,leg_of_the_trip,batch_number,reimbursement_values,issue_date_day,issue_date_month,issue_date_year,issue_date_weekday,issue_date_week
159,2009,1001,1627199,,122.43,2986,DILCEU SPERAFICO,73768,444.0,2015.0,...,0,,,410209,,7.0,7.0,2009.0,1.0,28.0
196,2009,1001,1635276,,78.0,3006,DILCEU SPERAFICO,73768,444.0,2015.0,...,0,,,412206,,22.0,7.0,2009.0,2.0,30.0
380,2009,1001,1709015,,76.45,3112,DILCEU SPERAFICO,73768,444.0,2015.0,...,0,,,433750,,29.0,10.0,2009.0,3.0,44.0
407,2009,1001,1722100,,68.88,3129,DILCEU SPERAFICO,73768,444.0,2015.0,...,0,,,437649,,12.0,11.0,2009.0,3.0,46.0
414,2009,1001,1726433,,104.15,3133,DILCEU SPERAFICO,73768,444.0,2015.0,...,0,,,438905,,19.0,11.0,2009.0,3.0,47.0


I want to find the highest value reimbursed to a congressperson
for a single meal:

In [9]:
# Highest single meal reimbursement. `Series.max()` is vectorized and ignores
# NaN, whereas the builtin `max` compares values one by one in Python and gives
# order-dependent results when NaN is present.
meals_dataset['total_net_value'].max()

6205.0

Now let's check the top meal reimbursements made in one day, i.e. the
highest number of meal reimbursements a congressperson had in a single day:

In [11]:
# For each (congressperson, issue date) pair compute the total, the number of
# meal expenses and the mean meal value.
# Named aggregation replaces the dict-renaming form of `agg`, which current
# pandas rejects. 'size' counts every row in the group, like `len` did.
keys = ['congressperson_name', 'issue_date']
meals_aggregation = (
    meals_dataset
    .groupby(keys)['total_net_value']
    .agg(sum='sum', expenses='size', mean='mean')
)
# Builtin `int` instead of the removed `np.int` alias.
meals_aggregation['expenses'] = meals_aggregation['expenses'].astype(int)
# The ten busiest days, ties broken by the amount spent. This table used to be
# computed and thrown away because it was not the cell's last expression; it
# is now kept in a variable so it can be inspected.
top_meal_days = (
    meals_aggregation
    .sort_values(['expenses', 'sum'], ascending=[False, False])
    .head(10)
)
# Highest number of meal reimbursements one congressperson has for one day.
meals_aggregation['expenses'].max()

13

## Other sub-quotas
Not all sub-quotas have a ceiling (e.g. Congressperson meal). Considering the four that do,
how many congresspeople use the whole sub-quota in a month?

In [13]:
# Monthly ceiling for each sub-quota checked below, keyed by the value used in
# the `subquota_description` column.
SUB_QUOTAS = {
    'Fuels and lubricants': 6000,
    'Automotive vehicle renting or charter': 10900,
    'Taxi, toll and parking': 2700,
    'Security service provided by specialized company': 8700,
}

# Monthly totals are sums of floats, so an exact `==` against the ceiling can
# miss a month that hits it because of rounding noise. Compare within half of
# the smallest unit instead.
# NOTE(review): assumes the values are currency amounts with two decimals.
CEILING_TOLERANCE = 0.005

# For every capped sub-quota, count the distinct congresspeople who spent
# exactly the ceiling in at least one calendar month.
keys = ['congressperson_name', 'issue_date_month', 'issue_date_year']
for description, ceiling in SUB_QUOTAS.items():
    subquota_rows = dataset[dataset['subquota_description'] == description]
    # Only the monthly total is needed; the unused count and mean aggregates
    # were dropped.
    monthly_totals = subquota_rows.groupby(keys)['total_net_value'].sum()
    at_ceiling = (monthly_totals - ceiling).abs() < CEILING_TOLERANCE
    # Index level 0 is the congressperson name.
    names = monthly_totals[at_ceiling].index.get_level_values(0)
    print(description, names.nunique())

Fuels and lubricants 156
Taxi, toll and parking 5
Security service provided by specialized company 9
Automotive vehicle renting or charter 49
