# Invalid CNPJ or CPF from Federal Senate CEAP

`cnpj_cpf` is the column identifying the company or individual who received the payment made by the congressperson. An empty value should mean that the expense was made outside Brazil, with a company (or person) that has no Brazilian ID.

In [1]:
import numpy as np
import pandas as pd

from serenata_toolbox.datasets import fetch

# Download the reimbursements dataset file into ../data/ so that the
# following cells can read it from disk.
fetch('2017-05-22-federal-senate-reimbursements.xz', '../data/')

Downloading 2017-05-22-federal-senate-reimbursements.xz: 100%|██████████| 3.59M/3.59M [00:00<00:00, 6.14Mb/s]


In [2]:
# Load the reimbursements file. `cnpj_cpf` goes through the builtin `str`
# converter so it is kept as text instead of being parsed as a number
# (the `np.str` alias no longer exists in current NumPy releases).
dataset = pd.read_csv('../data/2017-05-22-federal-senate-reimbursements.xz',
                      converters={'cnpj_cpf': str}, encoding='utf-8')

In [3]:
# Keep only rows that have a cnpj_cpf; copy so that columns added in later
# cells are written to an independent frame rather than a slice.
# NOTE(review): cnpj_cpf is read through a str converter, and the rows shown
# below still have an empty cnpj_cpf, so this filter may not remove
# anything — confirm whether empty strings should be dropped as well.
dataset = dataset[dataset['cnpj_cpf'].notnull()].copy()
dataset.head()

Unnamed: 0,year,month,congressperson_name,expense_type,cnpj_cpf,supplier,document_id,date,expense_details,reimbursement_value
0,2008,9,ADA MELLO,"Recruitment of consultancies, advisory service...",,,,,,1235152
1,2008,9,ADA MELLO,"Locomotion, lodging, food, fuels and lubricants",,,,,,3866
2,2008,10,ADA MELLO,"Recruitment of consultancies, advisory service...",,,,,,1235152
3,2008,10,ADA MELLO,"Locomotion, lodging, food, fuels and lubricants",,,,,,261068
4,2008,11,ADA MELLO,"Recruitment of consultancies, advisory service...",,,,,,1235152


In [4]:
from pycpfcnpj import cpfcnpj

def validate_cnpj_cpf(cnpj_or_cpf):
    """Tell whether a supplier identifier is acceptable.

    Parameters
    ----------
    cnpj_or_cpf : str or None
        CNPJ or CPF to check; ``None`` stands for a missing identifier.

    Returns
    -------
    bool
        ``True`` when ``cnpj_or_cpf`` is ``None``; otherwise the truth value
        of ``cpfcnpj.validate(cnpj_or_cpf)``.
    """
    # Return early on a missing value so the validator is never called
    # with None.
    if cnpj_or_cpf is None:
        return True
    return bool(cpfcnpj.validate(cnpj_or_cpf))



# Normalise the identifiers before validating: missing values become None
# (the value validate_cnpj_cpf checks for) and everything else becomes a
# plain str. This avoids round-tripping NaN through the string 'nan'.
cnpj_cpf_list = [
    None if pd.isnull(value) else str(value)
    for value in dataset['cnpj_cpf']
]
# One result per row, in row order; a plain list is assigned positionally.
dataset['valid_cnpj_cpf'] = [validate_cnpj_cpf(value) for value in cnpj_cpf_list]

In [5]:
# Preview the first rows whose identifier did not pass validation.
dataset[dataset['valid_cnpj_cpf'] != True].head()

Unnamed: 0,year,month,congressperson_name,expense_type,cnpj_cpf,supplier,document_id,date,expense_details,reimbursement_value,valid_cnpj_cpf
0,2008,9,ADA MELLO,"Recruitment of consultancies, advisory service...",,,,,,1235152,False
1,2008,9,ADA MELLO,"Locomotion, lodging, food, fuels and lubricants",,,,,,3866,False
2,2008,10,ADA MELLO,"Recruitment of consultancies, advisory service...",,,,,,1235152,False
3,2008,10,ADA MELLO,"Locomotion, lodging, food, fuels and lubricants",,,,,,261068,False
4,2008,11,ADA MELLO,"Recruitment of consultancies, advisory service...",,,,,,1235152,False


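This shows that the dataset contains reimbursements without a valid `cnpj_cpf`. Note that in the rows displayed above the field is empty rather than filled with a malformed number.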

In addition, we need to add a `document_type` column to the dataset so that it fits in the core module.

In [6]:
# Every row gets the same placeholder document type.
dataset.loc[:, 'document_type'] = 'unknown'
# Show the first record, including the newly added column.
dataset.iloc[0, :]

year                                                                2008
month                                                                  9
congressperson_name                                            ADA MELLO
expense_type           Recruitment of consultancies, advisory service...
cnpj_cpf                                                                
supplier                                                             NaN
document_id                                                          NaN
date                                                                 NaN
expense_details                                                      NaN
reimbursement_value                                             12351,52
valid_cnpj_cpf                                                     False
document_type                                                    unknown
Name: 0, dtype: object