# Expenses in closed companies
Recently we found out that many of the companies in the dataset are already closed. Here we aim to find out whether any expenses were made after a company's situation changed (i.e., after it was closed).

In [1]:
import pandas as pd
import numpy as np
from serenata_toolbox.datasets import fetch

# Fetch both dataset files into ../data (companies first, then reimbursements).
for dataset_name in ('2016-09-03-companies.xz', '2016-11-19-reimbursements.xz'):
    fetch(dataset_name, '../data')

In [2]:
# Company registry data. low_memory=False makes pandas infer each column's dtype
# from the whole file instead of chunk by chunk, avoiding mixed-type columns.
companies = pd.read_csv('../data/2016-09-03-companies.xz', low_memory=False)

# Identifier-like columns are read as strings so they are never parsed as
# numbers (which would drop leading zeros and break the CNPJ join later on).
# The builtin `str` is used instead of `np.str`: that alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
reimbursements = pd.read_csv(
    '../data/2016-11-19-reimbursements.xz',
    dtype={
        'applicant_id': str,
        'cnpj_cpf': str,
        'congressperson_id': str,
        'subquota_number': str,
    },
    low_memory=False,
)

Convert the companies `situation_date` and the reimbursements `issue_date` columns to proper dates, and normalize the CNPJ to a digits-only format (without dots, slashes or dashes).

In [3]:
# Parse both date columns; values that cannot be parsed become NaT
# (errors='coerce'). .dt.normalize() resets the time-of-day to midnight, so
# only the calendar date takes part in later comparisons, while the column
# keeps its datetime64 dtype. This replaces the row-by-row
# `.apply(lambda date: date.date())`, which produced object-dtype columns of
# datetime.date values mixed with NaT; ordering comparisons on such columns
# are slow and not reliably supported by recent pandas versions.
reimbursements['issue_date'] = pd.to_datetime(
    reimbursements['issue_date'], errors='coerce').dt.normalize()
companies['situation_date'] = pd.to_datetime(
    companies['situation_date'], errors='coerce').dt.normalize()

# Keep only the digits of the CNPJ so it matches reimbursements['cnpj_cpf'].
# regex=True must be explicit: since pandas 2.0 str.replace treats the pattern
# as a literal string by default, which would leave the punctuation in place.
companies['cnpj'] = companies['cnpj'].str.replace(r'\D', '', regex=True)

In [4]:
# Names of the first 26 columns of the companies table.
companies.columns[:26].tolist()

['situation_date',
 'type',
 'name',
 'phone',
 'situation',
 'neighborhood',
 'address',
 'number',
 'zip_code',
 'city',
 'state',
 'opening',
 'legal_entity',
 'trade_name',
 'cnpj',
 'last_updated',
 'status',
 'additional_address_details',
 'email',
 'responsible_federative_entity',
 'situation_reason',
 'special_situation',
 'special_situation_date',
 'message',
 'main_activity_code',
 'main_activity']

In [5]:
# Registration situations that this analysis treats as "not opened".
statuses = ['BAIXADA', 'NULA', 'SUSPENSA', 'INAPTA']

# Boolean mask first, then row selection, so each step can be inspected.
is_not_opened = companies['situation'].isin(statuses)
not_opened = companies.loc[is_not_opened]

# Preview the columns that matter for the rest of the analysis.
not_opened.loc[:, ['cnpj', 'situation_date', 'situation', 'situation_reason']].head(5)

Unnamed: 0,cnpj,situation_date,situation,situation_reason
37,3956142000115,2005-09-20,BAIXADA,EXTINCAO P/ ENC LIQ VOLUNTARIA
248,8594693000108,2016-06-28,BAIXADA,EXTINCAO P/ ENC LIQ VOLUNTARIA
329,20768047000107,2016-12-04,BAIXADA,EXTINCAO P/ ENC LIQ VOLUNTARIA
364,3380051000346,2016-05-01,BAIXADA,EXTINCAO P/ ENC LIQ VOLUNTARIA
395,17479634000171,2016-06-28,BAIXADA,EXTINCAO P/ ENC LIQ VOLUNTARIA


The column situation_date is the one that is interesting. Expenses made after that date should be considered suspicious.

In [6]:
# Inner join: keep only reimbursements whose supplier document (cnpj_cpf)
# matches the CNPJ of a company in `not_opened`.
dataset = reimbursements.merge(
    not_opened,
    how='inner',
    left_on='cnpj_cpf',
    right_on='cnpj',
)

In [7]:
# Narrow the merged table down to the date and situation columns.
columns = [
    'issue_date',
    'cnpj',
    'situation_date',
    'situation',
    'situation_reason',
]
dataset = dataset.loc[:, columns]
dataset.head(10)

Unnamed: 0,issue_date,cnpj,situation_date,situation,situation_reason
0,2009-04-06,2989654001197,2013-01-03,BAIXADA,INCORPORACAO
1,2009-09-23,2989654001197,2013-01-03,BAIXADA,INCORPORACAO
2,2009-10-14,2989654001197,2013-01-03,BAIXADA,INCORPORACAO
3,2009-10-19,2989654001197,2013-01-03,BAIXADA,INCORPORACAO
4,2009-05-29,2989654001197,2013-01-03,BAIXADA,INCORPORACAO
5,2009-04-08,2989654001197,2013-01-03,BAIXADA,INCORPORACAO
6,2009-07-01,2989654001197,2013-01-03,BAIXADA,INCORPORACAO
7,2009-03-26,2989654001197,2013-01-03,BAIXADA,INCORPORACAO
8,2009-04-07,2989654001197,2013-01-03,BAIXADA,INCORPORACAO
9,2009-04-28,2989654001197,2013-01-03,BAIXADA,INCORPORACAO


The inner join in the merge keeps only the reimbursements that were requested for closed companies. It is still necessary to check whether the reimbursement `issue_date` is later than the company's `situation_date`.

In [8]:
# Show the first merged row as a Series to check each field's value.
dataset.iloc[0, :]

issue_date              2009-04-06
cnpj                02989654001197
situation_date          2013-01-03
situation                  BAIXADA
situation_reason      INCORPORACAO
Name: 0, dtype: object

In [12]:
# Flag reimbursements issued strictly after the company's situation date.
issued_after_change = dataset['issue_date'] > dataset['situation_date']
expenses_in_closed_companies = dataset.loc[issued_after_change]
expenses_in_closed_companies.head()

Unnamed: 0,issue_date,cnpj,situation_date,situation,situation_reason
2429,2013-01-30,2989654001197,2013-01-03,BAIXADA,INCORPORACAO
2430,2013-02-02,2989654001197,2013-01-03,BAIXADA,INCORPORACAO
2431,2013-02-26,2989654001197,2013-01-03,BAIXADA,INCORPORACAO
2432,2013-03-01,2989654001197,2013-01-03,BAIXADA,INCORPORACAO
2433,2013-01-28,2989654001197,2013-01-03,BAIXADA,INCORPORACAO
