In [1]:
import pandas as pd

from utils import insert_df_to_db

## Getting deputies complete information

In [2]:
tse_candidates = pd.read_csv('../data/2017-07-20-tse-candidates.xz', low_memory=False)
tse_candidates.head()

Unnamed: 0,year,phase,description,state,location,post,name,electoral_id,cpf,voter_id,result
0,2004,1,ELEICOES 2004,AC,ACRELANDIA,mayor,JOÃO DE ALMEIDA,6,18139124249,356682496,not_elected
1,2004,1,ELEICOES 2004,AC,ACRELANDIA,mayor,PAULO CÉSAR FERREIRA DE ARAUJO,1,7857136204,525772445,not_elected
2,2004,1,ELEICOES 2004,AC,ACRELANDIA,mayor,SEBASTIÃO BOCALOM RODRIGUES,27,17357152987,2053882488,elected
3,2004,1,ELEICOES 2004,AC,ACRELANDIA,vice_mayor,ERAIDES CAETANO DE SOUZA,7,40917860900,2817512496,
4,2004,1,ELEICOES 2004,AC,ACRELANDIA,vice_mayor,RICARDO MODESTO DE OLIVEIRA,2,61109029268,2600402445,


In [3]:
# select only candidates to federal deputy
deputy_candidates = tse_candidates[tse_candidates.post=='federal_deputy']

# select wanted features
deputy_candidates = deputy_candidates[['name', 'state', 'cpf', 'voter_id']]

# rename column
deputy_candidates.rename(columns={'name': 'civil_name'}, inplace=True)

# remove duplicates
deputy_candidates.drop_duplicates(subset='cpf', inplace=True)
deputy_candidates.head(2)

Unnamed: 0,civil_name,state,cpf,voter_id
401811,ALDEMIR LOPES DA SILVA,AC,571202268,1361112429
401812,ALUÍZIO BEZERRA DE OLIVEIRA,AC,340243104,541372402


In [4]:
deputies = pd.read_csv('../data/2017-05-29-deputies.xz')
deputies = deputies[[
    'congressperson_id', 
    'congressperson_document', 
    'civil_name', 
    'congressperson_name',
    'gender',
    'state',
    'party'
]]
deputies.head(2)

Unnamed: 0,congressperson_id,congressperson_document,civil_name,congressperson_name,gender,state,party
0,178980,361,ANTONIO GOULART DOS REIS,GOULART,male,SP,PSD
1,141335,19,JOSÉ ROBERTO OLIVEIRA FARO,BETO FARO,male,PA,PT


In [5]:
deputies_personal_info = pd.merge(deputy_candidates, deputies, how='inner', on=['civil_name', 'state'])
deputies_personal_info.head()

Unnamed: 0,civil_name,state,cpf,voter_id,congressperson_id,congressperson_document,congressperson_name,gender,party
0,FLAVIANO FLÁVIO BAPTISTA DE MELO,AC,33251797700,745162429,141434,54,FLAVIANO MELO,male,PMDB
1,JANETE MARIA GÓES CAPIBERIBE,AP,18085830272,69332577,73926,10,JANETE CAPIBERIBE,female,PSB
2,ALICE MAZZUCO PORTUGAL,BA,12377392504,24700970558,74057,180,ALICE PORTUGAL,female,PCdoB
3,BENITO DA GAMA SANTOS,BA,2664763504,5940340566,74535,190,BENITO GAMA,male,PTB
4,DANIEL GOMES DE ALMEIDA,BA,7894090549,30538180515,74060,188,DANIEL ALMEIDA,male,PCdoB


In [6]:
dd = tse_candidates.groupby(['cpf', 'result'])['result'].count()

In [7]:
elections = dd.unstack()
elections.head()

result,alternate,elected,elected_by_party_quota,not_elected,rejected,replaced,runoff
cpf,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
#NULO#,249.0,57.0,,108.0,7.0,1.0,
0,2.0,1.0,,,,,
00000000000,4.0,,,4.0,1.0,,
00000012017,1.0,,,,,,
00000165000,1.0,,,,,,


In [8]:
df = pd.merge(deputies_personal_info, elections, on='cpf')
df.head()

Unnamed: 0,civil_name,state,cpf,voter_id,congressperson_id,congressperson_document,congressperson_name,gender,party,alternate,elected,elected_by_party_quota,not_elected,rejected,replaced,runoff
0,FLAVIANO FLÁVIO BAPTISTA DE MELO,AC,33251797700,745162429,141434,54,FLAVIANO MELO,male,PMDB,,2.0,1.0,,,,
1,JANETE MARIA GÓES CAPIBERIBE,AP,18085830272,69332577,73926,10,JANETE CAPIBERIBE,female,PSB,,2.0,1.0,1.0,,,
2,ALICE MAZZUCO PORTUGAL,BA,12377392504,24700970558,74057,180,ALICE PORTUGAL,female,PCdoB,,2.0,1.0,1.0,,,
3,BENITO DA GAMA SANTOS,BA,2664763504,5940340566,74535,190,BENITO GAMA,male,PTB,2.0,1.0,,1.0,,,
4,DANIEL GOMES DE ALMEIDA,BA,7894090549,30538180515,74060,188,DANIEL ALMEIDA,male,PCdoB,,2.0,1.0,1.0,,,


In [9]:
df.shape

(401, 16)

In [10]:
df.columns

Index(['civil_name', 'state', 'cpf', 'voter_id', 'congressperson_id',
       'congressperson_document', 'congressperson_name', 'gender', 'party',
       'alternate', 'elected', 'elected_by_party_quota', 'not_elected',
       'rejected', 'replaced', 'runoff'],
      dtype='object')

In [11]:
df.drop(['alternate', 'elected', 'elected_by_party_quota', 'not_elected',
       'rejected', 'replaced', 'runoff'], axis=1, inplace=True)

In [12]:
df.head()

Unnamed: 0,civil_name,state,cpf,voter_id,congressperson_id,congressperson_document,congressperson_name,gender,party
0,FLAVIANO FLÁVIO BAPTISTA DE MELO,AC,33251797700,745162429,141434,54,FLAVIANO MELO,male,PMDB
1,JANETE MARIA GÓES CAPIBERIBE,AP,18085830272,69332577,73926,10,JANETE CAPIBERIBE,female,PSB
2,ALICE MAZZUCO PORTUGAL,BA,12377392504,24700970558,74057,180,ALICE PORTUGAL,female,PCdoB
3,BENITO DA GAMA SANTOS,BA,2664763504,5940340566,74535,190,BENITO GAMA,male,PTB
4,DANIEL GOMES DE ALMEIDA,BA,7894090549,30538180515,74060,188,DANIEL ALMEIDA,male,PCdoB


In [13]:
df.to_csv('../data/datalake/deputy_info.csv', index=False)

# Adding info on MongoDB

In [14]:
personal_info = df[['cpf', 'civil_name', 'gender']]
insert_df_to_db(personal_info)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


{'created': 348, 'updated': 53}

In [15]:
deputy_info = df[['cpf', 'congressperson_id', 'congressperson_document', 'congressperson_name']]
insert_df_to_db(deputy_info, 'deputy_info')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


{'created': 0, 'updated': 401}