In [1]:
import pandas as pd

from utils import get_collection, insert_df_to_db

In [2]:
# Load only the identifying columns of each deputy and keep the CPF as text.
# NOTE(review): the preview below shows a 10-digit CPF (2664763504), so leading
# zeros appear to be dropped when the column is parsed as a number before the
# string cast. Confirm whether the file should be read with dtype={'cpf': str}.
deputies = (
    pd.read_csv(
        '../data/datalake/deputy_info.csv',
        usecols=[
            'state',
            'cpf',
            'congressperson_name',
            'congressperson_document',
            'civil_name',
            'party',
        ],
    )
    .astype({'cpf': str})
)
deputies.head()

Unnamed: 0,civil_name,state,cpf,congressperson_document,congressperson_name,party
0,FLAVIANO FLÁVIO BAPTISTA DE MELO,AC,33251797700,54,FLAVIANO MELO,PMDB
1,JANETE MARIA GÓES CAPIBERIBE,AP,18085830272,10,JANETE CAPIBERIBE,PSB
2,ALICE MAZZUCO PORTUGAL,BA,12377392504,180,ALICE PORTUGAL,PCdoB
3,BENITO DA GAMA SANTOS,BA,2664763504,190,BENITO GAMA,PTB
4,DANIEL GOMES DE ALMEIDA,BA,7894090549,188,DANIEL ALMEIDA,PCdoB


In [3]:
# Read the presence records, restricted to the columns used for matching
# against the deputies table. No explicit compression argument is given, so
# pandas' default handling of the `.xz` path applies.
presences = pd.read_csv(
    '../data/2017-05-29-presences.xz',
    usecols=[
        'congressperson_document',
        'date',
        'party',
        'state',
        'congressperson_name',
    ],
)

In [4]:
# Parse the timestamp column, then derive the calendar year with the
# vectorized `.dt` accessor instead of calling a Python lambda per row.
presences['date'] = pd.to_datetime(presences['date'])
presences['year'] = presences['date'].dt.year

In [5]:
# Keep only the text before the first '-' and upper-case it so the name can be
# used as a join key. The `.str` accessor is vectorized and leaves missing
# names as NaN, where the previous per-row lambda raised AttributeError.
presences['congressperson_name'] = (
    presences['congressperson_name']
    .str.split('-', n=1)
    .str[0]
    .str.upper()
)

In [6]:
# Preview the first rows of the presence records after the clean-up above.
presences.head(n=5)

Unnamed: 0,congressperson_document,congressperson_name,party,state,date,year
0,361,GOULART,PSD,SP,2015-02-01 10:33:09,2015
1,361,GOULART,PSD,SP,2015-02-01 10:33:09,2015
2,361,GOULART,PSD,SP,2015-02-03 08:00:00,2015
3,361,GOULART,PSD,SP,2015-02-03 08:00:00,2015
4,361,GOULART,PSD,SP,2015-02-04 08:00:01,2015


In [7]:
# Left join: every deputy row is kept even when no presence record shares its
# (name, state) pair. Columns present in both frames get the default
# `_x` (deputies) / `_y` (presences) suffixes.
df = deputies.merge(
    presences,
    on=['congressperson_name', 'state'],
    how='left',
)
df.head()

Unnamed: 0,civil_name,state,cpf,congressperson_document_x,congressperson_name,party_x,congressperson_document_y,party_y,date,year
0,FLAVIANO FLÁVIO BAPTISTA DE MELO,AC,33251797700,54,FLAVIANO MELO,PMDB,54.0,PMDB,2015-02-01 10:33:09,2015.0
1,FLAVIANO FLÁVIO BAPTISTA DE MELO,AC,33251797700,54,FLAVIANO MELO,PMDB,54.0,PMDB,2015-02-01 10:33:09,2015.0
2,FLAVIANO FLÁVIO BAPTISTA DE MELO,AC,33251797700,54,FLAVIANO MELO,PMDB,54.0,PMDB,2015-02-03 08:00:00,2015.0
3,FLAVIANO FLÁVIO BAPTISTA DE MELO,AC,33251797700,54,FLAVIANO MELO,PMDB,54.0,PMDB,2015-02-03 08:00:00,2015.0
4,FLAVIANO FLÁVIO BAPTISTA DE MELO,AC,33251797700,54,FLAVIANO MELO,PMDB,54.0,PMDB,2015-02-04 08:00:01,2015.0


In [8]:
# Count merged rows whose document number differs between the two sources.
# Rows without a presence match have NaN on the right side and count as
# different, because NaN never compares equal.
int(df['congressperson_document_x'].ne(df['congressperson_document_y']).sum())

13

In [9]:
# Count merged rows whose party differs between the deputies table and the
# presence records (unmatched rows, with NaN on the right, are included).
int(df['party_x'].ne(df['party_y']).sum())

40133

## Insert into MongoDB

In [10]:
# Hand the helper an explicit copy of the selected columns. The recorded
# output shows a SettingWithCopyWarning, which suggests the helper modifies
# the frame it receives; with a copy that modification is unambiguous and
# cannot be mistaken for a write into `deputies`.
insert_df_to_db(deputies[['civil_name', 'state', 'cpf']].copy())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


{'created': 114, 'updated': 287}

In [11]:
# Handle used by the loop further down for `find_one` lookups by CPF and for
# writing the updated documents back.
# NOTE(review): presumably the same MongoDB collection that `insert_df_to_db`
# writes to. Confirm in `utils`.
collection = get_collection()

In [12]:
# Re-display the deputies table before using it to update the collection.
deputies.head(n=5)

Unnamed: 0,civil_name,state,cpf,congressperson_document,congressperson_name,party
0,FLAVIANO FLÁVIO BAPTISTA DE MELO,AC,33251797700,54,FLAVIANO MELO,PMDB
1,JANETE MARIA GÓES CAPIBERIBE,AP,18085830272,10,JANETE CAPIBERIBE,PSB
2,ALICE MAZZUCO PORTUGAL,BA,12377392504,180,ALICE PORTUGAL,PCdoB
3,BENITO DA GAMA SANTOS,BA,2664763504,190,BENITO GAMA,PTB
4,DANIEL GOMES DE ALMEIDA,BA,7894090549,188,DANIEL ALMEIDA,PCdoB


In [18]:
# Attach a term entry to every deputy's document in the collection.
#
# For each deputy the document is looked up by CPF. If it does not yet list a
# term with the same `congressperson_document`, a new term is appended under
# `deputy_info.terms`; otherwise the document is left untouched.

# Term bounds applied to every deputy in this load.
TERM_BEGIN_YEAR = 2015
TERM_END_YEAR = 2018

# CPFs for which no document was found; displayed at the end of the cell.
missing_cpfs = []

# `.tolist()` hands back plain Python scalars, independent of the pandas
# version, so the values can be stored by the database driver as-is.
deputy_keys = zip(
    deputies['cpf'].astype(str).tolist(),
    deputies['congressperson_document'].tolist(),
)

for cpf, document in deputy_keys:
    term = {
        'begin': TERM_BEGIN_YEAR,
        'end': TERM_END_YEAR,
        'congressperson_document': document,
    }

    person = collection.find_one({'cpf': cpf})
    if person is None:
        # `find_one` returns None when nothing matches; record the CPF and
        # carry on instead of failing on `None.get(...)`.
        missing_cpfs.append(cpf)
        continue

    # A missing/empty `deputy_info` or `terms` is treated as "no terms yet".
    deputy_info = person.get('deputy_info') or {}
    terms = deputy_info.get('terms') or []

    # Skip deputies that already carry a term for this document number.
    if any(known.get('congressperson_document') == document for known in terms):
        continue

    deputy_info['terms'] = terms + [term]

    # Write only the changed field, addressed by the `_id` of the document
    # just read, rather than replacing the whole document with a snapshot.
    collection.update_one(
        {'_id': person['_id']},
        {'$set': {'deputy_info': deputy_info}},
    )

missing_cpfs