In [1]:
import pandas as pd

from utils import get_collection

In [2]:
# Load the deputy registry. The `cpf` column is read as text rather than as a
# number, because later cells look people up by CPF string.
deputies = pd.read_csv(
    '../data/datalake/deputy_info.csv',
    dtype={'cpf': str},
)
deputies.head(n=5)

Unnamed: 0,civil_name,state,cpf,voter_id,congressperson_id,congressperson_document,congressperson_name,gender,party
0,FLAVIANO FLÁVIO BAPTISTA DE MELO,AC,33251797700,745162429,141434,54,FLAVIANO MELO,male,PMDB
1,JANETE MARIA GÓES CAPIBERIBE,AP,18085830272,69332577,73926,10,JANETE CAPIBERIBE,female,PSB
2,ALICE MAZZUCO PORTUGAL,BA,12377392504,24700970558,74057,180,ALICE PORTUGAL,female,PCdoB
3,BENITO DA GAMA SANTOS,BA,2664763504,5940340566,74535,190,BENITO GAMA,male,PTB
4,DANIEL GOMES DE ALMEIDA,BA,7894090549,30538180515,74060,188,DANIEL ALMEIDA,male,PCdoB


In [3]:
# Load the presence records; pandas infers the xz decompression from the file
# extension (this is the default behaviour, spelled out here for the reader).
presences = pd.read_csv(
    '../data/2017-05-29-presences.xz',
    compression='infer',
)

In [4]:
# Parse the timestamp column, then derive the calendar year with the vectorized
# `.dt` accessor instead of calling a Python lambda once per row.
presences['date'] = pd.to_datetime(presences.date)
presences['year'] = presences['date'].dt.year

In [5]:
# Keep only the text before the first '-' and upper-case it, so the name can be
# used as a merge key against `deputies.congressperson_name`.
# The vectorized `.str` chain leaves missing names as NaN, whereas a per-row
# lambda would raise AttributeError on them.
presences['congressperson_name'] = (
    presences['congressperson_name']
    .str.split('-')
    .str[0]
    .str.upper()
)

In [6]:
# Preview the first rows after the date/year/name transformations.
presences.head(n=5)

Unnamed: 0,term,congressperson_document,congressperson_name,party,state,date,present_on_day,justification,session,presence,year
0,55,361,GOULART,PSD,SP,2015-02-01 10:33:09,Present,,SESSÃO PREPARATÓRIA Nº 001 - 01/02/2015,Present,2015
1,55,361,GOULART,PSD,SP,2015-02-01 10:33:09,Present,,SESSÃO PREPARATÓRIA Nº 002 - 01/02/2015,Present,2015
2,55,361,GOULART,PSD,SP,2015-02-03 08:00:00,Present,,ORDINÁRIA Nº 001 - 03/02/2015,Present,2015
3,55,361,GOULART,PSD,SP,2015-02-03 08:00:00,Present,,EXTRAORDINÁRIA Nº 002 - 03/02/2015,Present,2015
4,55,361,GOULART,PSD,SP,2015-02-04 08:00:01,Present,,ORDINÁRIA Nº 003 - 04/02/2015,Present,2015


In [7]:
# Distribution of the day-level presence status; these labels drive the flag
# columns built in the next cell.
presences['present_on_day'].value_counts()

Present              205252
Justified absence     17017
Absent                 5418
Present (~)             265
Name: present_on_day, dtype: int64

In [8]:
# Turn the `present_on_day` label into 0/1 flag columns using vectorized
# comparisons instead of one Python lambda call per row.
# NOTE(review): each row is one session, but these flags come from the
# day-level `present_on_day` column rather than the per-session `presence`
# column. Confirm that day-level status is what the session counts should use.
status = presences['present_on_day']

presences['was_present'] = status.isin(['Present', 'Present (~)']).astype('int64')
# "Absent" covers both unjustified and justified absences.
presences['was_absent'] = status.isin(['Absent', 'Justified absence']).astype('int64')
presences['was_absent_with_justification'] = status.eq('Justified absence').astype('int64')

In [9]:
# Attach deputy details (including `cpf`) to every presence row.
# This is an inner join, so presence rows with no matching deputy are dropped.
# Both frames carry a `party` column, which shows up as `party_x` (presences)
# and `party_y` (deputies) in the result.
df = presences.merge(
    deputies,
    how='inner',
    on=['congressperson_name', 'state', 'congressperson_document'],
)

In [10]:
# Justified-absence row counts per CPF (rows) and year (columns).
# CPF/year pairs with no justified absence come out as NaN after `unstack`.
dd = (
    df.loc[df['was_absent_with_justification'] > 0]
    .groupby(['cpf', 'year'])['was_absent_with_justification']
    .count()
    .unstack()
)
# Spot-check a single person.
dd.loc['10655365770']

year
2015    2.0
2016    3.0
2017    NaN
Name: 10655365770, dtype: float64

In [11]:
# Peek at the merged frame to confirm the deputy columns were attached.
df.head(n=2)

Unnamed: 0,term,congressperson_document,congressperson_name,party_x,state,date,present_on_day,justification,session,presence,year,was_present,was_absent,was_absent_with_justification,civil_name,cpf,voter_id,congressperson_id,gender,party_y
0,55,361,GOULART,PSD,SP,2015-02-01 10:33:09,Present,,SESSÃO PREPARATÓRIA Nº 001 - 01/02/2015,Present,2015,1,0,0,ANTONIO GOULART DOS REIS,76026329820,116344770167,178980,male,PSD
1,55,361,GOULART,PSD,SP,2015-02-01 10:33:09,Present,,SESSÃO PREPARATÓRIA Nº 002 - 01/02/2015,Present,2015,1,0,0,ANTONIO GOULART DOS REIS,76026329820,116344770167,178980,male,PSD


In [12]:
def get_presence_count_by_year(cpf, year, frame=None):
    """Count the rows of one person in one year that are flagged as present.

    Args:
        cpf: Value compared for equality against the ``cpf`` column.
        year: Value compared for equality against the ``year`` column.
        frame: Optional DataFrame to count in. It needs ``cpf``, ``year`` and
            ``was_present`` columns. When omitted, the notebook-level ``df``
            is used, so existing two-argument calls behave as before.

    Returns:
        int: Number of matching rows whose ``was_present`` is greater than 0.
    """
    data = df if frame is None else frame
    matches = (data['cpf'] == cpf) & (data['year'] == year) & (data['was_present'] > 0)
    # Summing the boolean mask counts the matches without building a filtered copy.
    return int(matches.sum())

def get_abstences_count_by_year(cpf, year, frame=None):
    """Count the rows of one person in one year that are flagged as absent.

    Args:
        cpf: Value compared for equality against the ``cpf`` column.
        year: Value compared for equality against the ``year`` column.
        frame: Optional DataFrame to count in. It needs ``cpf``, ``year`` and
            ``was_absent`` columns. When omitted, the notebook-level ``df``
            is used, so existing two-argument calls behave as before.

    Returns:
        int: Number of matching rows whose ``was_absent`` is greater than 0.
    """
    data = df if frame is None else frame
    matches = (data['cpf'] == cpf) & (data['year'] == year) & (data['was_absent'] > 0)
    # Summing the boolean mask counts the matches without building a filtered copy.
    return int(matches.sum())

def get_justification_count_by_year(cpf, year, frame=None):
    """Count the rows of one person in one year flagged as a justified absence.

    Args:
        cpf: Value compared for equality against the ``cpf`` column.
        year: Value compared for equality against the ``year`` column.
        frame: Optional DataFrame to count in. It needs ``cpf``, ``year`` and
            ``was_absent_with_justification`` columns. When omitted, the
            notebook-level ``df`` is used, so existing two-argument calls
            behave as before.

    Returns:
        int: Number of matching rows whose ``was_absent_with_justification``
        is greater than 0.
    """
    data = df if frame is None else frame
    matches = (
        (data['cpf'] == cpf)
        & (data['year'] == year)
        & (data['was_absent_with_justification'] > 0)
    )
    # Summing the boolean mask counts the matches without building a filtered copy.
    return int(matches.sum())

In [13]:
# Handle to the document collection that the final cell reads from
# (`find_one`) and writes back to (`find_one_and_replace`).
# NOTE(review): connection details live in utils.get_collection, which is not
# shown here — confirm which database/collection this points at before running.
collection = get_collection()

In [21]:
def _summarize_year(present, absent, justified):
    """Build the per-year presence summary stored under ``sessions_presence``.

    Args:
        present: Number of rows flagged as present.
        absent: Number of rows flagged as absent (justified or not).
        justified: Number of rows flagged as a justified absence.

    Returns:
        dict: Raw counts plus ratios. A ratio is 0 when its denominator is 0,
        so a year with no rows never divides by zero.
    """
    total = present + absent
    return {
        'total_sessions': total,
        'present': present,
        'abstent': absent,
        'abstent_with_justification': justified,
        'present_percent': present / total if total else 0,
        'abstent_percent': absent / total if total else 0,
        # Share of the absences (not of all sessions) that were justified.
        'abstent_with_justification_percent': justified / absent if absent else 0,
    }


# The set of years does not depend on the person, so compute it once.
years = df.year.unique()
# CPFs present in the data but with no stored document; reported after the loop.
missing_cpfs = []

for cpf in df.cpf.unique():
    cpf = str(cpf)

    # One summary per year, keyed as 'year_<YYYY>'. Years in which this person
    # has no rows still get an all-zero entry.
    data = {
        'year_{}'.format(year): _summarize_year(
            get_presence_count_by_year(cpf, year),
            get_abstences_count_by_year(cpf, year),
            get_justification_count_by_year(cpf, year),
        )
        for year in years
    }

    person = collection.find_one({'cpf': cpf})
    if person is None:
        # No stored document for this CPF. Skip it rather than crash halfway
        # through the run with some documents already rewritten.
        missing_cpfs.append(cpf)
        continue

    # Merge into any existing presence info so years stored earlier are kept.
    presence_info = person.get('sessions_presence') or {}
    presence_info.update(data)
    person['sessions_presence'] = presence_info

    collection.find_one_and_replace({'cpf': cpf}, person)

if missing_cpfs:
    print('Skipped {} CPF(s) with no matching document; see `missing_cpfs`.'.format(len(missing_cpfs)))