# Identifying cohabiting contributors 

In this exercise, we will search the entire database and try to identify households in which multiple cohabiting individuals have made contributions.

To do this, we will do the following:

    1- Build a dictionary of all normalized addresses, associating every address with the set of all identities found at that address.
    2- For the addresses that have multiple identities, do further processing.





Problem: the normalized attributes file doesn't contain normalized addresses.

In [73]:
import pandas as pd
from disambiguation.core import Database
from disambiguation.core import Person

from disambiguation.core import utils
from disambiguation import config

In [19]:
# Identity manager, constructed with the 'USA' argument.
idm = Database.IdentityManager('USA')
# Bare attribute access: it is not the last expression of the cell, so the
# notebook does not echo its value.
idm.table_name_identities
# Judging by the method name and the query echoed in the cell output
# (`select id,identity from identities_v5;`), this loads an id -> identity
# mapping into the manager -- TODO confirm against IdentityManager.
idm.fetch_dict_id_2_identity()

Table 'identities_v5' exists.
Table 'identities_adjacency_v5' exists.
Table 'linked_identities_v5' exists.
select id,identity from identities_v5;


In [43]:
# Presumably loads the reverse (identity -> ids) mapping, going by the method
# name; the cell output shows it issues the same id/identity query.
idm.fetch_dict_identity_2_id()

select id,identity from identities_v5;


In [20]:
# Retriever over 'usa_combined_v2' restricted to the five listed fields and to
# rows whose CONTRIBUTOR_STREET_1 is not NULL.
retriever = Database.FecRetriever(
    'usa_combined_v2',
    query_fields=['id', 'NAME', 'CONTRIBUTOR_STREET_1', 'ZIP_CODE', 'TRANSACTION_AMT'],
    where_clause=' WHERE CONTRIBUTOR_STREET_1 IS NOT NULL '
)

In [21]:
# Run the retrieval configured on `retriever`; the results are read back
# afterwards with retriever.getRecords().
retriever.retrieve()

In [24]:
# Keep all retrieved records in memory and report how many there are.
list_records = retriever.getRecords()
print len(list_records)

11508004


In [34]:
# Group record ids by address.
# Key:   (first five characters of ZIP_CODE, CONTRIBUTOR_STREET_1), both taken
#        as-is from the record (no normalization is applied here).
# Value: list of the ids of every record that shares that key.
dict_addresses = {}

for r in list_records:
    address, zipcode = r['CONTRIBUTOR_STREET_1'], r['ZIP_CODE']
    address_full = (zipcode[:5], address)
    rid = r.id
    # setdefault creates the list the first time a key is seen. The previous
    # bare `except:` also caught KeyboardInterrupt/MemoryError raised during
    # append and then replaced the existing list with [rid], silently dropping
    # the ids collected so far for that address.
    dict_addresses.setdefault(address_full, []).append(rid)

In [35]:
# Number of distinct (zip prefix, street) keys.
len(dict_addresses)

4025475

In [64]:
# Map each address to the set of identities seen there, keeping only the
# addresses that are shared by at least two distinct identities.
dict_identities = {}
for address, list_rids in dict_addresses.iteritems():
    # [0] takes the first element of what get_compound_identity returns.
    set_identities = set(idm.get_compound_identity(rid)[0] for rid in list_rids)
    if len(set_identities) < 2:
        continue
    dict_identities[address] = set_identities
    
    

In [66]:
# Number of addresses associated with more than one identity.
print len(dict_identities)

541894


In [61]:
# Spot check: list the record ids attached to two sample identities.
print idm.get_ids('OH-34442')
print idm.get_ids('GA-36500')

[10639568]
[8305739, 8486565, 8839925, 8977749, 11969509, 13439119]


In [60]:
# Spot check: record ids of two consecutively numbered CA identities.
print idm.get_ids('CA-459207')
print idm.get_ids('CA-459208')

[4302189, 5809001, 6407902, 7695132]
[4302191, 14867480]


In [74]:
# Retriever shared by every call to load_compound_person below.
record_retriever = Database.FecRetrieverByID('usa_combined_v2')


def load_compound_person(compound_identity):
    """
    Build a Person from all records of a compound identity.

    The compound identity is split on '|' into individual identities.
    The record ids of each one are looked up through idm, the
    corresponding records are fetched with the shared record_retriever,
    and a Person built from those records is returned.
    """
    record_ids = []
    for single_identity in compound_identity.split('|'):
        record_ids.extend(idm.get_ids(single_identity))

    record_retriever.retrieve(record_ids)
    return Person.Person(record_retriever.getRecords())



In [80]:
# Print, for a sample of shared addresses, the dominant NAME of every identity
# found at that address.
counter = 0
for address, set_identities in dict_identities.iteritems():
    # Addresses with more than 10 identities are skipped and do not count
    # towards the sample.
    if len(set_identities) > 10:
        continue

    list_names = []
    for compound_identity in set_identities:
        p = load_compound_person(compound_identity)
        list_names.append(p.get_dominant_attribute('NAME'))
    print(list_names)

    # The limit is tested before the increment, so the loop stops right after
    # the address printed while counter == 101.
    if counter > 100:
        break
    counter += 1
    

[u'SEGAL, ADRIEN', u'SEGAL, RICHARD']
[u'REYNOLDS, SHEILA', u'REYNOLDS, MATTHEW']
[u'ANTONOPOULOS, ROSEMARY', u'JACKSON, SCOTT']
[u'KRISHNAN, ATHY', u'KRISHNAN, SEETHA']
[u'NELSON, ELLIOTT', u'OLSEN, JEREMY']
[u'MCAMIS, JOHN', u'MCAMIS, RENEE']
[u'MICHALSKI, BONETTE', u'MICHALSKI, ROBERT']
[u'SEIDEL, SAM', u'SCHWESER, JAMIE']
[u'MATINA, JOSEPH MR.', u'KURUCZ, JONNA S', u'ROBINSON, TRACY']
[u'QUINN, JODY K.', u'QUINN, WILLIAM J.']
[u'SAITO, REID SAITO K', u'SAITO, REID']
[u'VALIOTIS, EFSTATHIOS', u'VALIOTIS, STAMATIKI']
[u'MOTTO, DANIEL D', u'MOTTO, DIANNE']
[u'SCHOTT, BARTON', u'SCHOTT, BART']
[u'MALAKZAD, NOOSHIN', u'MALAKZAD, JOHN  ']
[u'SLADE, DAVID MR.', u'STANTON, MARY BETH', u'CANNON, W STEPHEN']
[u'HARDY, PETER', u'FRANKEL, FRED']
[u'WOJCIECH, JOHN MR.', u'WOJCIECH, JOEY MR.']
[u'RON, HOWSE', u'HOWSE, RONALD']
[u'WESTBROOK, MARSTON DR.', u'FRITZ, RONALD M. D.O., F.A.']
[u'PARDEE, VERA', u'PARDEE, JOHN', u'HARMON, AMY']
[u'LONG, JIM', u'LONG, DEBORAH DEBERRY MS.']
[u'ANGERMAN, DO