# Identifying cohabiting contributors 

In this exercise, we will search the database for households in which multiple individuals made contributions while living at the same address.

To do this, we will do the following:

    1- Build a dictionary of all normalized addresses, associating every address with the set of all identities found at it.
    2- For addresses associated with more than one identity, do further processing.





Problem: the normalized attributes file doesn't contain normalized addresses.

In [92]:
import pandas as pd
from disambiguation.core import Database
from disambiguation.core import Person

from disambiguation.core import utils
from disambiguation import config

In [19]:
# Create the identity manager. The 'USA' argument presumably selects the
# nationwide dataset -- TODO confirm against Database.IdentityManager.
idm = Database.IdentityManager('USA')
# Bare attribute access: a notebook cell only echoes its last expression,
# so this line produces no visible output here.
idm.table_name_identities
# Presumably loads the record-id -> identity lookup (verify against
# IdentityManager); the recorded output shows an `id,identity` select
# against identities_v5.
idm.fetch_dict_id_2_identity()

Table 'identities_v5' exists.
Table 'identities_adjacency_v5' exists.
Table 'linked_identities_v5' exists.
select id,identity from identities_v5;


In [43]:
# Presumably loads the inverse lookup (identity -> record ids); verify
# against IdentityManager.
idm.fetch_dict_identity_2_id()

select id,identity from identities_v5;


In [20]:
# Fetch only the columns used for address grouping, restricted to records
# that have a street address.
retriever = Database.FecRetriever(
    'usa_combined_v2',
    query_fields=['id', 'NAME', 'CONTRIBUTOR_STREET_1', 'ZIP_CODE', 'TRANSACTION_AMT'],
    where_clause=' WHERE CONTRIBUTOR_STREET_1 IS NOT NULL '
)

In [21]:
# Run the retrieval configured above; the records are read back with
# getRecords() in the next cell.
retriever.retrieve()

In [24]:
# Pull the retrieved records into a list and report how many there are.
list_records = retriever.getRecords()
print len(list_records)

11508004


In [34]:
# Group record ids by a coarse address key: (first 5 characters of the zip
# code, street line 1). The street string is used exactly as stored in the
# record, so spelling variants of one address end up under different keys.
dict_addresses = {}


for r in list_records:
    address, zipcode = r['CONTRIBUTOR_STREET_1'], r['ZIP_CODE']
    address_full = (zipcode[:5], address)
    rid = r.id
    # setdefault creates the list the first time a key is seen. Unlike a
    # try/bare-except around the append, an unrelated error (e.g. a
    # MemoryError or KeyboardInterrupt) can never replace an existing list
    # and silently drop the ids collected so far.
    dict_addresses.setdefault(address_full, []).append(rid)

In [35]:
# Number of distinct (5-character zip, street) keys.
len(dict_addresses)

4025475

In [64]:
# Map each address to the set of compound identities seen there, keeping
# only addresses where more than one distinct identity appears.
dict_identities = {}
for address, list_rids in dict_addresses.iteritems():
    set_identities = set(
        idm.get_compound_identity(rid)[0] for rid in list_rids
    )
    if len(set_identities) <= 1:
        continue
    dict_identities[address] = set_identities
    
    

In [93]:
# Number of addresses associated with more than one identity.
print len(dict_identities)

541894


In [94]:
# Spot check: list the ids (presumably record ids) attached to two example
# identities.
print idm.get_ids('OH-34442')
print idm.get_ids('GA-36500')

[10639568]
[8305739, 8486565, 8839925, 8977749, 11969509, 13439119]


In [95]:
# Spot check: list the ids (presumably record ids) attached to two
# consecutively numbered CA identities.
print idm.get_ids('CA-459207')
print idm.get_ids('CA-459208')

[4302189, 5809001, 6407902, 7695132]
[4302191, 14867480]


In [133]:
# Shared retriever used to fetch records by id from 'usa_combined_v2'.
record_retriever = Database.FecRetrieverByID('usa_combined_v2')


def load_compound_person(compound_identity):
    '''
    Build a Person from all records of a compound identity.

    The compound identity is a '|'-separated string of identities. The ids
    of every component identity are looked up through idm, fetched in a
    single retrieval, and handed to Person.Person.
    '''
    list_rids = []
    for identity in compound_identity.split('|'):
        list_rids.extend(idm.get_ids(identity))

    record_retriever.retrieve(list_rids)
    return Person.Person(record_retriever.getRecords())



In [135]:
counter = 0

# Number of cases where there are at least two people 
# with the same last name in the same household.
counter_2_people_same_last = 0

counter_2_people = 0
# with open('tmp.txt', 'w') as f:
print "navid"
for address, set_identities in dict_identities.iteritems():
#     print address, set_identities
    if len(set_identities) > 10: continue

    list_names = []
    for compound_identity in set_identities:
        try:
            p = load_compound_person(compound_identity)
            name = p.get_dominant_attribute('NAME')
#             name = 'ds'
            if name: 
                list_names.append(name)
        except Exception as e:
            print "ERROR OCCURRED", e
            
    if counter % 1000 == 0:
        print counter
    
    try:
        lastnames = set([utils.splitname(name)[0] for name in list_names])
    except: 
        print "ERORORORORORORORORORRORORO"
        break
    if len(list_names) == 2: 
        counter_2_people += 1

    if len(lastnames) < len(list_names):
        counter_2_people_same_last += 1

#     print list_names

    if counter > 10000000:
        break
    counter += 1

print counter
print counter_2_people
print counter_2_people_same_last

navid
0
inserting...
done.
1000
2000
3000
4000
5000
6000

  cur.execute(query)
  cur.execute(query)



inserting...
done.
7000
8000
inserting...
done.
9000

  cur.execute(query)
  cur.execute(query)



inserting...
done.
10000
inserting...
done.
11000
12000

  cur.execute(query)
  cur.execute(query)



inserting...
done.
13000
14000
15000
16000
17000
18000
inserting...
done.
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000

  cur.execute(query)
  cur.execute(query)



inserting...
done.
31000
32000
33000
inserting...
done.
34000
35000
36000
37000
38000
39000
40000

  cur.execute(query)
  cur.execute(query)



inserting...
done.
41000
42000
43000
44000
inserting...
done.
45000

  cur.execute(query)
  cur.execute(query)



inserting...
done.
46000
47000
inserting...
done.
48000
49000
50000
51000
52000
53000
54000

  cur.execute(query)
  cur.execute(query)



inserting...
done.
55000
56000
57000
58000
inserting...

  cur.execute(query)
  cur.execute(query)



done.
inserting...
done.
59000
60000
61000
62000
inserting...
done.
63000
64000
65000
66000
67000

  cur.execute(query)
  cur.execute(query)



inserting...
done.
68000
69000
70000
71000
72000
73000
74000
75000
inserting...
done.
76000
77000
78000
79000
80000
81000
82000
83000

  cur.execute(query)
  cur.execute(query)



inserting...
done.
84000
inserting...
done.
85000
86000

  cur.execute(query)
  cur.execute(query)



inserting...
done.
87000
inserting...
done.
88000
89000

  cur.execute(query)
  cur.execute(query)



inserting...
done.
90000
inserting...
done.
91000
92000
93000
94000

  cur.execute(query)
  cur.execute(query)



inserting...
done.
95000
96000
97000
inserting...
done.
98000
99000

  cur.execute(query)
  cur.execute(query)



inserting...
done.
100000
100001
78638
64719


In [89]:
# Sanity check of splitname on a "LAST, FIRST" string: per the output
# below it returns a 3-tuple whose first element is the last name.
utils.splitname('CROCKETT, VICTORIA')

('CROCKETT', '', 'VICTORIA')