# Extract a network of interconnections between a set of candidates

### We have a list of candidates. Some may be associated with one or more committees. 


In [6]:
from  disambiguation.core import Database
import igraph as ig
import numpy as np
import pandas as pd

In [50]:
idm = Database.IdentityManager('USA')
idm.fetch_dict_id_2_identity()

Table 'identities_v5' exists.
Table 'identities_adjacency_v5' exists.
Table 'linked_identities_v5' exists.
select id,identity from identities_v5;


## 1- find the committees associated with each candidate

In [10]:
datadir = '/nfs/home/navid/data/FEC-prepare/data/'
columns = ['CAND_ID', 'CAND_ELECTION_YR','FEC_ELECTION_YR','CMTE_ID','CMTE_TP','CMTE_DSGN','LINKAGE_ID']

dict_cand_comm_data = {}

for year in range(0,18,2):
    filename = '{}ccl{}.txt'.format(datadir,str(year).rjust(2,'0'))
    print filename
    data = pd.read_csv(filename,sep = '|',names = columns)
    dict_cand_comm_data[year] = data


/nfs/home/navid/data/FEC-prepare/data/ccl00.txt
/nfs/home/navid/data/FEC-prepare/data/ccl02.txt
/nfs/home/navid/data/FEC-prepare/data/ccl04.txt
/nfs/home/navid/data/FEC-prepare/data/ccl06.txt
/nfs/home/navid/data/FEC-prepare/data/ccl08.txt
/nfs/home/navid/data/FEC-prepare/data/ccl10.txt
/nfs/home/navid/data/FEC-prepare/data/ccl12.txt
/nfs/home/navid/data/FEC-prepare/data/ccl14.txt
/nfs/home/navid/data/FEC-prepare/data/ccl16.txt


In [22]:
# set of candidates we're interested in:
filename =  '/nfs/home/navid/data/Oren/names_for_oren.txt'
data_names = pd.read_csv(filename, sep='|', header=None)
set_cands = set(data_names[0].values)


set_cands

{'H0AL02087',
 'H0AL05163',
 'H0AL07086',
 'H0AR01083',
 'H0AR03055',
 'H0AZ01184',
 'H0AZ01259',
 'H0CA10073',
 'H0CA10149',
 'H0CA19173',
 'H0CA27085',
 'H0CA32101',
 'H0CA33117',
 'H0CA48024',
 'H0CA49055',
 'H0CO04122',
 'H0CT03072',
 'H0DC00058',
 'H0DE01017',
 'H0FL04066',
 'H0FL05139',
 'H0FL08208',
 'H0FL12101',
 'H0FL17068',
 'H0FL18025',
 'H0FL19080',
 'H0GA07125',
 'H0GA07133',
 'H0GA08099',
 'H0IL05096',
 'H0IL10120',
 'H0IL10302',
 'H0IL11052',
 'H0IL14080',
 'H0IN02190',
 'H0IN03198',
 'H0IN08114',
 'H0IN09070',
 'H0KS03137',
 'H0KY05015',
 'H0KY06104',
 'H0LA01087',
 'H0MA10082',
 'H0MI07093',
 'H0MN04049',
 'H0MO04086',
 'H0MO06073',
 'H0MO07113',
 'H0MS04120',
 'H0NC02059',
 'H0ND01026',
 'H0NH01217',
 'H0NH02181',
 'H0NY02085',
 'H0NY20095',
 'H0OH06189',
 'H0OH08029',
 'H0OH12062',
 'H0OH16097',
 'H0OK05114',
 'H0PA04220',
 'H0PA06076',
 'H0PA07082',
 'H0RI01073',
 'H0RI02139',
 'H0SC04257',
 'H0SC05031',
 'H0SD00054',
 'H0TN03254',
 'H0TN04195',
 'H0TN06257',
 'H0TX

In [32]:
dict_cand_2_comm = {}
dict_comm_2_cand = {}

for year, df  in dict_cand_comm_data.iteritems():
    print year
    for i, row in df.iterrows():
        cand = row['CAND_ID']
        comm = row['CMTE_ID']
        designation = row['CMTE_DSGN']
        if designation not in ['P']: continue
        try:
            dict_cand_2_comm[cand].add(comm)
        except:
            dict_cand_2_comm[cand] = set([comm])
        
        try:
            dict_comm_2_cand[comm].add(cand)
        except:
            dict_comm_2_cand[comm] = set([cand])

            

0
2
4
6
8
10
12
14
16


In [45]:
print "Number of candidates with at least one principal committee: ", len(dict_cand_2_comm)
print "Number of candidates of interes: ", len(set_cands)
print "Number of candidates of interest with at least one principal committee: ",\
    len(set(dict_cand_2_comm.keys()).intersection(set_cands))


Number of candidates with at least one principal committee:  13048
Number of candidates of interes:  464
Number of candidates of interest with at least one principal committee:  459


### Set of candidates that have at least one principal campaign PAC 

In [48]:
set_cands_good = set(dict_cand_2_comm.keys()).intersection(set_cands)
list_cands_good = sorted(set_cands_good)

1- Load all records in the desired date range
2- For each record, note the compound identity and the recipient candidate and the amount.
3- Build a dict {(identity, candidate): amount} and update with each new record.

In [55]:
retriever = Database.FecRetriever(table_name = 'usa_combined_v2')

date1 = '2013-01-01'
date2 = '2014-12-30'

list_fields = ['TRANSACTION_DT', 'TRANSACTION_AMT', 'CMTE_ID', 'id']
list_fields_str = ','.join(list_fields)
query = 'SELECT {fields} from usa_combined_v2 where TRANSACTION_DT BETWEEN "{date1}" AND "{date2}"'.format(date1 = date1,
                                                                                                        date2 = date2,
                                                                                                       fields=list_fields_str)
print query
retriever.query_fields = list_fields
retriever.retrieve(query=query)
list_records = retriever.getRecords()

SELECT TRANSACTION_DT,TRANSACTION_AMT,CMTE_ID,id from usa_combined_v2 where TRANSACTION_DT BETWEEN "2013-01-01" AND "2014-12-30"


In [56]:
len(list_records)

2135680

### Loop through the list of records 

In [80]:
outdir = '/nfs/home/navid/data/Oren/'

def get_compound_identity(identity, idm):
    '''
    Return the compound identity of identity by concatenating
    the identities of all linked identities.
    '''
    if identity is None:
        return ''
    linked_identities = idm.get_linked_identities(identity)
    
    all_identities = linked_identities + [identity]
    compound_identity = "|".join(sorted(all_identities)) 
    return compound_identity


outfile = outdir + "perdon_to_candidate_records.csv"
with open(outfile, 'w') as f:
    for i, record in enumerate(list_records):
        cmte = record['CMTE_ID']
        try:
            set_cand = dict_comm_2_cand[cmte]
        except:
    #         print "error"
            continue
        n = len(set_cand)
        list_cands = list(set_cand)



        date = record['TRANSACTION_DT']
        amount = record['TRANSACTION_AMT']
        identity = get_compound_identity(idm.get_identity(rid), idm)
        rid = record.id
        for cand in list_cands:
            if cand not in set_cands_good: continue
            f.write('{},{},{},{},{},{}\n'.format(rid, date, amount/n, cmte, cand, identity))
#         if i > 1000: break
    

In [72]:
dict_comm_2_cand['C00462143']

{'H0AL02087'}

## Now, read the CSV file and use it to compute the similarity matrix 

In [84]:
outdir = '/nfs/home/navid/data/Oren/'

filename = outdir + "perdon_to_candidate_records.csv"
columns = ['id', 'date', 'amount', 'cmte_id', 'cand_id', 'identity']
df = pd.read_csv(filename, names = columns, sep=',')


In [88]:
# Set tof all (compound) identities found in the df
set_identities = set(df['identity'])

In [90]:
# dict: {identity: set([cands])}
dict_identity_2_candidates = {}
for i, row in df.iterrows():
    cand = row['cand_id']
    identity = row['identity']
    amount = row['amount']
    try:
        dict_identity_2_candidates[identity].add(cand)
    except:
        dict_identity_2_candidates[identity] = set([cand])
        
    
    

In [92]:
edgelist = {}
for identity, set_cands in dict_identity_2_candidates.iteritems():
    for cand1 in set_cands:
        for cand2 in set_cands:
            if cand1 >= cand2: continue
            edge = tuple(sorted([cand1, cand2]))
            try:
                edgelist[edge] += 1
            except:
                edgelist[edge] = 1
                
            
            
                

In [95]:
edgelist
print len(edgelist)

56457


In [100]:
edgelist_flat = [(key[0], key[1], value) for key, value in edgelist.iteritems() ]
g = ig.Graph.TupleList(edgelist_flat, weights = True)


In [103]:
g.es.attributes()

['weight']

In [105]:
g.write_ncol(outdir + 'similarity_1.edges',)

## Prune the graph 

In [109]:
import networks as nt
import matplotlib.pyplot as plt

In [107]:
nt.compute_significance(g)

Hello


In [None]:
plt.hist(g.es['significance'], 100)