## Generate affiliation data from stage 1 identities

## Used for development. The final code is integrated into stage1.py and Affiliations.py

In [15]:
from disambiguation.core import Affiliations
from disambiguation.core import Database
import igraph as ig

In [17]:
class AffiliationAnalyzerUndirectedPostStage1(Affiliations.AffiliationAnalyzerUndirected):
    '''
    Subclass of AffiliationAnalyzerUndirected. Generates a similar affiliation
    graph, but instead of the high-certainty matches found pre-stage1, it uses
    the stage1 identities to link affiliation identifiers. For this, we
    override L{load_data()} so that it retrieves the records through a
    L{Database.FecRetriever} and groups them by stage1 identity.

    L{load_data()} sets L{self.settings}, L{self.list_of_records},
    L{self.dict_record_nodes} and L{self.contributors_subgraphs}.
    An IdentityManager must be assigned with L{set_idm()} before calling it.
    '''

    def get_records(self):
        '''
        Retrieve all records for the current state and store them
        in L{self.list_of_records}.

        The records are read from the table named C{<state>_combined}.
        L{self.state} is not set in this class; presumably the parent
        constructor sets it -- TODO confirm.
        '''
        table_name = self.state + "_combined"
        retriever = Database.FecRetriever(table_name=table_name)
        retriever.retrieve()
        self.list_of_records = retriever.getRecords()

    def load_data(self):
        '''
        For L{self.extract} to work, we need at the minimum two
        pieces of data: L{self.dict_record_nodes} as a container for
        all the relevant record data, and L{self.contributors_subgraphs}
        which is a list of graphs, each one a set of vertices representing
        the records belonging to a stage1 identity.

        @raise Exception: if no IdentityManager was assigned
            (see L{load_identities}).
        '''
        # Order matters: the settings define the schema used by
        # load_record_nodes(), which in turn needs the retrieved records.
        self.load_settings('')
        self.get_records()
        self.load_record_nodes()
        self.load_identities()
        # Kept for reference: file-based loading that this override
        # replaces (presumably the parent's approach -- TODO confirm).
#         self.record_edge_list = json.load(file_adjacency)
#         self.dict_record_nodes = json.load(file_nodes)
#         self.G = igraph.Graph.TupleList(edges=self.record_edge_list)

    def load_identities(self):
        '''
        Build L{self.contributors_subgraphs} from the calculated
        stage1 identities: one edgeless graph per identity, whose
        vertices are the record ids belonging to that identity.

        @raise Exception: if L{self.idm} has not been set.
        '''
        # Only a missing attribute means "idm was never assigned"; a bare
        # except would also swallow unrelated errors such as KeyboardInterrupt.
        try:
            idm = self.idm
        except AttributeError:
            raise Exception('You must set self.idm to an IdentityManager instance')

        self.contributors_subgraphs = []
        # Only the lists of record ids are needed, not the identity keys.
        # values() is available on both Python 2 and Python 3 dicts.
        for list_ids in idm.dict_identity_2_list_ids.values():
            g = ig.Graph()
            # Vertices are named str(r_id), the same form as the keys
            # of self.dict_record_nodes.
            g.add_vertices([str(x) for x in list_ids])
            self.contributors_subgraphs.append(g)

    def load_record_nodes(self):
        '''
        Populate L{self.dict_record_nodes} with important fields
        from all retrieved records.

        Each entry maps C{str(record.id)} to C{{'data': record_data}},
        where C{record_data} maps the index given by
        C{self.settings['field_2_index']} to the record's field value.
        '''
        self.dict_record_nodes = {}
        dict_index = self.settings['field_2_index']

        # Resolve each field's index once instead of once per record.
        field_names = ('TRANSACTION_AMT', 'CMTE_ID', 'EMPLOYER',
                       'OCCUPATION', 'TRANSACTION_DT')
        indexed_fields = [(name, dict_index[name]) for name in field_names]

        for record in self.list_of_records:
            record_data = {}
            for name, index in indexed_fields:
                record_data[index] = record[name]
            self.dict_record_nodes[str(record.id)] = {'data': record_data}

    def set_idm(self, idm):
        '''
        Assign an IdentityManager to this instance.

        @param idm: object exposing C{dict_identity_2_list_ids}, as
            read by L{load_identities}.
        '''
        self.idm = idm

    def load_settings(self, file_label):
        '''
        Define the L{self.settings} instance that defines the
        schema of the nodes dictionary. This is completely arbitrary:
        the same settings will be used to generate dict_record_nodes,
        and to interpret it.
        @param file_label: dummy; ignored.
        '''
        self.settings = {'field_2_index': {'TRANSACTION_DT': 0,
                                           'TRANSACTION_AMT': 1,
                                           'CMTE_ID': 2,
                                           'EMPLOYER': 3,
                                           'OCCUPATION': 4}
                         }
        
# Run configuration shared by the cells below: both values are passed to
# the analyzer constructor, and `state` also to the IdentityManager.
state = "massachusetts"
affiliation = "occupation"

In [20]:

# affiliation = 'occupation'

# counter = 0
# for identity, list_ids in idm.dict_identity_2_list_ids.iteritems():
#     counter += 1
#     print identity, list_ids
#     if counter == 10 : break

# Build the post-stage1 analyzer and hand it the IdentityManager.
# `idm` must already exist; it is created in the "Load IdentityManager"
# cell of this notebook.
a = AffiliationAnalyzerUndirectedPostStage1(
    state=state,
    affiliation=affiliation,
)
a.set_idm(idm)

# Run the pipeline: load the records and identities, extract the
# affiliations, compute the links between them, and save the result
# under the state's label.
a.load_data()
a.extract()
a.compute_affiliation_links()
a.save_data(label=state)

select id,CMTE_ID,CAND_ID,CMTE_PTY_AFFILIATION from committee_master WHERE CMTE_TP in ('H','S','P') and CMTE_DSGN='P'  limit 0,100000000 ;
select NAME,CONTRIBUTOR_ZIP,ZIP_CODE,CONTRIBUTOR_STREET_1,CITY,STATE,EMPLOYER,OCCUPATION,TRANSACTION_DT,TRANSACTION_AMT,CMTE_ID,ENTITY_TP,id from massachusetts_combined;
Number of node names not found in self.dict_record_nodes:  0
number of links to save  64934
16797 636431
Saved affiliation graphs to file...


In [45]:
# Sanity check: number of identities known to the IdentityManager versus
# number of records retrieved by the analyzer.
# A parenthesised single argument prints identically on Python 2.
print(len(idm.dict_identity_2_list_ids))
print(len(a.list_of_records))

3723092
388220


In [7]:
# Total number of vertices (record ids) across all identity subgraphs.
# A generator avoids building a temporary list of counts.
sum(g.vcount() for g in a.contributors_subgraphs)

14004101

# Load IdentityManager 

In [18]:
# Create the IdentityManager for the configured state and fetch its
# identity-to-id dictionary.
idm = Database.IdentityManager(state)
idm.fetch_dict_identity_2_id()
print(" fetched")


Table 'identities' exists.
Table 'identities_adjacency' exists.
select id,identity from identities WHERE identity like '%MA%' ;
 fetched
