In [1]:
import pandas as pd
import numpy as np
import math
import collections
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [2]:
senate=pd.read_csv("https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/PEJ5QU/XXQCIK", delimiter='\t')
fec=pd.read_csv("FEC_Datasets/senate_candidates.csv")

In [3]:
#This method removes the decimal from the FIPS code and appends the prefix of county 05000US with padding of XXAAA.
def fips_code(x):
    l=[]
    for i in x:
        if i == None:
            l.append(str(i))
        elif math.isnan(i):
            l.append(str(i))
        elif i < 10:
            i = str(int(i))
            l.append("04000US0"+i)
        else:
            i = str(int(i))
            l.append("04000US"+i)
    return l

In [4]:
#Replace all state_fips codes with the full fips code
senate['state_fips'] = fips_code(senate['state_fips'])

#Drop extra columns
senate=senate.drop(['version','state_cen','state_ic','mode'], axis=1)

#Sets all null candidates and Parties to Other
senate.loc[(senate['candidate'].isnull()), 'candidate']='Other'
senate.loc[(senate['party'].isnull()), 'party']='Other'

In [5]:
#Splits FEC dataset by candidate election year
def eyear(nd):
    l=[]
    for index,row in nd.iterrows():
        i=row.values
        t=i[3]
        j=t.strip('{').strip('}').split(',')
        for x in j:
            l.append([i[0],i[1],i[2],int(x),i[4]])
    return l

#Remove excess from FEC dataset
fec.drop(['office','office_full','cycles','party','candidate_status','incumbent_challenge_full','active_through', 'district', 'district_number', 'election_districts', 'incumbent_challenge', 'first_file_date', 'last_file_date','candidate_inactive','last_f2_date','load_date','inactive_election_years'], axis=1, inplace=True)

#Format FEC for fuzzy matching
fec1=pd.DataFrame(eyear(fec))
fec1.columns=["name","party_full","state","year","candidate_id"]
fec1=fec1[(fec1["year"]>=1976)&(fec1["year"]<=2018)]

In [6]:
#Merges all insignificant candidates for my pleasure... (Done by the percentage of votes recieved and name matching)
#C is the list of candidates
#df is the main dataframe
def merge_insig(d, df):
    keep=[]
    other=[]
    for candidate in d.keys():
        if(d[candidate]==''):
            kept=False
            if candidate == 'other':
                other.append(candidate)
            else:
                location= df.loc[(df['candidate'].str.lower().replace(",",'')==candidate)]
                byyear = location.groupby('year')
                cvote = byyear.sum()['candidatevotes']
                tvote = byyear.sum()['totalvotes']
                for year in cvote.index:
                    percentage = cvote[year]/tvote[year]*100
                    if percentage > 5:
                        keep.append(candidate)
                        kept=True
                        break
            if kept==False:
                other.append(candidate)
                df.loc[(df['candidate'].str.lower()==candidate), 'candidate']='Other'
    return (keep, other)

#Remove extra punctuation from the candidate name and reverse the order of names
def modify(x):
    sl=x.replace('"','').split(',')
    sl.reverse()
    s=""
    for i in sl:
        s=s+i+" "
    return s.strip()
#provides subsequent string matches
def subsequence(s1,s2,m,n):
    b=0
    a=0
    while b<m and a<n:
        if s1[b]==s2[a]:
            b+=1
        a+=1
    return b==m


#first pass on the data matching to compare for direct/subsequent string matching
def fpass(x,y):
    #x is a list of candidate from fec and y is a list of candidates from mit
    #output will be dictionary
    l=collections.defaultdict(list)
    for i in y:   
    #     if i not in x:
    #         l[i].append("")
    #         continue
        f=0
        for j in x:
# if i is equal to j we append it to the list of the hashmap for the name in the mit data
            if i==j:
                l[i].append(j)
                break
# if they are similar we merge all the possible outcomes and use fuzz ratio technique to find the best match out of it
            elif subsequence(i,j,len(i),len(j)) or subsequence(j,i,len(j),len(i)):
                l[i].append(j)
            else:
                f+=1
        # if it found nowhere in the fec data we add ""(blank string) to the dictionary
        if f>=len(x):
            l[i].append("")
    return l


#final pass using fuzzywuzzy to find best matches for non perfectly matched names
def out(a,x):
    # a is the dictionary of the output from the fpass
    compare=[]
    for key,value in a.items():
        # check for match with at least 90% confidence. If none exists, insert null value
        if len(value)==1:    
            if value[0]=="":
                t=process.extractOne(key,x)
                if t[1]>=90:
                    compare.append([key,t[0]])
                else:
                    compare.append([key,""])
            else:
                compare.append([key,value[0]])
        else:
            s=""
            m=-1
            # if there is already a potential match we just loook for the best from the posible match found using the previous logic
            for i in value:
                if fuzz.ratio(key,i)>m and fuzz.ratio(key,i)>=90:
                    m=fuzz.ratio(key,i)
                    s=i
            compare.append([key,s])
    return compare


# creating the final dictionary
def result(b):
    # b is the list of all the canidate match from the prvious outputs
    hm=collections.defaultdict(str)
    for i in b:
        if i[0] not in hm or hm[i[0]]=="":
            hm[i[0]]=i[1]
    return hm


#provides a list of candidate ids that succesfully matched between MIT and FEC data
#df is the match dataframe of [MIT Candidate | FEC candidate]
#ids is the dictionary of {FEC name : FEC id}
def append_ids(df, ids):
    id_list=[]
    for index, row in df.iterrows():
        if row['FEC data'] == "other":
            id_list.append("S99999999")
        else:
            if len(row['FEC data'])==0:
                id_list.append("S99999999")
            else: 
                id_list.append(ids[row['FEC data']])
    return id_list


#Swaps candidate names with IDs in the main df
#c is the list of candidate names from the main df
#match is the dictionary of names to IDs made with match_fuzzily
def replace_name(df, match, missed):
    id_list=[]
    for index, row in df.iterrows():
        candidate = row['candidate'].replace('\\','').replace('"','').lower()
        if candidate == "other":
            id_list.append("S99999999")
        elif candidate in missed:
            id_list.append("MISSED")
        else:
            if len(row['candidate'])==0:
                id_list.append("S99999999")
            else:
                matchid = match[(match['MIT data']==candidate)].iloc[0]['fec_id']
                id_list.append(matchid)
    return id_list
        


In [7]:
#Generate name : id dictionary
ids={}
for i in fec1['name'].unique():
    ids.update({modify(i).lower() : fec1[(fec1['name']==i)].iloc[0]['candidate_id']}) 

#Match fuzzily the names of candidates by year
final_l=[]
for year in range(1976,2020,2):
    x=[modify(i).lower() for i in fec1.loc[(fec1["year"]==year),"name"].unique()]
    y=[i.replace('\\','').replace('"','').lower() for i in senate.loc[(senate["year"]==year),"candidate"].unique()]
    z=fpass(x,y)
    final_l=final_l+out(z,x)
final_compare=result(final_l)

In [8]:
#Convert to a dataframe
results=pd.DataFrame(list(final_compare.items()),columns=["MIT data","FEC data"])
id_df = pd.DataFrame(list(ids.items()),columns=["candidate","id"])

#Append a list of IDs matched to FEC names that successfully matched to MIT names
results.loc[:, 'fec_id'] = pd.Series(append_ids(results, ids))

In [9]:
#Create a list of the insignificant candidates that were missed and a list of the important candidates missed
merged = merge_insig(final_compare, senate)
missed = merged[0]
other = merged[1]
print("The number of missed candidates that are more than 5% of the vote: " + str(len(missed)) 
      + "\nThe number of missed candidates that are less than 5% of the vote: " + str(len(other))
      + "\nThe total number of candidates in this dataset is: " + str(len(final_compare)))

The number of missed candidates that are more than 5% of the vote: 195
The number of missed candidates that are less than 5% of the vote: 197
The total number of candidates in this dataset is: 2225


In [10]:
#Replace candidates in MIT dataset with FEC_ID
senate['candidate'] = replace_name(senate, results, missed)

In [12]:
#Export to CSV
#MIT_name : FEC_name : FEC_id
results.to_csv("ExportedCSV\Senate\MIT_fec_senate_90%CI.csv")

#FEC_name : FEC_id
id_df.to_csv("ExportedCSV\Senate\id_candidate.csv")

#MIT Data cleaned with candidate name replaced with FEC_id
senate.to_csv("ExportedCSV\Senate\Senate_Master.csv")