This notebook runs the SF election data preparation and analysis

In [24]:
import pandas as pd, numpy as np
import os, re
import openpyxl
from xlrd import open_workbook
from pprint import pprint
from datetime import date

## Extract data from election result .xls files

In [25]:
root = '/Users/lisarayle/Dropbox/sf_data/elections/'
os.chdir(root)
# change for maybes
path = 'SOV/SOV_w_nimby/'

# read all files
# keep data in a dictionary, where keys are election date (e.g., 200111)
file_list = os.listdir(path)
vote_data = {}
for f in file_list:
    d=f.strip('SOV').strip('.xls')
    d=d[:-2]
    if d[0]=='9':
        d='19'+d
    else:
        d='20'+d
    vote_data[d]={}
    vote_data[d]['filename'] =f


In [26]:


# define function that will search for the right worksheet and return their names as a list.
# wb is workbook. to_match = phrase to match, to_not_match = phrase to exlude
def find_matching_sheets(wb, to_match, to_not_match):
    sheets =[]
    for s in wb.sheets():
        s_match = re.search(to_match,s.name, flags=re.IGNORECASE)
        not_match = re.search(to_not_match,s.name, flags=re.IGNORECASE)
        if s_match:
            if not not_match:
                #print(s_match.string)
                sheets.append(s_match.string)
    return sheets

In [27]:
# define regex search term that will match name of worksheet with local propositions
phrase = 'prop|meas'
phrase_state = 'state|st'

for d in vote_data.keys(): 
    f=vote_data[d]['filename']
    # first only do .xls files.. because need different package to read these.
    if f.split('.')[1]=='xls':      
        #print(f)
        wkbk=open_workbook(path+f)
        
        # Find the sheet(s) with local ballot proposals. 
        sheets_w_props=find_matching_sheets(wkbk, to_match=phrase, to_not_match=phrase_state)
        vote_data[d]['sheet_names'] = sheets_w_props

                
# Here are ones with weird formats. 
vote_data['199711']['sheet_names']=['A - D','E - F']
vote_data['199706']['sheet_names']=['970603']
vote_data['199911']['sheet_names']=['E to H','I to K']

vote_data['201411']['sheet_names']=['370 - Local Measure F']
vote_data['201511']['sheet_names']=['180 - Local Measure D','205 - Local Measure I']



In [28]:
# now find the data within each sheet
# add a list of ballot props we want for each election:

# change this excel file for maybes
proposals = pd.read_excel('BallotPropositions_nimby2.xlsx')  
proposals.head()
proposals['Year_str']=proposals['Year'].astype(str)
proposals['Mo_str']=proposals['Month2'].astype(str)

for i,d in enumerate(proposals['Mo_str']):
    if len(d)==1:
        d='0'+d
    proposals.loc[i,'Mo_str']=d

proposals['Date_str']=proposals['Year_str']+proposals['Mo_str']
#proposals.head()

for d in vote_data.keys():
    p_list=list(proposals[proposals['Date_str']==d]['Letter'].values)
    vote_data[d]['props'] = dict.fromkeys(p_list,{})
#vote_data

In [29]:
# Edit sheetnames so only the ones we need are listed
#pprint(vote_data)
vote_data['199711']['sheet_names']=['E - F']

vote_data['199811']['sheet_names']= ['City Prop A-E']

vote_data['200011']['sheet_names']=['Prop K-O']
vote_data['200111']['sheet_names']=['AMENDMENTS']

In [30]:
# find the data for that prop in each sheet

for d in vote_data.keys():
    print(d)
    letters=list(vote_data[d]['props'].keys())
    # we want one sheet for each prop letter

    # I think I should handle them differently if there are multiple props
    # there might be multiple letters, there might be multiple sheets, or both. 
    # but if there's only one letter, there's only one sheet.
    if len(letters) >1:
        # when there are multiple letters, there may be multiple sheets, or just one sheet. 
        # if there are multiple sheets, need to make sure the sheet matches the letter. There are only 3 of these
        # should match these by hand.
        if len(vote_data[d]['sheet_names'])>1:
                print('has multiple sheets:',d)
                
        elif len(vote_data[d]['sheet_names'])<=1:
            # when there are multiple letter but one sheet, easy to match each letter to the one sheet
            #print('has one sheet, multiple letters:',d)
            for l in letters:
                vote_data[d]['props'][l] = {'s_name':vote_data[d]['sheet_names'][0]}

    else:
        # there's only one sheet, which will match the one letter. 
        vote_data[d]['props'][letters[0]]={'s_name':vote_data[d]['sheet_names'][0]}

# where multiple sheets and mult letters, match by hand.
vote_data['201511']['props']['D']={'s_name':'180 - Local Measure D'}
vote_data['201511']['props']['I']={'s_name':'205 - Local Measure I'}
vote_data['199911']['props']['H']={'s_name':'E to H','col_name':'PROP H'}
vote_data['199911']['props']['I']={'s_name':'I to K','col_name':'PROP I'}
vote_data['199911']['props']['J']={'s_name':'I to K','col_name':'PROP J'}
vote_data['201311']['props']['B']={'s_name':'Measure A & B'}
vote_data['201311']['props']['C']={'s_name':'Measure C & D'}

    

200111
200403
201411
201511
has multiple sheets: 201511
199711
200203
200011
199706
200411
200003
200611
199603
199911
has multiple sheets: 199911
201406
199811
201311
has multiple sheets: 201311
200211
199806
200806


In [31]:
# Huge annoying list of custom parameters needed for reading excel.
d ='200011'
l='K'
vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=1
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,3,5,6]
vote_data[d]['props'][l]['params']['skip_footer']=56

d ='200011'
l='L'
vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=1
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,3,8,9]
vote_data[d]['props'][l]['params']['skip_footer']=56

d ='200111'
l='D'
vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=1
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,3,7,8]
vote_data[d]['props'][l]['params']['skip_footer']=70

d ='199911'
l='J'
vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=4
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,3,9,10]
vote_data[d]['props'][l]['params']['skip_footer']=51

d ='199911'
l='H'
vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=4
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,3,15,16]
vote_data[d]['props'][l]['params']['skip_footer']=51

d ='199911'
l='I'
vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=4
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,3,6,7]
vote_data[d]['props'][l]['params']['skip_footer']=51

d ='200203'
l='D'
vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=1
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,3,23,24]
vote_data[d]['props'][l]['params']['skip_footer']=60

d ='200806'
l='F'
vote_data[d]['props'][l]['params']={'index_col':0}
vote_data[d]['props'][l]['params']['skiprows']=2
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,24,25]
vote_data[d]['props'][l]['params']['skip_footer']=51

d ='200806'
l='G'
vote_data[d]['props'][l]['params']={'index_col':0}
vote_data[d]['props'][l]['params']['skiprows']=2
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,28,29]
vote_data[d]['props'][l]['params']['skip_footer']=51

d ='199711'
l='H'
vote_data[d]['props'][l]['params']={'index_col':0}
vote_data[d]['props'][l]['params']['skiprows']=3
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,14,15]
vote_data[d]['props'][l]['params']['skip_footer']=42

d ='199603'
l='B'
vote_data[d]['props'][l]['params']={'index_col':0}
vote_data[d]['props'][l]['params']['skiprows']=3
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,8,9]
vote_data[d]['props'][l]['params']['skip_footer']=38

d ='201511'
l='I'
vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=3
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,4,5,7,8]
vote_data[d]['props'][l]['params']['skip_footer']=54

d ='201511'
l='D'
vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=3
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,4,5,7,8]
vote_data[d]['props'][l]['params']['skip_footer']=54

d ='200211'
l='R'
vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=1
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,3,53,54]
vote_data[d]['props'][l]['params']['skip_footer']=64

d ='200211'
l='B'
vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=1
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,3,21,22]
vote_data[d]['props'][l]['params']['skip_footer']=64

d ='201406'
l='B'
vote_data[d]['props'][l]['params']={'index_col':0}
vote_data[d]['props'][l]['params']['skiprows']=0
vote_data[d]['props'][l]['params']['parse_cols']=[6,7,8,14,15]
vote_data[d]['props'][l]['params']['skip_footer']=55

d ='200411'
l='A'
f_name=vote_data[d]['filename']
s=vote_data[d]['props'][l]['s_name']

vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=2
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,3,5,6]
vote_data[d]['props'][l]['params']['skip_footer']=32

d ='201311'
l='C'
vote_data[d]['props'][l]['params']={'index_col':0}
vote_data[d]['props'][l]['params']['skiprows']=1
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,4,5]
vote_data[d]['props'][l]['params']['skip_footer']=54

d ='201311'
l='B'
vote_data[d]['props'][l]['params']={'index_col':0}
vote_data[d]['props'][l]['params']['skiprows']=1
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,8,9]
vote_data[d]['props'][l]['params']['skip_footer']=54

d ='200611'
l='G'
vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=1
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,3,17,18]
vote_data[d]['props'][l]['params']['skip_footer']=33

d ='199706'
l='F'
vote_data[d]['props'][l]['params']={'index_col':0}
vote_data[d]['props'][l]['params']['skiprows']=3
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,20,21]
vote_data[d]['props'][l]['params']['skip_footer']=38

d ='200003'
l='C'
vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=5
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,3,12,13]
vote_data[d]['props'][l]['params']['skip_footer']=52

d ='201411'
l='F'
vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=3
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,4,5,7,8]
vote_data[d]['props'][l]['params']['skip_footer']=54

d ='199811'
l='E'
vote_data[d]['props'][l]['params']={'index_col':0}
vote_data[d]['props'][l]['params']['skiprows']=3
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,17,18]
vote_data[d]['props'][l]['params']['skip_footer']=38

d ='199806'
l='E'
vote_data[d]['props'][l]['params']={'index_col':0}
vote_data[d]['props'][l]['params']['skiprows']=5
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,17,18]
vote_data[d]['props'][l]['params']['skip_footer']=35

d ='199806'
l='K'
vote_data[d]['props'][l]['params']={'index_col':0}
vote_data[d]['props'][l]['params']['skiprows']=5
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,35,36]
vote_data[d]['props'][l]['params']['skip_footer']=35

d ='199806'
l='I'
vote_data[d]['props'][l]['params']={'index_col':0}
vote_data[d]['props'][l]['params']['skiprows']=5
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,29,30]
vote_data[d]['props'][l]['params']['skip_footer']=35

d ='200403'
l='J'
vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=1
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,3,33,34]
vote_data[d]['props'][l]['params']['skip_footer']=65

d ='200411'
l='A'
vote_data[d]['props'][l]['params']={'index_col':[0,1]}
vote_data[d]['props'][l]['params']['skiprows']=2
vote_data[d]['props'][l]['params']['parse_cols']=[0,1,2,3,5,6]
vote_data[d]['props'][l]['params']['skip_footer']=65
#Done!!


In [32]:
# function to load a given sheet to a dataframe
# elect_date is the election date string. prop_letter is proposal letter. vote_df is vote_data (optionally modify)
def read_vote_sheet(elect_date,prop_letter,vote_df=vote_data):
    f=vote_data[elect_date]['filename']
    s=vote_data[elect_date]['props'][prop_letter]['s_name']
    params=vote_data[elect_date]['props'][prop_letter]['params']
    df=pd.read_excel(path+f,sheetname=s,index_col=params['index_col'], skiprows=params['skiprows'], parse_cols=params['parse_cols'],skip_footer=params['skip_footer'])
    return df

# when there's only a single index, we need to make it a multiindex.

## This function checks for the format of the index.
# descriptive_labels=True
# when index is like this:
#PCT 1101 1101
#PCT 1101 - Vote By Mail / Absentee Reporting

# descriptive_labels=False
# Applies to date '199603','199706','199711', '199806','199811'. So dates <='199811'
# index looks like this: 
# PCT 2001 8
# PCT 2001 8
# clues are in the 'registered' column--which we ASSUME IS THE FIRST COLUMN!!
# d is election date string
def check_if_descriptive(elect_date):
    if int(elect_date)<=199811:
        result = False
    else:
        result=True
    return result

# turns single index into multiindex
def format_df_to_multiindex(df, descriptive_labels=True):
    ballot_types=[]
    precinct=[]
    for i, val in enumerate(df.index):
        #if s contains something about mail or absentee or vbm:
        if descriptive_labels==True:
            match=re.search('mail|absent|vbm',val,flags=re.IGNORECASE)
            if match:    
                ballot_type='A'
                pct=match.string[:8]
            # if s is not a mail-in ballot
            else:
                match = re.search('pct',val, flags=re.IGNORECASE)
                if match:
                    ballot_type='V'
                    pct=match.string[:8]
            
        if descriptive_labels==False:
            if (df.iloc[i,0]==0)|np.isnan(df.iloc[i,0]):
                ballot_type='A'
            elif df.iloc[i,0]>0:
                ballot_type='V'
            match = re.search('pct',val, flags=re.IGNORECASE)
            if match:
                pct=match.string[:8]
        ballot_types.append(ballot_type)
        precinct.append(pct)
    df['type']=ballot_types
    df['precinct']=precinct
    df.set_index(['precinct','type'],inplace=True)
    return df
# Use:
#desc = check_if_descriptive('199711')
#data=format_df_to_multiindex(data,descriptive_labels=desc)
#data.head()


# this checks if second level of multiindex is already "A" and "V"
def check_if_av_format(df):
    if 'A' in list(df.index.levels[1].values):
        result=True
    elif 'A' not in list(df.index.levels[1].values): 
        result=False
    return result

#  if index levels isn't already named with "A" and "V", this will fix it. 
def index_to_av_format(df):
    ballot_types=[]
    precincts=[]
    for i, val in enumerate(df.index):
        match = re.search('mail|absent|vbm',val[1], flags=re.IGNORECASE)
        if match:
            ballot_type='A'
        else:
            ballot_type='V'
        ballot_types.append(ballot_type)
        precincts.append(val[0])
    df['type']=ballot_types
    df['precinct']=precincts
    df.set_index(['precinct','type'],inplace=True)
    return df

# when dataframe has the format: multiindex, registered, ballots cast, yes, no, 
# need to name index and columns using this function

def rename_index_and_cols(df):
    #seems to be a bug in using set_levels(). need to get around it some other way. 
    #df.index=df.index.set_levels([['precinct','type'],['A,V']],level=[0,1],inplace=False)
    # if index levels isn't already named with "A" and "V", fix it. 
    av_format=check_if_av_format(df)
    if not av_format:
        df=index_to_av_format(df)
    elif av_format:    
        df.index.names=['precinct','type']
    df.columns=['registered','voted','YES','NO']
    return df
# use:
#data=rename_index_and_cols(data)
#data.head()

In [33]:
dlist=list(vote_data.keys())
# read excel files for each election... 
for d in dlist:
    print(d)
    for l in vote_data[d]['props'].keys():
        print(l)
        data=read_vote_sheet(d,l)
        # test if already has multiindex:
        if isinstance(data.index, pd.core.index.MultiIndex): 
            data=rename_index_and_cols(data)
        else:
            desc = check_if_descriptive(d)
            data=format_df_to_multiindex(data, descriptive_labels=desc)
            data=rename_index_and_cols(data)
        vote_data[d]['props'][l]['data']=data
#data.head()

200111
D
200403
J
201411
F
201511
D
I
199711
H
200203
D
200011
K
L
199706
F
200411
A
200003
C
200611
G
199603
B
199911
J
H
I
201406
B
199811
E
201311
C
B
200211
R
B
199806
K
I
E
200806
G
F


## verify totals

In [34]:
# Check sums verify. Some of these are off. 
# For the 90s elections, there are three mysterious "ballot types" at the beginning that don't have a precinct.
# that's what's throwing off the totals. Might be votes for people who don't vote at an address, like prisoners or something.
# So just make sure that, when reporting full results, rely on the original sheet and not the dataframe. 
# otherwise everything checks out. 
            

dates=[]
props=[]
results=[]
for d in vote_data.keys():
    for p in vote_data[d]['props'].keys():
        df=vote_data[d]['props'][p]['data']
        yes_pct=df.YES.sum()/(df.YES.sum()+df.NO.sum())
        results.append(yes_pct)
        props.append(p)
        dates.append(d)
results=pd.DataFrame({'election':dates,'proposal':props,'yes_pct':results})
results.to_csv('verify_percentages.csv')

In [35]:
vote_data.keys()
test_df = vote_data['200403']['props']['J']['data']
test_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,registered,voted,YES,NO
precinct,type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PCT 1101,A,766,122,35,79
PCT 1101,V,766,173,41,121
PCT 1102,A,921,166,41,110
PCT 1102,V,921,251,66,167
PCT 1103,A,783,125,50,59


In [36]:
# This will consolidate the absentee and regular votes for each proposition's data. 
# data is the vote data dictionary

# function to loop through the dictionary of stuff. 
def process_votedata(data, process_func, use_datekey=False, use_propkey=False, *args):
    data_new = data # make a copy. I think this is necessary because otherwise it keeps overwriting when I don't want it to. 
    for d in data_new.keys():
        for p in data_new[d]['props'].keys():
            #print('working on ',d,p)
            df=data_new[d]['props'][p]['data']
            # do some function on df
            if use_datekey==True:
                if use_propkey==True:
                    df_new=process_func(df, date_key=d, prop_key=p, *args)
                else:
                    df_new = process_func(df, date_key=d, *args)
                
            elif use_datekey==False:
                if use_propkey==True:
                    df_new=process_func(df, prop_key=p, *args)
                else:
                    df_new = process_func(df, *args)
            
            #new dictionary copy with new dataframe
            data_new[d]['props'][p]['data'] = df_new
    return(data_new)

In [37]:
def consolidate_abs(df):
    # A couple of them don't use multiindex, are in a different format. 
    
    if isinstance(df.index, pd.core.index.MultiIndex):
        #print(df.head())
        # for each proposition's dataframe, make a new df to hold totals
        new_df = pd.DataFrame(index=df.index.levels[0], columns=['voted','YES','NO'])
        for prec in df.index.levels[0]:
            # calculate totals
            totals = df.loc[prec].sum(axis=0)  
            # FIXED: some of the dataframes have number of registered voters in both the regular and absentee
            # rows. So for those the 'registered' column will be 2x the correct amount. 
            new_df.ix[prec] = totals
        
        # recover registered column - create df with only the pct index and registered column

        df=df.sort_index()
        regist_temp = df.loc[(slice(None),['V']),:'registered']
        regist_temp.index = regist_temp.index.droplevel(1)
        
        # merge this back with the new dataframe I just created. 
        new_df = pd.merge(new_df, regist_temp, left_index=True, right_index=True)
        
    elif isinstance(df.index, pd.core.index.Index):
        # maybe do some other processing
        new_df=df

    return(new_df)

In [38]:
vote_data_test = process_votedata(vote_data, consolidate_abs)
vote_data_test.keys()


dict_keys(['200111', '200403', '201411', '201511', '199711', '200203', '200011', '199706', '200411', '200003', '200611', '199603', '199911', '201406', '199811', '201311', '200211', '199806', '200806'])

In [39]:
vote_data_test['200806']['props']['G']['data'].head()

Unnamed: 0_level_0,voted,YES,NO,registered
precinct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PCT 1101,315,201,90,806
PCT 1102,397,247,115,937
PCT 1103,283,169,90,750
PCT 1104,335,198,112,844
PCT 1105,282,161,92,884


## define nimby variable

In [21]:
# function will look up the value for the nimby variable value, given the election date and proposal letter.
#df_prop is the proposals dataframe
def lookup_nimby_value(date_key, letter, df_prop):
    x = df_prop[(df_prop.Date_str==date_key)&(df_prop.Letter==letter)]['Vote that equals NIMBY'].values[0]
    return(x)
    
# will make two variables, one that is total nimby votes and one that is percent nimby votes. 

def make_vote_variables(df,date_key=d,letter=p, df_prop=proposals):
    nimby_val = lookup_nimby_value(date_key, letter, df_prop)
    df['tot_nimby_votes'] = df[nimby_val]

    # 'voted' is the total ballots received, but some voters may have skipped some ballot items)
    try:
        df['pct_nimby'] = df.tot_nimby_votes/(df.YES + df.NO)  # Pct nimby/ total votes for that proposal
    except(ZeroDivisionError):
        df['pct_nimby'] = np.nan

    # Make turnout variable
    df['turnout'] = df.voted/df.registered
    
    return(df)

In [22]:
vote_data_test2 = process_votedata(vote_data_test, make_vote_variables)

KeyError: 'data'

In [220]:
vote_data_test2['201511']['props']['I']['data'].head()

Unnamed: 0_level_0,voted,YES,NO,registered,tot_nimby_votes,pct_nimby,turnout
precinct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Pct 1101,210,87,109,492,109,0.556122,0.426829
Pct 1102/1103,817,319,466,1748,466,0.593631,0.467391
Pct 1104/1105,376,191,167,863,167,0.46648,0.435689
Pct 1106/1107,571,240,307,1413,307,0.561243,0.404105
Pct 1109/1108,699,284,385,1627,385,0.575486,0.429625


## Match voting data with census data by precinct

In [221]:

# format precinct columns so they match. Correct format is a 4-digit string. 

# what to do with precints that seems to be combined? e.g., 1104/1105? 
# split into two rows, I guess
# This will create a copy of the row values. 
# IMPORTANT: Don't rely on total vote counts. Doesn't matter anyway, it's the percentage that matters. 

def split_prec_rows(df):
    for idx in df.index:
        # look for rows with precincts that need to be split
        if re.search('\d{4}/\d{4}',idx):
            a,b = idx.split('/')
            row_values = df.loc[idx]
            df.loc[a] = row_values
            df.loc[b] = row_values
            
            # delete original row
            df = df.drop(idx, axis=0)
    return(df)

def format_precincts(df):
    print(df.head())
    
    # what to do with precincts marked "mail"? I think these are ones that are not physical places. 
    # I'll just strip off the mail part and if it doesn't match up with a physical prec during merge, then it'll be omitted.
 
    df.index = df.index.str.strip('MAIL').str.strip('mail').str.strip()
    
    # strip unneeded characters
    df.index = df.index.str.strip('PCT').str.strip('Pct').str.strip()
    
    # split double precint rows
    df_new = split_prec_rows(df)
    
    #print(df_new.head())
    return(df_new)
    

In [222]:
vote_data_test3 = process_votedata(vote_data_test2, format_precincts)

               voted  YES   NO  registered  tot_nimby_votes  pct_nimby  \
precinct                                                                 
Pct 1101         210   87  109         492              109   0.556122   
Pct 1102/1103    817  319  466        1748              466   0.593631   
Pct 1104/1105    376  191  167         863              167   0.466480   
Pct 1106/1107    571  240  307        1413              307   0.561243   
Pct 1109/1108    699  284  385        1627              385   0.575486   

                turnout  
precinct                 
Pct 1101       0.426829  
Pct 1102/1103  0.467391  
Pct 1104/1105  0.435689  
Pct 1106/1107  0.404105  
Pct 1109/1108  0.429625  
               voted  YES   NO  registered  tot_nimby_votes  pct_nimby  \
precinct                                                                 
Pct 1101         210  153   52         492               52   0.253659   
Pct 1102/1103    817  558  240        1748              240   0.300752   
Pct

## merge with census data

In [223]:
# load census data

def get_census_data(yr_key):
    censuspath = '/Users/lisarayle/Dropbox/sf_data/census/data_by_precinct/'
    filename = 'census_by_precinct_{}.csv'.format(yr_key)
    df = pd.read_csv(censuspath+filename, dtype={'precname':str})
    return(df)

In [224]:

# function to match election with appropriate year for census dataset and precinct boundaries. 
# can take the year string or date-key string as arguments

# vote_key is the 6-digit election year-month key
def voting2census_key(vote_key):
    if int(vote_key)<=200211: # try changing the cutoff from 200203 to 200211. Yes this is better. 
        census_key = 'ce2000pre1992'
    elif (int(vote_key)>200211)&(int(vote_key)<=200400):  # try changing this from 200203 to 200211
        census_key = 'ce2000pre2002'
    elif (int(vote_key)>200400)&(int(vote_key)<201211):
        census_key = 'ce2007pre2002'
    elif int(vote_key)>=201211:
        census_key = 'ce2012pre2012'
    else: 
        print('year outside of range')
        census_key = None
    return(census_key)


In [225]:
# let's merge!
def merge_vote_census(vote_df, date_key):
    
    census_key = voting2census_key(date_key)
    census_df = get_census_data(census_key)
    #print(census_df.head())
    #print(vote_df.head())
    merged_df = pd.merge(vote_df, census_df, left_index=True, right_on='precname', how='inner')
    # check how it turned out. 
    print('\n for election ', date_key)
    print('length vote data: ', len(vote_df), 'length census data: ', len(census_df))
    print('length new data: ', len(merged_df))
    return(merged_df)

vote_data_test4 = process_votedata(vote_data_test3, merge_vote_census, use_datekey=True)

# All matched up. 


 for election  201511
length vote data:  597 length census data:  603
length new data:  582

 for election  201511
length vote data:  597 length census data:  603
length new data:  582

 for election  200806
length vote data:  561 length census data:  586
length new data:  561

 for election  200806
length vote data:  561 length census data:  586
length new data:  561

 for election  201311
length vote data:  409 length census data:  603
length new data:  409

 for election  201311
length vote data:  409 length census data:  603
length new data:  409

 for election  200411
length vote data:  581 length census data:  586
length new data:  564

 for election  199806
length vote data:  644 length census data:  712
length new data:  643

 for election  199806
length vote data:  644 length census data:  712
length new data:  643

 for election  199806
length vote data:  644 length census data:  712
length new data:  643

 for election  200003
length vote data:  688 length census data:  712

In [226]:
df_test = vote_data_test4['200403']['props']['J']['data']
df_test.columns

Index(['voted', 'YES', 'NO', 'registered', 'tot_nimby_votes', 'pct_nimby',
       'turnout', 'precname', 'area_m', 'tot_hu_wgt', 'occ_hu_wgt',
       'owned_wgt', 'rented_wgt', 'hu_detatched_wgt', 'hu_2_wgt', 'hu_3-4_wgt',
       'hu_5-9_wgt', 'hu_10-19_wgt', 'hu_20-49_wgt', 'hu_50_wgt',
       'families_wgt', 'tot_hhs_wgt', 'tot_pop_wgt', 'white_wgt', 'black_wgt',
       'asian_wgt', 'hispanic_wgt', 'med_yr_built_wgt', 'foreign_born_wgt',
       'med_yr_moved_all_wgt', 'med_yr_moved_owner_wgt', 'med_value_wgt',
       'med_inc_wgt', 'med_age_wgt'],
      dtype='object')

# Prepare data for R, then save

Make a dataset that combines all elections, but make sure to have proposal and year dummies

What dummy variables do we want? 

 -year
 
 -pres election year
 
 -november election
 

In [227]:
# function to make election dummy variables

# TODO: add column with data+proposal code
# I modified process_votedata to add prop code. 

def make_election_dummies(df, date_key, prop_key): 
    year = date_key[0:4]
    mo = date_key[4:6]
    if date_key in ['201211','200811','200411','200011','199611','199211']:
        pres = True
    else:
        pres = False
    if mo=='11':
        nov=True
    else:
        nov=False
    # add dummy variables
    df['yr_'+year] = True
    df['pres_elec']=pres
    df['nov_elec'] = nov
    
    # also add 'year' and year+proposal code variables
    df['year'] = year
    df['yr_prop']= date_key+prop_key
    return(df)


In [228]:
vote_data_test5 = process_votedata(vote_data_test4, make_election_dummies, use_datekey=True, use_propkey=True)


## some more variable calculations

In [229]:
# some variables are not comparable across years: med_value, med_inc, med_yr_moved, med_yr_built

# fix med_yr_built and med_yr_moved
def fix_yr_built_moved(df, date_key):
    year = int(date_key[0:4])
    # median housing unit age ('med_hu_age') = current year - med year built
    df['med_hu_age']=year-df.med_yr_built_wgt
    # median years lived in house ('med_yrs_lived') = current year - med year moved
    df['med_yrs_lived'] = year-df.med_yr_moved_all_wgt
    df['med_yrs_lived_owner'] = year-df.med_yr_moved_owner_wgt
    return(df)

def adjust_inflation(df, date_key):
    # adjust med income and med house value for inflation. 
    year = date_key[0:4]
    if int(date_key)<=200400: 
        # need 1999 -> 2014    $1 in 1999 = $1.42 in 2014
        r = 1.42
    elif (int(date_key)>200400)&(int(date_key)<201211):
        # use 2012 -> 2014
        r = 1.03
    elif int(date_key)>=201211:
        # keep in 2014 
        r = 1
    else: 
        print('year outside of range')
        r = 1
    # calculate inflation-adjusted values for income and house value.
    df['med_inc_adj'] = df.med_inc_wgt*r
    df['med_val_adj'] = df.med_value_wgt*r
    
    return(df)
    

In [230]:
vote_data_test6 = process_votedata(vote_data_test5, fix_yr_built_moved, use_datekey=True)
vote_data_test7 = process_votedata(vote_data_test6, adjust_inflation, use_datekey=True)

In [231]:
df_test = vote_data_test7['200403']['props']['J']['data']
df_test.head()

Unnamed: 0,voted,YES,NO,registered,tot_nimby_votes,pct_nimby,turnout,precname,area_m,tot_hu_wgt,...,yr_2004,pres_elec,nov_elec,year,yr_prop,med_hu_age,med_yrs_lived,med_yrs_lived_owner,med_inc_adj,med_val_adj
0,295,76,200,766,200,0.724638,0.385117,1101,2696087.189278,542.163803,...,True,False,False,2004,200403J,64.80687,8.188377,8.742502,103635.537362,692651.358909
1,417,107,277,921,277,0.721354,0.452769,1102,1073412.658539,422.709863,...,True,False,False,2004,200403J,63.226691,8.327617,8.984652,93123.572423,701135.973615
2,250,81,143,783,143,0.638393,0.319285,1103,2463865.571322,467.354006,...,True,False,False,2004,200403J,60.671727,11.145154,14.160339,74442.411493,627709.949953
3,413,112,271,902,271,0.707572,0.457871,1104,1664514.050366,468.618337,...,True,False,False,2004,200403J,64.981444,4.991931,11.1449,103307.967471,742403.924155
4,324,84,205,910,205,0.709343,0.356044,1105,2383380.691068,286.896614,...,True,False,False,2004,200403J,64.349427,5.123913,13.337303,99354.10362,744537.791721


In [232]:
# put all the dataframes together.
# data is the dictionary of dataframes
def combine_dataframes(data):
    n=0
    for d in data.keys():
        for p in data[d]['props'].keys():
            print('working on ',d,p)
            df=data[d]['props'][p]['data']
            print(len(df))
            if n==0:
                df_new = df
            elif n>=1:
                df_new = pd.concat([df,df_old],axis=0)
            df_old = df_new
            n+=1
    return(df_old)

       
all_data = combine_dataframes(vote_data_test7)

working on  201511 I
582
working on  201511 D
582
working on  200806 G
561
working on  200806 F
561
working on  201311 C
409
working on  201311 B
409
working on  200411 A
564
working on  199806 K
643
working on  199806 I
643
working on  199806 E
643
working on  200003 C
681
working on  200611 G
562
working on  199706 F
533
working on  201411 F
582
working on  199711 H
598
working on  201406 B
597
working on  200211 R
691
working on  200211 B
691
working on  199811 E
643
working on  200111 D
684
working on  199911 H
680
working on  199911 I
680
working on  199911 J
680
working on  200011 L
688
working on  200011 K
688
working on  200203 D
688
working on  200403 J
581
working on  199603 B
649


In [233]:
# fill na with False, for year dummies
yr_cols = ['yr_1996', 'yr_1997', 'yr_1998','yr_1999', 'yr_2000', 'yr_2001', 'yr_2002', 'yr_2004', 'yr_2006',
       'yr_2008', 'yr_2013', 'yr_2014', 'yr_2015']

all_data[yr_cols] = all_data[yr_cols].fillna(value=False)

In [234]:
# rename columns to get rid of "_wgt"
new_col_names=[]
for col in list(all_data.columns):
    if col[-4:]=='_wgt':
        new_col = col[:-4]
    else:
        new_col = col
    #print(new_col)
    new_col_names.append(new_col)
all_data.columns = new_col_names


In [235]:
# save!

date.today().strftime('%m%d%Y')
filepath = '/Users/lisarayle/Dropbox/sf_data/elections/'
d = date.today().strftime('%m%d%Y')
filename = 'voting_data_all_{}.csv'.format(d)

all_data.to_csv(filepath+filename, index=False)
# Great! next step is to do regression in R.

In [236]:
# save voting data combined by precinct for only after 2012, for visualization. 
data_after_2012= all_data[((all_data.yr_2013==True) | (all_data.yr_2014==True))|(all_data.yr_2015==True)]
print(len(data_after_2012))
grouped = data_after_2012.groupby(by='precname').mean()['pct_nimby']
grouped.head()

filepath = '/Users/lisarayle/Dropbox/sf_data/elections/'
filename = 'vote_results_after_2012_by_precinct.csv'

grouped.to_csv(filepath+filename)

3161


In [237]:
# unneeded stuff, I think 
test_yr = voting2census_year('201401')
print(test_yr)
#census_test = get_census_data(test_yr)
#census_test.precname.unique()



# the censusXprecinct data does not match up. 

NameError: name 'voting2census_year' is not defined