This notebook runs the SF election data preparation and analysis

In [193]:
import pandas as pd
import numpy as np
import os
import re
import openpyxl
from xlrd import open_workbook
from datetime import date

from geopandas import GeoDataFrame, read_file

import data_prep_functions as dpf
from spatial_processing_functions import load_prec_shp


## Extract data from election result .xls files

In [194]:
datapath = '../data/SOV_w_nimby/'

# Read all election workbooks in the folder.
# Keep data in a dictionary whose keys are the election date as 'YYYYMM'
# (e.g., '200111'); each value starts out as {'filename': <workbook file name>}.
file_list = os.listdir(datapath)
vote_data = {}
# sorted() makes the dictionary order (and all later printed output) reproducible;
# os.listdir returns entries in arbitrary order.
for f in sorted(file_list):
    stem, ext = os.path.splitext(f)
    # Skip anything that is not an election workbook (hidden files such as
    # '.DS_Store', Excel lock files, ...). Otherwise they end up as garbage keys.
    if ext.lower() not in ('.xls', '.xlsx') or not stem.startswith('SOV'):
        continue
    # Remove the literal 'SOV' prefix. (str.strip('SOV') would strip any of the
    # characters S/O/V from both ends, not the prefix.)
    d = stem[len('SOV'):]
    # Drop the last two characters of the date code, keeping YYMM.
    # presumably these are the day of the month - TODO confirm against file names
    d = d[:-2]
    if not d:
        continue
    # Two-digit years starting with 9 belong to the 1990s; everything else is 20xx.
    if d[0] == '9':
        d = '19' + d
    else:
        d = '20' + d
    vote_data[d] = {'filename': f}


In [195]:
# Regex alternatives used to pick out worksheets: `phrase` is what a sheet name
# should match (local propositions / measures), `phrase_state` is what it must not match.
phrase = 'prop|meas'
phrase_state = 'state|st'

for election, info in vote_data.items():
    fname = info['filename']
    # Only .xls workbooks are opened here; other formats need a different reader.
    if fname.split('.')[1] != 'xls':
        continue
    wkbk = open_workbook(datapath + fname)
    # Record the sheet(s) that hold the local ballot proposals.
    info['sheet_names'] = dpf.find_matching_sheets(wkbk, to_match=phrase, to_not_match=phrase_state)

# Elections whose sheets are named in an unusual way are listed by hand.
manual_sheet_names = {
    '199711': ['A - D', 'E - F'],
    '199706': ['970603'],
    '199911': ['E to H', 'I to K'],
    '201411': ['370 - Local Measure F'],
    '201511': ['180 - Local Measure D', '205 - Local Measure I'],
}
for election, names in manual_sheet_names.items():
    vote_data[election]['sheet_names'] = names



In [196]:
# Define the ballot propositions we want for each election.
proposals = pd.read_excel('../data/BallotPropositions_nimby2.xlsx')

# Build a 'YYYYMM' string ('Date_str') that lines up with the keys of vote_data.
proposals['Year_str'] = proposals['Year'].astype(str)
# Zero-pad single-digit months (6 -> '06') in one vectorized step.
proposals['Mo_str'] = proposals['Month2'].astype(str).str.zfill(2)
proposals['Date_str'] = proposals['Year_str'] + proposals['Mo_str']

# Attach an (initially empty) entry for every proposition letter of each election.
for d in vote_data.keys():
    p_list = list(proposals[proposals['Date_str'] == d]['Letter'].values)
    # One fresh dict per letter. dict.fromkeys(p_list, {}) would make every letter
    # share the same dict object, so writing to one would write to all.
    vote_data[d]['props'] = {p: {} for p in p_list}


In [197]:
# Narrow the sheet lists down to the sheets that are actually needed.
needed_sheets = {
    '199711': ['E - F'],
    '199811': ['City Prop A-E'],
    '200011': ['Prop K-O'],
    '200111': ['AMENDMENTS'],
}
for election, sheets in needed_sheets.items():
    vote_data[election]['sheet_names'] = sheets

In [198]:
# Record, for every proposition letter, the worksheet ('s_name') that holds its data.
for d, election in vote_data.items():
    letters = list(election['props'].keys())
    sheets = election.get('sheet_names', [])

    # Nothing to match automatically: warn instead of failing on letters[0] /
    # sheets[0], so the hand-matching below still runs.
    if not letters or not sheets:
        print('no props or no sheets found:', d)
        continue

    # Several letters spread over several sheets cannot be matched automatically;
    # these few elections are matched by hand below.
    if len(letters) > 1 and len(sheets) > 1:
        print('has multiple sheets:', d)
        continue

    # Otherwise every letter lives on the first (normally the only) sheet.
    for l in letters:
        election['props'][l] = {'s_name': sheets[0]}

# where multiple sheets and mult letters, match by hand.
vote_data['201511']['props']['D']={'s_name':'180 - Local Measure D'}
vote_data['201511']['props']['I']={'s_name':'205 - Local Measure I'}
vote_data['199911']['props']['H']={'s_name':'E to H','col_name':'PROP H'}
vote_data['199911']['props']['I']={'s_name':'I to K','col_name':'PROP I'}
vote_data['199911']['props']['J']={'s_name':'I to K','col_name':'PROP J'}
vote_data['201311']['props']['B']={'s_name':'Measure A & B'}
vote_data['201311']['props']['C']={'s_name':'Measure C & D'}

has multiple sheets: 199911
has multiple sheets: 201511
has multiple sheets: 201311


In [199]:
# Define the custom parameters we'll need to read in the Excel files.
# The helper receives the whole vote_data dictionary and its return value replaces it.
# NOTE(review): the exact keys it adds are defined in data_prep_functions - confirm there.
vote_data = dpf.define_excel_params(vote_data)

In [200]:
# Read the worksheet for every election / proposition and store the resulting
# DataFrame under vote_data[d]['props'][l]['data'].
for d in list(vote_data.keys()):
    for l in vote_data[d]['props'].keys():
        data = dpf.read_vote_sheet(d, l, vote_df=vote_data, path=datapath)
        # Sheets that did not come back with a MultiIndex are converted first.
        # pd.MultiIndex is the public class; the private path pd.core.index
        # was deprecated in pandas 1.0 and later removed.
        if not isinstance(data.index, pd.MultiIndex):
            desc = dpf.check_if_descriptive(d)
            data = dpf.format_df_to_multiindex(data, descriptive_labels=desc)
        # Both cases end with the same renaming step.
        data = dpf.rename_index_and_cols(data)

        vote_data[d]['props'][l]['data'] = data


In [201]:
# Verify totals; the helper is given the output file name 'verify_percentages.csv'.

dpf.verify_vote_totals(vote_data, fname='verify_percentages.csv')

# Check that the sums verify. Some of these are off.
# For the 1990s elections, there are three mysterious "ballot types" at the beginning
# that don't have a precinct; that's what throws off the totals. They might be votes
# from people who don't vote at an address (e.g. prisoners) - unconfirmed.
# So when reporting full results, rely on the original sheet and not the dataframe.
# Otherwise everything checks out.
   

## Process data

In [203]:

# Apply dpf.consolidate_abs to the vote data, then preview one result table.
vote_data1 = dpf.process_votedata(vote_data, dpf.consolidate_abs)
vote_data1.keys()
preview_df = vote_data1['200806']['props']['G']['data']
preview_df.head()

Unnamed: 0_level_0,voted,YES,NO,registered
precinct,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PCT 1101,315,201,90,806
PCT 1102,397,247,115,937
PCT 1103,283,169,90,750
PCT 1104,335,198,112,844
PCT 1105,282,161,92,884


## Define variables

In [204]:
# Apply dpf.make_vote_variables to every table; the helper is also handed the
# election date key, the proposition key and the proposals table.
# NOTE(review): presumably this adds tot_nimby_votes / pct_nimby / turnout - confirm in dpf.
vote_data2 = dpf.process_votedata(vote_data1, dpf.make_vote_variables, use_datekey=True, use_propkey=True,df_prop=proposals)

In [205]:
# test cell - left off here.
# Spot-check how many distinct precincts one election/proposition has after merging.
# still have a small amount of missing data.
# '201311', 'B', len: 409
# '199811','E',len: 644
# '199706', 'F',len: 534.


yrmo = '199706'
prop = 'F'

# vote_data4 is only created by the merge cell further down, so on a fresh kernel
# (Restart & Run All) it does not exist yet. Guard the lookup instead of raising
# NameError; re-run this cell after the merge to get the count.
if 'vote_data4' in globals():
    df_test = vote_data4[yrmo]['props'][prop]['data']
    n_unique_precincts = len(df_test.precname.unique())
else:
    n_unique_precincts = None
    print('vote_data4 is not defined yet - run the merge cell below, then re-run this check.')
n_unique_precincts


680

## Merge voting data with census data by precinct

In [206]:
# Format precinct names: apply dpf.format_precincts to every table.
vote_data3 = dpf.process_votedata(vote_data2, dpf.format_precincts)

# Merge: apply dpf.merge_vote_census to every table, passing the election date key.
# NOTE(review): presumably the date key selects the matching census/precinct vintage - confirm in dpf.
vote_data4 = dpf.process_votedata(vote_data3, dpf.merge_vote_census, use_datekey=True)



 for election  199811
length vote data:  687 length census data:  712
length new data:  680

 for election  200111
length vote data:  690 length census data:  712
length new data:  684

 for election  200211
length vote data:  713 length census data:  712
length new data:  691

 for election  200211
length vote data:  713 length census data:  712
length new data:  691

 for election  199911
length vote data:  687 length census data:  712
length new data:  680

 for election  199911
length vote data:  687 length census data:  712
length new data:  680

 for election  199911
length vote data:  687 length census data:  712
length new data:  680

 for election  199706
length vote data:  687 length census data:  712
length new data:  686

 for election  200203
length vote data:  700 length census data:  712
length new data:  688

 for election  201511
length vote data:  597 length census data:  603
length new data:  582

 for election  201511
length vote data:  597 length census data:  603

In [207]:
# Inspect the columns of one merged table (March 2004, proposition J).
df_test = vote_data4['200403']['props']['J']['data']
df_test.columns

Index(['voted', 'YES', 'NO', 'registered', 'tot_nimby_votes', 'pct_nimby',
       'turnout', 'precname', 'area_m', 'tot_hu_wgt', 'occ_hu_wgt',
       'owned_wgt', 'rented_wgt', 'hu_detatched_wgt', 'hu_2_wgt', 'hu_3-4_wgt',
       'hu_5-9_wgt', 'hu_10-19_wgt', 'hu_20-49_wgt', 'hu_50_wgt',
       'families_wgt', 'tot_hhs_wgt', 'tot_pop_wgt', 'white_wgt', 'black_wgt',
       'asian_wgt', 'hispanic_wgt', 'med_yr_built_wgt', 'foreign_born_wgt',
       'med_yr_moved_all_wgt', 'med_yr_moved_owner_wgt', 'med_value_wgt',
       'med_inc_wgt', 'med_age_wgt'],
      dtype='object')

# Prepare data for R, then save

Make a single dataset that combines all elections, including dummy variables for proposal and year.

Create dummy variables:  
- year

- presidential election year

- November election
 

In [208]:
# Three further processing passes over every table, each fed by the previous one:
# election dummies, the year-built / year-moved fix, and the inflation adjustment.
vote_data5 = dpf.process_votedata(
    vote_data4,
    dpf.make_election_dummies,
    use_datekey=True,
    use_propkey=True,
)
vote_data6 = dpf.process_votedata(
    vote_data5,
    dpf.fix_yr_built_moved,
    use_datekey=True,
)
vote_data7 = dpf.process_votedata(
    vote_data6,
    dpf.adjust_inflation,
    use_datekey=True,
)

In [210]:
# Combine the per-election / per-proposition tables into one object, all_data
# (it is used as a single DataFrame in the cells that follow).
all_data = dpf.combine_dataframes(vote_data7)

working on  199811 E
680
working on  200111 D
684
working on  200211 B
691
working on  200211 R
691
working on  199911 J
680
working on  199911 I
680
working on  199911 H
680
working on  199706 F
686
working on  200203 D
688
working on  201511 D
582
working on  201511 I
582
working on  201411 F
582
working on  199711 H
686
working on  201406 B
582
working on  200011 K
688
working on  200011 L
688
working on  201311 B
582
working on  201311 C
582
working on  199806 K
686
working on  199806 I
686
working on  199806 E
686
working on  199603 B
674
working on  200403 J
581
working on  200003 C
681
working on  200611 G
562
working on  200411 A
564
working on  200806 F
562
working on  200806 G
562


In [211]:
# Year dummy columns: a missing value means "not that year", so fill with False.
election_years = [1996, 1997, 1998, 1999, 2000, 2001, 2002, 2004, 2006,
                  2008, 2013, 2014, 2015]
yr_cols = ['yr_{}'.format(year) for year in election_years]

all_data[yr_cols] = all_data[yr_cols].fillna(value=False)

# Strip the "_wgt" suffix from the column names.
all_data = dpf.rename_columns(all_data)


In [212]:
# Save the combined dataset; the file name carries today's date as MMDDYYYY.
filepath = '../results/'
d = date.today().strftime('%m%d%Y')
filename = 'voting_data_all_%s.csv' % d

all_data.to_csv(filepath + filename, index=False)


In [213]:
 
# Print the number of rows for each election/proposition key, in sorted order.
yr_prop_cols = sorted(all_data.yr_prop.unique())
rows_per_key = all_data.yr_prop.value_counts()
for yp in yr_prop_cols:
    print(yp, rows_per_key[yp])


199603B 674
199706F 686
199711H 686
199806E 686
199806I 686
199806K 686
199811E 680
199911H 680
199911I 680
199911J 680
200003C 681
200011K 688
200011L 688
200111D 684
200203D 688
200211B 691
200211R 691
200403J 581
200411A 564
200611G 562
200806F 562
200806G 562
201311B 582
201311C 582
201406B 582
201411F 582
201511D 582
201511I 582


# Prepare data for visualization

- Join result data with precincts, on prec ID, matching appropriate years.

- Write to geojson files - 1 geojson file for each precinct layer (3 files)

In [215]:
# make a column with year_month (yr_prop without its trailing proposition letter)
all_data['yr_mo'] = all_data['yr_prop'].str[:-1].astype(int)

prec_keys = ['pre1992', 'pre2002', 'pre2012']

# Loop invariants: output folder and the columns left out of the map files.
path = '../results/maps/'
os.makedirs(path, exist_ok=True)  # make sure the output folder exists before writing
cols_to_drop = ['foreign_born', 'yr_1996', 'yr_1997', 'yr_1998', 'yr_1999', 'yr_2000',
                'yr_2001', 'yr_2002', 'yr_2004', 'yr_2006', 'yr_2008', 'yr_2013',
                'yr_2014', 'yr_2015']

for prec_key in prec_keys:
    # the last four characters of the key are passed to the shapefile loader
    yr = prec_key[-4:]

    # load precinct shapefiles as geodataframes
    prec_df = load_prec_shp(yr)

    # filter all data based on the year
    vote_df = dpf.filter_for_dates(all_data, prec_key)

    # merge on precinct id
    merged = pd.merge(prec_df, vote_df, on='precname')

    # Remove unneeded columns to reduce file size. errors='ignore' drops whichever
    # of these columns exist; a missing column raises KeyError in current pandas,
    # which the previous `except ValueError` did not catch.
    merged = merged.drop(cols_to_drop, axis=1, errors='ignore')

    # write to geojson by writing to a json string
    filename = 'results_{}.geojson'.format(prec_key)
    with open(path + filename, 'w') as f:
        f.write(merged.to_json())
        
    

omitted 1 row(s) with missing geometry
total 712 precincts
omitted 0 row(s) with missing geometry
total 586 precincts
omitted 0 row(s) with missing geometry
total 604 precincts
