This notebook runs the data preparation and analysis for the San Francisco (SF) election results.

In [34]:
import pandas as pd
import numpy as np
import os
import re
import openpyxl
from xlrd import open_workbook
from datetime import date

from geopandas import GeoDataFrame, read_file

import data_prep_functions as dpf
from spatial_processing_functions import load_prec_shp


## Extract data from election result .xls files

In [35]:
datapath = '../data/SOV_w_nimby/'

# Index every election-results workbook in the folder.
# vote_data maps an election date key (e.g. '200111') to a dict describing
# that election; at this point the dict only holds the source filename.
#
# Only files named SOV<digits>.xls / SOV<digits>.xlsx are accepted. Anything
# else in the folder (.DS_Store, notebook checkpoints, Excel lock files, ...)
# is reported and skipped, because it would otherwise become a bogus election
# key and break the cells that follow.
file_list = sorted(os.listdir(datapath))  # sorted: listdir order is arbitrary
vote_data = {}
for f in file_list:
    stem, ext = os.path.splitext(f)
    code = stem[len('SOV'):]
    well_formed = (
        ext in ('.xls', '.xlsx')
        and stem.startswith('SOV')
        and code.isdigit()
        and len(code) > 2
    )
    if not well_formed:
        print('skipping unrecognized file:', f)
        continue

    # Drop the last two digits (presumably the day of the month) to keep YYMM.
    d = code[:-2]
    # Two-digit years starting with 9 are 19xx; everything else is 20xx.
    if d[0] == '9':
        d = '19' + d
    else:
        d = '20' + d
    vote_data[d] = {'filename': f}


In [36]:
# Regex alternations handed to dpf.find_matching_sheets: `phrase` is what a
# worksheet name should match (local propositions / measures), `phrase_state`
# is what it should not match (state-level sheets).
phrase = 'prop|meas'
phrase_state = 'state|st'

for d in vote_data.keys():
    f = vote_data[d]['filename']
    # Only legacy .xls workbooks are scanned here, because other formats need
    # a different package to read them. splitext is used instead of
    # f.split('.')[1] so names without a dot (or with several) cannot raise
    # IndexError or be misclassified.
    if os.path.splitext(f)[1] == '.xls':
        wkbk = open_workbook(os.path.join(datapath, f))

        # Find the sheet(s) with local ballot proposals.
        sheets_w_props = dpf.find_matching_sheets(wkbk, to_match=phrase, to_not_match=phrase_state)
        vote_data[d]['sheet_names'] = sheets_w_props

# Elections whose workbooks have unusual sheet naming: set the sheets by hand.
vote_data['199711']['sheet_names'] = ['A - D', 'E - F']
vote_data['199706']['sheet_names'] = ['970603']
vote_data['199911']['sheet_names'] = ['E to H', 'I to K']

vote_data['201411']['sheet_names'] = ['370 - Local Measure F']
vote_data['201511']['sheet_names'] = ['180 - Local Measure D', '205 - Local Measure I']



In [37]:
# Load the ballot propositions we want for each election and attach their
# letters to the matching entry in vote_data.
proposals = pd.read_excel('../data/BallotPropositions_nimby2.xlsx')

# Build a 'YYYYMM' string key in the same format as the vote_data keys.
# zfill(2) left-pads single-digit months with a zero, independent of the
# frame's index labels.
proposals['Year_str'] = proposals['Year'].astype(str)
proposals['Mo_str'] = proposals['Month2'].astype(str).str.zfill(2)
proposals['Date_str'] = proposals['Year_str'] + proposals['Mo_str']

for d in vote_data.keys():
    p_list = list(proposals.loc[proposals['Date_str'] == d, 'Letter'].values)
    # One *separate* empty dict per proposition letter. dict.fromkeys(p_list, {})
    # would make every letter share the same dict object, so a later in-place
    # write for one letter would show up under all of them.
    vote_data[d]['props'] = {letter: {} for letter in p_list}


In [38]:
# Trim the sheet lists so that only the sheets we actually need remain.
sheet_overrides = {
    '199711': ['E - F'],
    '199811': ['City Prop A-E'],
    '200011': ['Prop K-O'],
    '200111': ['AMENDMENTS'],
}
for election_key, needed_sheets in sheet_overrides.items():
    vote_data[election_key]['sheet_names'] = needed_sheets

In [39]:
# Record, for every proposition letter, the worksheet that holds its data.
for d, election in vote_data.items():
    letters = list(election['props'])
    sheets = election['sheet_names']

    if len(letters) > 1 and len(sheets) > 1:
        # Several letters spread over several sheets: the pairing cannot be
        # derived automatically, so just flag the election here and match
        # it by hand below.
        print('has multiple sheets:', d)
        continue

    # Otherwise every letter lives on the first (normally the only) sheet.
    # With a single letter there is only one sheet to match it with.
    first_sheet = sheets[0]
    if len(letters) > 1:
        targets = letters
    else:
        targets = [letters[0]]
    for l in targets:
        election['props'][l] = {'s_name': first_sheet}

# Hand-made matches for the elections with multiple sheets and multiple letters.
manual_matches = {
    '201511': {
        'D': {'s_name': '180 - Local Measure D'},
        'I': {'s_name': '205 - Local Measure I'},
    },
    '199911': {
        'H': {'s_name': 'E to H', 'col_name': 'PROP H'},
        'I': {'s_name': 'I to K', 'col_name': 'PROP I'},
        'J': {'s_name': 'I to K', 'col_name': 'PROP J'},
    },
    '201311': {
        'B': {'s_name': 'Measure A & B'},
        'C': {'s_name': 'Measure C & D'},
    },
}
for election_key, by_letter in manual_matches.items():
    for letter, sheet_info in by_letter.items():
        vote_data[election_key]['props'][letter] = sheet_info

has multiple sheets: 201311
has multiple sheets: 201511
has multiple sheets: 199911


In [40]:
# Define the custom parameters we'll need to read in the excel files.
# The result of dpf.define_excel_params replaces vote_data.
# NOTE(review): presumably the parameters are stored per election/proposition
# inside the returned dict; confirm in data_prep_functions.
vote_data = dpf.define_excel_params(vote_data)

In [41]:
# Read the worksheet for every (election, proposition) pair and store the
# resulting frame under vote_data[d]['props'][l]['data'].
for d in list(vote_data.keys()):
    for l in vote_data[d]['props'].keys():
        data = dpf.read_vote_sheet(d, l, vote_df=vote_data, path=datapath)

        # Sheets that did not already come back with a MultiIndex are
        # converted first. pd.MultiIndex is the public name of the class;
        # the old pd.core.index.MultiIndex path is deprecated/removed in
        # current pandas.
        if not isinstance(data.index, pd.MultiIndex):
            desc = dpf.check_if_descriptive(d)
            data = dpf.format_df_to_multiindex(data, descriptive_labels=desc)

        # Index/column renaming is applied in both cases.
        data = dpf.rename_index_and_cols(data)

        vote_data[d]['props'][l]['data'] = data


In [42]:
# Verify totals: run dpf.verify_vote_totals over all elections, passing
# 'verify_percentages.csv' as the output file name (fname).

dpf.verify_vote_totals(vote_data, fname='verify_percentages.csv')

# Findings from checking the sums: some of them are off.
# For the 1990s elections there are three unexplained "ballot types" at the
# beginning of the sheet that have no precinct, and those are what throw off
# the totals. They might be votes from people who do not vote at an address
# (e.g. prisoners); this is unconfirmed.
# So when reporting full results, rely on the original sheet rather than the
# dataframe. Otherwise everything checks out.
   

## Process data

In [48]:

# Apply dpf.consolidate_abs to the election data through dpf.process_votedata.
vote_data1 = dpf.process_votedata(vote_data, dpf.consolidate_abs)
# NOTE(review): this is not the last expression of the cell, so the keys view
# is never displayed; it has no visible effect.
vote_data1.keys()
# NOTE(review): this inspects `vote_data`, not the freshly built `vote_data1`.
# Confirm which one was meant to be checked.
print(vote_data['200806']['props']['G']['data'].head())


          voted  YES   NO  registered
precinct                             
PCT 1101    315  201   90         806
PCT 1102    397  247  115         937
PCT 1103    283  169   90         750
PCT 1104    335  198  112         844
PCT 1105    282  161   92         884
PCT 1106    356  168  111         865
PCT 1107    324  184  120         876
PCT 1108    248  147   82         754
PCT 1109    304  180  103         817
PCT 1111    224  125   74         575
               voted  YES   NO  registered
precinct                                  
PCT 2001         218   85  100         441
PCT 2002         402  161  181        1006
PCT 2003         355  133  175         778
PCT 2004         275  103  128         645
PCT 2005         415  173  197         972
PCT 2006         255   80  136         563
PCT 2007         296  104  156         672
PCT 2008         266   90  137         599
PCT 2009         419  164  195         932
PCT 2011/2012    456  171  209        1090


## Define variables

In [32]:
# Apply dpf.make_vote_variables through dpf.process_votedata; the date key and
# proposition key are passed along (use_datekey / use_propkey) together with
# the proposals table (df_prop).
vote_data2 = dpf.process_votedata(vote_data1, dpf.make_vote_variables, use_datekey=True, use_propkey=True,df_prop=proposals)

## Merge voting data with census data by precinct

In [33]:
# Format precinct names (dpf.format_precincts) so they can serve as the merge key.
# NOTE(review): that the formatted name is the merge key is an assumption; confirm in dpf.
vote_data3 = dpf.process_votedata(vote_data2, dpf.format_precincts)

# Merge the voting data with the census data (dpf.merge_vote_census); the
# election date key is passed to the merge function (use_datekey=True).
vote_data4 = dpf.process_votedata(vote_data3, dpf.merge_vote_census, use_datekey=True)


KeyboardInterrupt: 

In [None]:
# Spot check: pull one merged frame (March 2004, Prop J) and list its columns.
df_test = vote_data4['200403']['props']['J']['data']
df_test.columns

# Prepare data for R, then save

Build a single dataset that combines all elections, making sure it includes proposal and year dummies.

Create dummy variables:  
- year

- presidential election year

- November election
 

In [None]:
# Three successive passes through dpf.process_votedata, each feeding the next:
#   1. dpf.make_election_dummies (given both the date key and the prop key)
#   2. dpf.fix_yr_built_moved    (given the date key)
#   3. dpf.adjust_inflation      (given the date key)
vote_data5 = dpf.process_votedata(vote_data4, dpf.make_election_dummies, use_datekey=True, use_propkey=True)
vote_data6 = dpf.process_votedata(vote_data5, dpf.fix_yr_built_moved, use_datekey=True)
vote_data7 = dpf.process_votedata(vote_data6, dpf.adjust_inflation, use_datekey=True)

In [None]:
# Combine the per-election, per-proposition frames into one dataset.
# NOTE(review): presumably rows are stacked, one block per proposition;
# confirm in dpf.combine_dataframes.
all_data = dpf.combine_dataframes(vote_data7)

In [None]:
# Year-dummy columns: a missing value means "not this year", so fill with False.
dummy_years = (1996, 1997, 1998, 1999, 2000, 2001, 2002, 2004, 2006,
               2008, 2013, 2014, 2015)
yr_cols = ['yr_{}'.format(year) for year in dummy_years]

year_dummies = all_data[yr_cols].fillna(False)
all_data[yr_cols] = year_dummies

# Rename columns to get rid of the "_wgt" suffix.
all_data = dpf.rename_columns(all_data)


In [None]:
# Save the combined dataset as a CSV stamped with today's date (MMDDYYYY).
filepath = '../results/'
d = date.today().strftime('%m%d%Y')
filename = 'voting_data_all_{}.csv'.format(d)

# Create the output folder if needed so a fresh checkout does not fail here.
os.makedirs(filepath, exist_ok=True)
all_data.to_csv(os.path.join(filepath, filename), index=False)


In [None]:
 
# Print the number of rows for each year/proposition label, in sorted order.
yr_prop_cols = list(all_data['yr_prop'].unique())
yr_prop_cols.sort()
for label in yr_prop_cols:
    rows_for_label = all_data.loc[all_data['yr_prop'] == label]
    print(label, rows_for_label.shape[0])


# Prepare data for visualization

- Join result data with precincts, on prec ID, matching appropriate years.

- Write to geojson files - 1 geojson file for each precinct layer (3 files)

In [None]:
# make a column with year_month: yr_prop minus its last character, as an int
all_data['yr_mo'] = all_data['yr_prop'].str[:-1].astype(int)

prec_keys = ['pre1992', 'pre2002', 'pre2012']

# Columns not needed on the maps; removed to reduce file size.
cols_to_drop = ['foreign_born', 'yr_1996', 'yr_1997', 'yr_1998', 'yr_1999',
                'yr_2000', 'yr_2001', 'yr_2002', 'yr_2004', 'yr_2006',
                'yr_2008', 'yr_2013', 'yr_2014', 'yr_2015']

# Output folder for the GeoJSON files; created up front if it is missing.
path = '../results/maps/'
os.makedirs(path, exist_ok=True)

# One output file per precinct layer.
for prec_key in prec_keys:
    yr = prec_key[-4:]

    # load precinct shapefiles as geodataframes
    prec_df = load_prec_shp(yr)

    # filter all data based on the year
    vote_df = dpf.filter_for_dates(all_data, prec_key)

    # merge on precinct id
    merged = pd.merge(prec_df, vote_df, on='precname')

    # Drop the unneeded columns that are actually present. The previous
    # try/except ValueError no longer catches a missing column (current
    # pandas raises KeyError), and on failure it dropped nothing at all.
    missing = [c for c in cols_to_drop if c not in merged.columns]
    if missing:
        print('Columns not found, so not dropped: {}'.format(missing))
    merged = merged.drop(columns=[c for c in cols_to_drop if c in merged.columns])

    # write to geojson by writing to a json string
    # NOTE(review): assumes `merged` is still a GeoDataFrame, so that
    # to_json() emits GeoJSON; confirm load_prec_shp's return type.
    filename = 'results_{}.geojson'.format(prec_key)
    with open(os.path.join(path, filename), 'w') as f:
        f.write(merged.to_json())
        
    