# VEST MI 2020 - US President and US Senate

**TODO/where lily left off (last updated 9/9/2021):**
- Figure out county 153 and 015 problem - votes reversed as you can see below
- match remaining precinct ids (less than 100 out of 4700, however it is by number not by name so a little harder to hand match...)
- make manual vote adjustments noted in documentation and re-run avcb reallocation
- make modifications for AVCBs, counties, etc that are listed in documentation
- run validation for df and gdf

In [1]:
import pandas as pd
import geopandas as gp
import numpy as np
import matplotlib.pyplot as plt

## Load VEST File

In [2]:
gdfv = gp.read_file('./raw_from_source/mi_2020/mi_2020.shp')
gdfv.head()

Unnamed: 0,PRECINCTID,COUNTYFIPS,cousubname,elexpre,G20PRERTRU,G20PREDBID,G20PRELJOR,G20PREGHAW,G20PRENDEL,G20PRETBLA,G20USSRJAM,G20USSDPET,G20USSGSQU,G20USSNDER,G20USSTWIL,geometry
0,WP-001-01040-00001,1,Alcona township,001-ALCONA TOWNSHIP-0-0001,564,248,3,2,0,2,539,267,4,2,3,"POLYGON ((-83.29467 44.77346, -83.29577 44.773..."
1,WP-001-12460-00001,1,Caledonia township,001-CALEDONIA TOWNSHIP-0-0001,508,245,4,0,0,0,485,261,1,0,4,"POLYGON ((-83.64206 44.81382, -83.64578 44.813..."
2,WP-001-19320-00001,1,Curtis township,001-CURTIS TOWNSHIP-0-0001,486,238,2,1,0,1,456,240,5,4,10,"POLYGON ((-83.64530 44.51091, -83.64918 44.510..."
3,WP-001-34820-00001,1,Greenbush township,001-GREENBUSH TOWNSHIP-0-0001,560,302,9,1,0,1,531,322,4,5,6,"POLYGON ((-83.31858 44.51165, -83.32054 44.511..."
4,WP-001-35740-00001,1,Gustin township,001-GUSTIN TOWNSHIP-0-0001,317,112,9,0,0,0,306,122,1,0,6,"POLYGON ((-83.40227 44.59806, -83.41508 44.598..."


## Load Shapefile

In [3]:
shp = gp.read_file('./raw_from_source/2020_Voting_Precincts/2020_Voting_Precincts.shp')
crs = gdfv.crs
shp = shp.to_crs(crs)
shp.head()

Unnamed: 0,PRECINCTID,ELECTIONYE,COUNTYFIPS,MCDFIPS,WARD,PRECINCT,OBJECTID_1,OBJECTID,NAME,STATEFP,VTDST,FUNCSTAT,VTDI,LSAD,ShapeSTAre,ShapeSTLen,geometry
0,WP-001-01040-00001,2020,1,1040,0,1,1,1,10104000001,26,1001,N,A,V1,170107600.0,73869.558749,"POLYGON ((-83.29467 44.77345, -83.29577 44.773..."
1,WP-001-12460-00001,2020,1,12460,0,1,2,2,11246000001,26,1002,N,A,V1,188086000.0,76737.144518,"POLYGON ((-83.64206 44.81382, -83.64578 44.813..."
2,WP-001-19320-00001,2020,1,19320,0,1,3,3,11932000001,26,1003,N,A,V1,183323100.0,57470.528359,"POLYGON ((-83.64530 44.51090, -83.64918 44.510..."
3,WP-001-34820-00001,2020,1,34820,0,1,4,4,13482000001,26,1004,N,A,V1,67679580.0,32987.191075,"POLYGON ((-83.31858 44.51164, -83.32053 44.511..."
4,WP-001-35740-00001,2020,1,35740,0,1,5,5,13574000001,26,1005,N,A,V1,92810100.0,38642.299469,"POLYGON ((-83.40226 44.59805, -83.41507 44.598..."


## Load Election Results

In [18]:
## Load + Process format
df = pd.read_csv('./raw_from_source/2020GEN/2020vote.txt', delimiter = '\t', header = None)

#columns defined in documentation
df.columns = ['election_year','election_type','office_code','district_code',
            'status_code','candidate_id','county_code','city_town_code','ward_number',
            'precinct_number','precinct_label','precinct_votes','na_col']

#Processing
#Filter to office code `1` for president
df = df[(df['office_code'] == 1)|(df['office_code'] == 5)]

#Have to replace NaNs in `precinct_label` with empty strings so that we can use it as an index in the pivot
df['precinct_label'] = df['precinct_label'].replace(np.nan, '', regex = True)

#Candidate codes found in raw_from_source>2020GEN>2020name.txt
candidate_cols = [-1403, -1401, -1385, -1373, -1398, -1391, -1283, -1304, -1365, -1397, -1393]
df = df[df['candidate_id'].isin(candidate_cols)]

## PIVOT RESULTS
df_pivot = df.pivot_table(index = ['county_code','city_town_code','ward_number','precinct_number','precinct_label'],
                         columns = ['candidate_id'],
                        values = ['precinct_votes'],
                         aggfunc = 'sum')
df_pivot.reset_index(inplace = True)

#Note that this file does not have an "other" column for other candidates - different from previous years

candidate_id_dict = {(    'county_code',    ''):'county_code',( 'city_town_code',    ''):'city_town_code',
                     (    'ward_number',    ''):'ward_number',('precinct_number',    ''):'precinct_number',
                     ( 'precinct_label',    ''):'precinct_label',
                     ('precinct_votes', -1403):'G20PRERTRU', ('precinct_votes', -1401):'G20PREDBID',('precinct_votes', -1385):'G20PRELJOR', 
                    ('precinct_votes', -1373):'G20PREGHAW', ('precinct_votes', -1398):'G20PRENDEL',('precinct_votes', -1391):'G20PRETBLA', 
                    ('precinct_votes', -1283):'G20USSRJAM',('precinct_votes', -1304):'G20USSDPET',('precinct_votes', -1365):'G20USSGSQU', 
                    ('precinct_votes', -1397):'G20USSNDER',('precinct_votes', -1393):'G20USSTWIL'}

df_pivot.columns = df_pivot.columns.map(candidate_id_dict)

In [19]:
df_pivot.columns

Index(['county_code', 'city_town_code', 'ward_number', 'precinct_number',
       'precinct_label', 'G20PRERTRU', 'G20PREDBID', 'G20PRENDEL',
       'G20USSNDER', 'G20USSTWIL', 'G20PRETBLA', 'G20PRELJOR', 'G20PREGHAW',
       'G20USSGSQU', 'G20USSDPET', 'G20USSRJAM'],
      dtype='object')

In [20]:
df_pivot

Unnamed: 0,county_code,city_town_code,ward_number,precinct_number,precinct_label,G20PRERTRU,G20PREDBID,G20PRENDEL,G20USSNDER,G20USSTWIL,G20PRETBLA,G20PRELJOR,G20PREGHAW,G20USSGSQU,G20USSDPET,G20USSRJAM
0,1,2,0,1,,564.0,248.0,0.0,2.0,3.0,2.0,3.0,2.0,4.0,267.0,539.0
1,1,4,0,1,,508.0,245.0,0.0,0.0,4.0,0.0,4.0,0.0,1.0,261.0,485.0
2,1,6,0,1,,486.0,238.0,0.0,4.0,10.0,1.0,2.0,1.0,5.0,240.0,456.0
3,1,8,0,1,,560.0,302.0,0.0,5.0,6.0,1.0,9.0,1.0,4.0,322.0,531.0
4,1,10,0,1,,317.0,112.0,0.0,0.0,6.0,0.0,9.0,0.0,1.0,122.0,306.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4918,83,32,0,1,,490.0,166.0,0.0,5.0,3.0,2.0,6.0,0.0,3.0,163.0,485.0
4919,83,52,0,1,,973.0,600.0,2.0,4.0,19.0,1.0,30.0,7.0,11.0,589.0,974.0
4920,83,52,0,2,,881.0,652.0,0.0,6.0,17.0,4.0,33.0,3.0,12.0,657.0,871.0
4921,83,52,0,3,,958.0,659.0,0.0,1.0,8.0,3.0,29.0,6.0,19.0,636.0,988.0


In [21]:
#WANT COUNTY FIPS IN ELECTION RESULTS FILE rather than just "county_code"
## using county code and county name, create county fips code in election results df
cty2020 = pd.read_csv('raw_from_source/2020GEN/county.txt',delimiter = '\t', header = None)
county_names = pd.read_csv('./raw_from_source/county_codes.csv')
df_pivot = df_pivot.merge(county_names, on = 'county_code')

#### Add City/Township Name
#Loading in the `2020city.txt` file that has the corresponding names for the city codes in the election results file
city_codes = pd.read_csv('./raw_from_source/2020GEN/2020city.txt',delimiter='\t',header=None)
city_codes.columns = ['election_year','election_type','county_code','city_town_code','city_town_name','null']
city_codes.drop(['election_year','election_type','null'],axis=1,inplace=True)
df_pivot = df_pivot.merge(city_codes, on = ['county_code','city_town_code'])

In [22]:
df_pivot

Unnamed: 0,county_code,city_town_code,ward_number,precinct_number,precinct_label,G20PRERTRU,G20PREDBID,G20PRENDEL,G20USSNDER,G20USSTWIL,G20PRETBLA,G20PRELJOR,G20PREGHAW,G20USSGSQU,G20USSDPET,G20USSRJAM,county_name,county_fips,city_town_name
0,1,2,0,1,,564.0,248.0,0.0,2.0,3.0,2.0,3.0,2.0,4.0,267.0,539.0,ALCONA,1,ALCONA TOWNSHIP
1,1,4,0,1,,508.0,245.0,0.0,0.0,4.0,0.0,4.0,0.0,1.0,261.0,485.0,ALCONA,1,CALEDONIA TOWNSHIP
2,1,6,0,1,,486.0,238.0,0.0,4.0,10.0,1.0,2.0,1.0,5.0,240.0,456.0,ALCONA,1,CURTIS TOWNSHIP
3,1,8,0,1,,560.0,302.0,0.0,5.0,6.0,1.0,9.0,1.0,4.0,322.0,531.0,ALCONA,1,GREENBUSH TOWNSHIP
4,1,10,0,1,,317.0,112.0,0.0,0.0,6.0,0.0,9.0,0.0,1.0,122.0,306.0,ALCONA,1,GUSTIN TOWNSHIP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4918,83,32,0,1,,490.0,166.0,0.0,5.0,3.0,2.0,6.0,0.0,3.0,163.0,485.0,WEXFORD,165,WEXFORD TOWNSHIP
4919,83,52,0,1,,973.0,600.0,2.0,4.0,19.0,1.0,30.0,7.0,11.0,589.0,974.0,WEXFORD,165,CADILLAC CITY
4920,83,52,0,2,,881.0,652.0,0.0,6.0,17.0,4.0,33.0,3.0,12.0,657.0,871.0,WEXFORD,165,CADILLAC CITY
4921,83,52,0,3,,958.0,659.0,0.0,1.0,8.0,3.0,29.0,6.0,19.0,636.0,988.0,WEXFORD,165,CADILLAC CITY


In [23]:
df_pivot[df_pivot['city_town_name'] == '{Statistical Adjustments}'].groupby('county_fips').sum()

Unnamed: 0_level_0,county_code,city_town_code,ward_number,precinct_number,G20PRERTRU,G20PREDBID,G20PRENDEL,G20USSNDER,G20USSTWIL,G20PRETBLA,G20PRELJOR,G20PREGHAW,G20USSGSQU,G20USSDPET,G20USSRJAM
county_fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,1,9999,0,9999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,3,9999,0,9999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,9,9999,0,9999,-32.0,-35.0,0.0,0.0,0.0,0.0,-3.0,-1.0,-1.0,-32.0,-37.0
25,13,9999,0,9999,2.0,8.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,7.0,5.0
27,14,9999,0,9999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,15,9999,0,9999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33,17,9999,0,9999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,18,9999,0,9999,-18.0,-9.0,0.0,0.0,-1.0,-1.0,0.0,0.0,0.0,-8.0,-19.0
37,19,9999,0,9999,-314.0,-924.0,-1.0,-2.0,-3.0,0.0,-5.0,-4.0,-4.0,-892.0,-348.0
45,23,9999,0,9999,-309.0,-1648.0,-2.0,-4.0,-27.0,-4.0,-19.0,-9.0,-33.0,-1636.0,-272.0


## Remove statistical adjustments?

**From documentation, it seems to me VEST cut out the STAT ADJUSTs and did not redistribute. Am I (lily) interpreting correctly?**

"Some counties report "Statistical Adjustments" with their precinct results, which can either be positive or negative. Many of these are related to cities that span county lines, as they often match precinct results of these county-spanning cities. In some cases these adjustments cancel each other out across the two counties that share the city, but some (notably Clinton County's East Lansing and Eaton County's Lansing adjustments) are not reciprocated, which introduces error to statewide totals. Other than these, there appears to be a handful of tiny adjustments that were made to make the precinct totals match official countywide totals. None of these adjustments of any type were distributed to precincts, which may result in candidate totals being slightly off official totals."

In [24]:
df_pivot = df_pivot[df_pivot['city_town_name'] != '{Statistical Adjustments}']

## Check Election Results at State and County Levels

In [25]:
#Function to check column/race totals
def colum_total_check(column_list, elections_df, vest_df):
    for val in column_list:
        vote_dif = elections_df[val].sum()-vest_df[val].sum()
        if (vote_dif == 0):
            print(val+": EQUAL", ' - total: ', str(elections_df[val].sum()))
        else:
            print(val+": DIFFERENCE OF " + str(vote_dif)+ " VOTES", '  rdh total: ', str(elections_df[val].sum()), ', vest total: ', str(vest_df[val].sum()))
            
#Function to check county totals
def county_total_check(column_list, elections_df, vest_df):
    print("Counties with differences printed below:")
    diff_counties=[]
    for i in column_list:
        diff = elections_df.groupby(["county_fips"]).sum()[i]-vest_df.groupby(["COUNTYFIPS"]).sum()[i]
        for val in diff[diff != 0].index.values.tolist():
            if val not in diff_counties:
                diff_counties.append(val)
        if len(diff[diff != 0]!=0):
            print(i)
            #print(diff)
            print(diff[diff != 0].to_string(header=False))
    print("")
    print("All other races in all counties are equal")

In [26]:
#Set up to run checks
df_pivot['county_fips'] = df_pivot['county_fips'].map(lambda x: str(x).zfill(3))
df_pivot = df_pivot.reset_index()
df_pivot = df_pivot.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pivot['county_fips'] = df_pivot['county_fips'].map(lambda x: str(x).zfill(3))


In [27]:
column_list = ['G20PRERTRU','G20PREDBID', 'G20PRELJOR', 'G20PREGHAW', 'G20PRENDEL', 'G20PRETBLA',
       'G20USSRJAM', 'G20USSDPET', 'G20USSGSQU', 'G20USSNDER', 'G20USSTWIL']

colum_total_check(column_list, df_pivot, gdfv)

G20PRERTRU: EQUAL  - total:  2649859.0
G20PREDBID: EQUAL  - total:  2804036.0
G20PRELJOR: EQUAL  - total:  60406.0
G20PREGHAW: EQUAL  - total:  13718.0
G20PRENDEL: EQUAL  - total:  2985.0
G20PRETBLA: EQUAL  - total:  7204.0
G20USSRJAM: EQUAL  - total:  2642221.0
G20USSDPET: EQUAL  - total:  2734558.0
G20USSGSQU: EQUAL  - total:  39217.0
G20USSNDER: EQUAL  - total:  13093.0
G20USSTWIL: EQUAL  - total:  50596.0


In [28]:
county_total_check(column_list, df_pivot, gdfv)

Counties with differences printed below:
G20PRERTRU
015    550.0
153   -550.0
G20PREDBID
015    480.0
153   -480.0
G20PRELJOR
015    17.0
153   -17.0
G20PREGHAW
015    2.0
153   -2.0
G20PRENDEL
015    1.0
153   -1.0
G20PRETBLA
015    1.0
153   -1.0
G20USSRJAM
015    569.0
153   -569.0
G20USSDPET
015    442.0
153   -442.0
G20USSGSQU
015    9.0
153   -9.0
G20USSNDER
015    3.0
153   -3.0
G20USSTWIL
015    18.0
153   -18.0

All other races in all counties are equal


Why the problem in county 015 and 153 specifically? Strange how the differences between elections file and VEST file are inverse between the counties, and yet just looking at the results do not appear to be related...

And they do not appear in the documentation as the counties with overlap.

Note however the repeat in city_town_name in 015... though i guess we do not expect that to be unique?

In [38]:
(df_pivot.city_town_name + df_pivot.county_fips).nunique()

1534

In [36]:
df_pivot[(df_pivot.county_fips == '015')]

Unnamed: 0,index,county_code,city_town_code,ward_number,precinct_number,precinct_label,G20PRERTRU,G20PREDBID,G20PRENDEL,G20USSNDER,G20USSTWIL,G20PRETBLA,G20PRELJOR,G20PREGHAW,G20USSGSQU,G20USSDPET,G20USSRJAM,county_name,county_fips,city_town_name
123,125,8,2,0,1,,888.0,274.0,1.0,5.0,14.0,1.0,8.0,1.0,9.0,284.0,845.0,BARRY,15,ASSYRIA TOWNSHIP
124,126,8,4,0,1,,787.0,283.0,2.0,8.0,19.0,1.0,15.0,2.0,3.0,279.0,766.0,BARRY,15,BALTIMORE TOWNSHIP
125,127,8,6,0,1,,1202.0,700.0,0.0,6.0,17.0,4.0,26.0,12.0,21.0,649.0,1231.0,BARRY,15,BARRY TOWNSHIP
126,128,8,8,0,1,,989.0,425.0,1.0,3.0,20.0,3.0,21.0,0.0,3.0,414.0,992.0,BARRY,15,CARLTON TOWNSHIP
127,129,8,10,0,1,,1038.0,457.0,1.0,9.0,19.0,1.0,14.0,6.0,10.0,473.0,989.0,BARRY,15,CASTLETON TOWNSHIP
128,130,8,12,0,1,,1156.0,555.0,1.0,6.0,8.0,1.0,24.0,4.0,9.0,557.0,1152.0,BARRY,15,HASTINGS TOWNSHIP
129,131,8,14,0,1,,1253.0,621.0,0.0,7.0,28.0,4.0,24.0,1.0,9.0,619.0,1225.0,BARRY,15,HOPE TOWNSHIP
130,132,8,16,0,1,,1517.0,583.0,1.0,2.0,13.0,4.0,28.0,4.0,10.0,571.0,1527.0,BARRY,15,IRVING TOWNSHIP
131,133,8,18,0,1,,1186.0,578.0,1.0,5.0,17.0,3.0,20.0,3.0,14.0,579.0,1157.0,BARRY,15,JOHNSTOWN TOWNSHIP
132,134,8,20,0,1,,608.0,191.0,0.0,3.0,6.0,4.0,5.0,1.0,3.0,183.0,600.0,BARRY,15,MAPLE GROVE TOWNSHIP


In [30]:
df_pivot[(df_pivot.county_fips == '153')].groupby('county_fips').sum()

Unnamed: 0_level_0,index,county_code,city_town_code,ward_number,precinct_number,G20PRERTRU,G20PREDBID,G20PRENDEL,G20USSNDER,G20USSTWIL,G20PRETBLA,G20PRELJOR,G20PREGHAW,G20USSGSQU,G20USSDPET,G20USSRJAM
county_fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
153,38907,847,228,0,14,3090.0,1589.0,4.0,11.0,27.0,10.0,40.0,12.0,17.0,1712.0,2905.0


**NOTE:** Counties above with differences (015 and 153) do not appear in list of counties that had statistical adjustments... Appears votes totals at the county level are reversed between the two, but are they also reversed by precinct?

In [34]:
gdfv[(gdfv.COUNTYFIPS == '153')].groupby('COUNTYFIPS').sum()

Unnamed: 0_level_0,G20PRERTRU,G20PREDBID,G20PRELJOR,G20PREGHAW,G20PRENDEL,G20PRETBLA,G20USSRJAM,G20USSDPET,G20USSGSQU,G20USSNDER,G20USSTWIL
COUNTYFIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
153,3640,2069,57,14,5,11,3474,2154,26,14,45


## Read in Census File to get MCDFIPs code and more to use to recreate VEST's 'PRECINCTID' column

In [35]:
census = pd.read_csv('./raw_from_source/all-geocodes-v2020.csv')
censusmi = census[census['State Code (FIPS)'] == 26]
#After `Place Code` != 0, repeat values, so want to filter that out. Then see that # unique values = 1520 for County Sub, indicating same as `Juris` in `shp`
censusmi = censusmi[(censusmi['Place Code (FIPS)'] == 0) & (censusmi['County Subdivision Code (FIPS)'] != 0)]

Notes from 2016 to apply to 2020 as well:
- Census file uses "Charter Township" whereas election df just says "Township". Also, census file says "City City" in some places
- Seems like best bet is to make Area Names consistent, make column with the county fips joined, then use dictionary to match to the Juris code
- Before removing all instances of charter from the censusmi, wanted to make sure df does not contain instances - it doesnt.

In [36]:
df_pivot[df_pivot['city_town_name'].str.contains('charter')]

Unnamed: 0,index,county_code,city_town_code,ward_number,precinct_number,precinct_label,G20PRERTRU,G20PREDBID,G20PRENDEL,G20USSNDER,G20USSTWIL,G20PRETBLA,G20PRELJOR,G20PREGHAW,G20USSGSQU,G20USSDPET,G20USSRJAM,county_name,county_fips,city_town_name


In [37]:
df_pivot[df_pivot['city_town_name'].str.contains('CITY CITY')]

Unnamed: 0,index,county_code,city_town_code,ward_number,precinct_number,precinct_label,G20PRERTRU,G20PREDBID,G20PRENDEL,G20USSNDER,G20USSTWIL,G20PRETBLA,G20PRELJOR,G20PREGHAW,G20USSGSQU,G20USSDPET,G20USSRJAM,county_name,county_fips,city_town_name


## Clean census file to make match column with election file (df_pivot)

In [38]:
censusmi['city_town_name'] = censusmi['Area Name (including legal/statistical area description)'].str.replace('charter township', 'township')
censusmi['city_town_name'] = censusmi['city_town_name'].str.replace('City city', 'city')
censusmi['city_town_name'] = censusmi['city_town_name'].str.upper()

In [44]:
print('City/town names in census file not in df - ', '\n', set(censusmi['city_town_name'].str.upper()) - set(df_pivot['city_town_name']))
print('City/town names in df not in census file - ', '\n', set(df_pivot['city_town_name']) - set(censusmi['city_town_name'].str.upper()))
#Use^ to create dictionary below

City/town names in census file not in df -  
 {'VILLAGE OF CLARKSTON CITY', 'PLEASANTVIEW TOWNSHIP', "L'ANSE TOWNSHIP", 'ST. JAMES TOWNSHIP', 'GUN PLAIN TOWNSHIP', 'COLDSPRINGS TOWNSHIP', 'VILLAGE OF GROSSE POINTE SHORES CITY', 'DEWITT CITY', 'LAGRANGE TOWNSHIP', 'DEWITT TOWNSHIP'}
City/town names in df not in census file -  
 {'ST JAMES TOWNSHIP', 'GUNPLAIN TOWNSHIP', 'LA GRANGE TOWNSHIP', 'COLD SPRINGS TOWNSHIP', 'DE WITT CITY', 'DE WITT TOWNSHIP', 'PLEASANT VIEW TOWNSHIP', 'GROSS POINTE SHORES CITY', 'LANSE TOWNSHIP', 'GROSSE POINTE SHORES CITY', 'CLARKSTON CITY'}


In [48]:
census_to_df_city_dict = {'COLDSPRINGS TOWNSHIP': 'COLD SPRINGS TOWNSHIP',
 'DEWITT CITY': 'DE WITT CITY',
 'DEWITT TOWNSHIP': 'DE WITT TOWNSHIP',
 'GUN PLAIN TOWNSHIP': 'GUNPLAIN TOWNSHIP',
 "L'ANSE TOWNSHIP": 'LANSE TOWNSHIP',
 'LAGRANGE TOWNSHIP': 'LA GRANGE TOWNSHIP',
 'PLEASANTVIEW TOWNSHIP': 'PLEASANT VIEW TOWNSHIP',
 'ST. JAMES TOWNSHIP': 'ST JAMES TOWNSHIP',
 'VILLAGE OF CLARKSTON CITY': 'CLARKSTON CITY',
 'VILLAGE OF GROSSE POINTE SHORES CITY': 'GROSSE POINTE SHORES CITY'}

censusmi.loc[censusmi['city_town_name'].isin(census_to_df_city_dict.keys()), 'city_town_name'] = censusmi.loc[censusmi['city_town_name'].isin(census_to_df_city_dict.keys()), 'city_town_name'].map(census_to_df_city_dict)
censusmi.loc[(censusmi['Area Name (including legal/statistical area description)'] == 'Village of Grosse Pointe Shores city')&(censusmi['County Code (FIPS)']==99), 'city_town_name'] = 'GROSS POINTE SHORES CITY'

In [47]:
#Alternate option to replacing values in census file:
#Reverse dictionary order to match vest's naming conventions which match the census
#However, given that we just want the codes and are using the names just for dictionary to translate, shouldnt matter.
#df_pivot.loc[df_pivot['city_town_name'].isin(census_to_df_city_dict.values()), 'city_town_name'] = df_pivot.loc[df_pivot['city_town_name'].isin(census_to_df_city_dict.values()), 'city_town_name'].map({value : key for (key, value) in census_to_df_city_dict.items()})

## Translate Census file JURISDICTION code in to df

Now that the city town names match, I can create a dictionary - census file juris code to df. First I need a column in each with the county code + the city town name

In [49]:
# column for election results file match
censusmi['county_city_id_df'] = censusmi['County Code (FIPS)'].astype(str).str.zfill(3) + censusmi['city_town_name']
# column for vest file match
censusmi['county_city_id_gdf'] = censusmi['County Code (FIPS)'].astype(str).str.zfill(3) + censusmi['Area Name (including legal/statistical area description)'].str.upper()
# election results column to match census
df_pivot['county_city_id'] = df_pivot['county_fips'] + df_pivot['city_town_name']
# vest file column to match census
gdfv['county_city_id'] = gdfv['COUNTYFIPS']+ gdfv['cousubname'].str.upper()

#**The census identifier has more values by 8 than the df, so just keep in mind when mapping.**

### Apply dictionary to make column in df
county_city_id_to_mcdjuris_dict_df = pd.Series(censusmi['County Subdivision Code (FIPS)'].values, index = censusmi['county_city_id_df']).to_dict()
county_city_id_to_mcdjuris_dict_gdf = pd.Series(censusmi['County Subdivision Code (FIPS)'].values, index = censusmi['county_city_id_gdf']).to_dict()

df_pivot['jurisd'] = (df_pivot['county_city_id'].map(county_city_id_to_mcdjuris_dict_df)).fillna(0).astype('int64')

gdfv['jurisd'] = (gdfv['county_city_id'].map(county_city_id_to_mcdjuris_dict_gdf)).fillna(0).astype('int64')

In [50]:
#confirm if columns match
print('juris in vest file not in election results - ', set(gdfv.jurisd) - set(df_pivot.jurisd))
print('juris in election results file not in vest - ', set(df_pivot.jurisd) - set(gdfv.jurisd))

juris in vest file not in election results -  {0}
juris in election results file not in vest -  {84120}


In [54]:
#create draft id to match gdfv (vest file), df_pivot (election results) and SOS shp that includes ward... 
df_pivot['NAME'] = df_pivot['county_fips'] + df_pivot['jurisd'].astype(str).str.zfill(5) + df_pivot['ward_number'].astype(str).str.zfill(2) + df_pivot['precinct_number'].astype(str).str.zfill(3)+df_pivot['precinct_label'].str.strip()
gdfv['NAME'] = gdfv['COUNTYFIPS'] + gdfv['jurisd'].astype(str).str.zfill(5) + gdfv['PRECINCTID'].str.slice(start = 13)

print('NAME unique value counts: \n shp: ', shp.NAME.nunique(), '\n elections df: ', df_pivot.NAME.nunique(), '\n vest gdf: ', gdfv.NAME.nunique())

#Observe poor match rate for this new draft id
print('# id values in shp not in elections df: ', len(set(shp.NAME) - set(df_pivot.NAME)))
print('# id values in shp not in vest gdf: ', len(set(shp.NAME) - set(gdfv.NAME)))
print('# id values in elections df not in shp: ', len(set(df_pivot.NAME) - set(shp.NAME)))
print('# id values in elections df not in vest gdf: ', len(set(df_pivot.NAME) - set(gdfv.NAME)))
print('# id values in vest gdf not in shp: ', len(set(gdfv.NAME) - set(shp.NAME)))
print('# id values in vest gdf not in elections df: ', len(set(gdfv.NAME) - set(df_pivot.NAME)))

#sum the difference in value counts for all ward #s not equal to zero
print('sum of ward# value count differences not equal to zero: ', )
#See that half of mismatch can likely be attributed to ward mismatch

print('# ids in df not containing AVCB not in shp id list: ', len(set(df_pivot['NAME'][df_pivot['NAME'].str.slice(start = -4)!='AVCB']) - set(shp.NAME)))

NAME unique value counts: 
 shp:  4749 
 elections df:  4889 
 vest gdf:  4756
# id values in shp not in elections df:  64
# id values in shp not in vest gdf:  61
# id values in elections df not in shp:  204
# id values in elections df not in vest gdf:  163
# id values in vest gdf not in shp:  68
# id values in vest gdf not in elections df:  30
sum of ward# value count differences not equal to zero: 
# ids in df not containing AVCB not in shp id list:  70


In [55]:
#Notice 134 values that are AVCBs that need to be redistributed:
df_pivot[df_pivot['NAME'].str.contains('AVCB')]

Unnamed: 0,index,county_code,city_town_code,ward_number,precinct_number,precinct_label,G20PRERTRU,G20PREDBID,G20PRENDEL,G20USSNDER,...,G20PREGHAW,G20USSGSQU,G20USSDPET,G20USSRJAM,county_name,county_fips,city_town_name,county_city_id,jurisd,NAME
3968,4001,82,57,0,1,AVCB,32.0,1463.0,0.0,7.0,...,2.0,6.0,1410.0,37.0,WAYNE,163,DETROIT CITY,163DETROIT CITY,22000,1632200000001AVCB
3969,4002,82,57,0,2,AVCB,131.0,1852.0,1.0,6.0,...,5.0,17.0,1793.0,132.0,WAYNE,163,DETROIT CITY,163DETROIT CITY,22000,1632200000002AVCB
3970,4003,82,57,0,3,AVCB,71.0,1806.0,0.0,4.0,...,2.0,6.0,1769.0,76.0,WAYNE,163,DETROIT CITY,163DETROIT CITY,22000,1632200000003AVCB
3971,4004,82,57,0,4,AVCB,72.0,1776.0,0.0,6.0,...,3.0,12.0,1720.0,76.0,WAYNE,163,DETROIT CITY,163DETROIT CITY,22000,1632200000004AVCB
3972,4005,82,57,0,5,AVCB,56.0,1808.0,0.0,6.0,...,4.0,15.0,1764.0,62.0,WAYNE,163,DETROIT CITY,163DETROIT CITY,22000,1632200000005AVCB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4097,4130,82,57,0,130,AVCB,4.0,263.0,1.0,1.0,...,2.0,1.0,259.0,3.0,WAYNE,163,DETROIT CITY,163DETROIT CITY,22000,1632200000130AVCB
4098,4131,82,57,0,131,AVCB,3.0,136.0,0.0,0.0,...,3.0,1.0,130.0,6.0,WAYNE,163,DETROIT CITY,163DETROIT CITY,22000,1632200000131AVCB
4099,4132,82,57,0,132,AVCB,3.0,112.0,0.0,0.0,...,2.0,2.0,107.0,5.0,WAYNE,163,DETROIT CITY,163DETROIT CITY,22000,1632200000132AVCB
4100,4133,82,57,0,133,AVCB,8.0,310.0,1.0,2.0,...,1.0,3.0,306.0,7.0,WAYNE,163,DETROIT CITY,163DETROIT CITY,22000,1632200000133AVCB


## AVCBs
From 2016 notes to apply to 2020 potentially:
1. Deal with non-Detroit (Wayne County) AVCBS - these votes need to be added directly to the precincts they correspond to
2. Deal with Detroit AVCBs - these need to be allocated based on a precincts share of votes within the AVCB it is covered by
3. Deal with statistical adjustments - these apply to all precincts and can be dealt with all at once
    - Just removed for 2020?
    
## Allocate Wayne AVCB's using 2020 csv file

Wayne: Detroit reports absentee votes via Absent Voter Counting Boards. The linkage file of boards to precincts is from [OpenElections)] (https://github.com/openelections/openelections-sources-mi/tree/master/2020), and votes were distributed to precincts proportionally by election day vote.

In [56]:
#Split up the AVCB, statistical adjustments, and precinct level votes
avcb = df_pivot[df_pivot['precinct_label'].map(lambda x: 'AVCB' in str(x))]
df_no_avcb = df_pivot[(df_pivot['precinct_label'].map(lambda x: 'AVCB' not in str(x)))]

#Make sure the bottom 3 numbers add up to the top number so every precinct is being accounted for
print('election result df size: ', df_pivot.shape)
print('avcb only from election results df size: ', avcb.shape)
print('election df no avcbs size: ', df_no_avcb.shape)
print('vest gdf size: ', gdfv.shape)

election result df size:  (4889, 23)
avcb only from election results df size:  (134, 23)
election df no avcbs size:  (4755, 23)
vest gdf size:  (4756, 19)


In [57]:
avcb_wayne = avcb[avcb["county_name"]=="WAYNE"]
avcb_not_wayne = avcb[avcb["county_name"]!="WAYNE"]

print(avcb.shape)
print(avcb_wayne.shape)
print(avcb_not_wayne.shape)

#Notice that all avcbs in 2020 file seem to be in WAYNE?

(134, 23)
(134, 23)
(0, 23)


In [58]:
#Read in open elections avcb file
avcb_csv = pd.read_csv('./raw_from_source/Detroit AVCBs by precinct (Nov. 2020).csv', names=['eday_prec', 'avcb_prec'])
avcb_csv_dict = dict(zip(avcb_csv['eday_prec'], avcb_csv['avcb_prec']))
print('open elections avcb df shape: ', avcb_csv.shape)

open elections avcb df shape:  (504, 2)


In [59]:
precincts_detroit = df_no_avcb[df_no_avcb["city_town_name"]=="DETROIT CITY"]
precincts_not_detroit = df_no_avcb[df_no_avcb["city_town_name"]!="DETROIT CITY"]
#Define a new column called avcb to store what avcb a precinct is covered by, set it to 0 to start
precincts_detroit["avcb"]=0

#Use the dictionary above, which maps from precinct number to avcb, to find the avcb for each precinct
precincts_detroit.loc[:,"avcb"] = precincts_detroit["precinct_number"].astype(str).map(avcb_csv_dict).fillna(precincts_detroit["avcb"])

#The number of avcbs that have been mapped to, should equal the number of avcbs (172)
print('# avcbs that have been mapped:', len(precincts_detroit["avcb"].unique()))

print('avcb_wayne shape: ', avcb_wayne.shape)

# avcbs that have been mapped: 134
avcb_wayne shape:  (134, 23)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  precincts_detroit["avcb"]=0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [60]:
#Checked that all rows were accounted in for in splitting the dataframe up

#What we started with
print('df_pivot from pre avcb manipulation shape: ', df_pivot.shape)

#Will be used as part of allocation
print('avcb only df shape: ' , avcb.shape)

#Where the allocated votes will be added to
print('not detroit precincts shape: ', precincts_not_detroit.shape)
print('detroit precincts shape: ', precincts_detroit.shape)
print('sum of parts: ', avcb.shape[0]+ precincts_not_detroit.shape[0]+ precincts_detroit.shape[0])

df_pivot from pre avcb manipulation shape:  (4889, 23)
avcb only df shape:  (134, 23)
not detroit precincts shape:  (4252, 23)
detroit precincts shape:  (503, 24)
sum of parts:  4889


In [61]:
avcb_wayne["avcb"] = avcb_wayne['precinct_number']

## Reallocate wayne county avcb votes
to_dole_out_totals = avcb_wayne.groupby(["avcb"]).sum()

precinct_specific_totals = precincts_detroit.groupby(["avcb"]).sum()
precinct_specific_totals.reset_index(inplace=True, drop=False)
to_dole_out_totals.reset_index(inplace=True, drop=False)

to_dole_out_totals["avcb"] = to_dole_out_totals["avcb"].astype(str)
precinct_specific_totals["avcb"] = precinct_specific_totals["avcb"].astype(str)

In [62]:
#Create some new columns for each of these races to deal with the allocation
for race in column_list:
    add_var = race+"_add"
    rem_var = race+"_rem"
    floor_var = race+"_floor"
    precincts_detroit.loc[:,add_var]=0.0
    precincts_detroit.loc[:,rem_var]=0.0
    precincts_detroit.loc[:,floor_var]=0.0
    
#Iterate over the rows
#Note this function iterates over the dataframe two times so the rounded vote totals match the totals to allocate

for index, row in precincts_detroit.iterrows():
    for race in column_list:
        add_var = race+"_add"
        rem_var = race+"_rem"
        floor_var = race+"_floor"
        #Grab the district
        county_id = row["avcb"]
        #Get the denominator for the allocation (the precinct vote totals)
        denom = precinct_specific_totals.loc[precinct_specific_totals["avcb"]==county_id][race]
        
        if county_id in list(to_dole_out_totals['avcb']):
            #Get one of the numerators, how many district-wide votes to allocate
            numer = to_dole_out_totals.loc[to_dole_out_totals["avcb"]==county_id][race]
            #Get the vote totals for this race in this precinct
            val = precincts_detroit.at[index,race]
            #Get the vote share, the precincts % of total precinct votes in the district times votes to allocate
            if ((float(denom)==0)):
                vote_share = 0
            else:
                vote_share = (float(val)/float(denom))*float(numer)
            precincts_detroit.at[index,add_var] = vote_share
            #Take the decimal remainder of the allocation
            precincts_detroit.at[index,rem_var] = vote_share%1
            #Take the floor of the allocation
            precincts_detroit.at[index,floor_var] = np.floor(vote_share)
            
#After the first pass through, get the sums of the races by district to assist in the rounding            
first_allocation = pd.DataFrame(precincts_detroit.groupby(["avcb"]).sum())
first_allocation.reset_index(inplace=True,drop=False)

#Now we want to iterate district by district to work on rounding
county_list = list(to_dole_out_totals["avcb"].unique()) 

#Iterate over the district
for county in county_list:
    for race in column_list:
        add_var = race+"_add"
        rem_var = race+"_rem"
        floor_var = race+"_floor"
        #County how many votes still need to be allocated (because we took the floor of all the initial allocations)
        to_go = int(np.round((int(to_dole_out_totals.loc[to_dole_out_totals["avcb"]==county][race])-int(first_allocation.loc[first_allocation["avcb"]==county,floor_var]))))
        #Grab the n precincts with the highest remainders and round these up, where n is the # of votes that still need to be allocated
        for index in precincts_detroit.loc[precincts_detroit["avcb"]==county][rem_var].nlargest(to_go).index:
            precincts_detroit.at[index,add_var] = np.ceil(precincts_detroit.at[index,add_var])
            
#Iterate over every race again
for race in column_list:
    add_var = race+"_add"
    #Round every allocation down to not add fractional votes
    precincts_detroit.loc[:,add_var]=np.floor(precincts_detroit.loc[:,add_var])
    precincts_detroit.loc[:,race]+=precincts_detroit.loc[:,add_var]
        
#Print out any instances where the allocation, as written, won't work
for index, row in precinct_specific_totals.iterrows():
    for race in column_list:
        if (row[race]==0):
            race_district = row["avcb"]
            to_allocate = int(to_dole_out_totals.loc[to_dole_out_totals["avcb"]==race_district,race])
            if (to_allocate != 0):
                print("Need to allocate "+str(to_allocate)+" vote(s) manually in " +str(race)+" avcb "+str(race_district))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Need to allocate 3 vote(s) manually in G20PRELJOR avcb 1
Need to allocate 2 vote(s) manually in G20PREGHAW avcb 10
Need to allocate 3 vote(s) manually in G20USSGSQU avcb 100
Need to allocate 1 vote(s) manually in G20USSNDER avcb 100
Need to allocate 1 vote(s) manually in G20USSTWIL avcb 100
Need to allocate 1 vote(s) manually in G20PRELJOR avcb 103
Need to allocate 1 vote(s) manually in G20USSGSQU avcb 103
Need to allocate 6 vote(s) manually in G20PREDBID avcb 105
Need to allocate 6 vote(s) manually in G20USSDPET avcb 105
Need to allocate 1 vote(s) manually in G20PREGHAW avcb 107
Need to allocate 1 vote(s) manually in G20PRENDEL avcb 107
Need to allocate 1 vote(s) manually in G20USSNDER avcb 107
Need to allocate 1 vote(s) manually in G20PREGHAW avcb 108
Need to allocate 1 vote(s) manually in G20USSGSQU avcb 108
Need to allocate 2 vote(s) manually in G20USSNDER avcb 108
Need to allocate 3 vote(s) manually in G20USSGSQU avcb 109
Need to allocate 1 vote(s) manually in G20USSTWIL avcb 109
