In [1]:
import os
import pandas as pd
import geopandas as gp
from collections import Counter
# Raw inputs (voter file, block shapefile, VTD block assignment file) are read
# from a 'raw-from-source' folder inside the current working directory.
cwd = os.getcwd()
wd = os.path.join(cwd, 'raw-from-source')

# Arkansas_2022_Primary_Turnout_2020_VTD_02_03_2023

## Background:
- We received a request for voter registration by party and voter turnout by primary contest (Republican and Democratic) for the 2022 Primary Election on May 24th, 2022. 

## Approach:
- Load in individual level L2 voter file data for Arkansas, block shapefile, and VTD block assignment file.
- Make point shapefile out of lat,long fields in voter file and spatial join with block shapefile.
- Clean voter file block assignments for those that receive multiple assignments (falling on a line between two or more blocks) by selecting the first block in ascending order.
- Join VTD BAF with individual level voter file data using block assignment.
- Create a pivot table grouping by VTD assignment for party registration to create fields containing counts for each party
- Repeat the pivot table process for ballot type in the 2022 primary.
- Join the two pivot tables together and clean by renaming columns.
- Provide statistics on the number of voters dropped in the aggregated file because they lack lat,long coordinates. 
- Export the file as a CSV.

## Links to datasets used:
- L2 Individual Level Voter File for Arkansas 2022 Primary
- [VTD Block Assignment file from the US Census](https://www2.census.gov/geo/docs/maps-data/data/baf2020/BlockAssign_ST05_AR.zip)
- [Arkansas Block boundaries (2020)](https://redistrictingdatahub.org/dataset/arkansas-block-boundaries-2020/)

For a 'raw-from-source' file containing block shapefile and VTD BAF, contact info@redistrictingdatahub.org. Individual level voter file data cannot be provided.

Read in individual level voter file data and create GEOID field

In [2]:
# Load the individual-level voter file from the raw-data folder.
df = pd.read_csv(os.path.join(wd, 'ar_data.csv'))
# Build a block GEOID string: left-pad geoid20 with zeros to 13 characters and
# prepend '05' (the state code used in the Arkansas file names, e.g. ST05_AR).
df['L2GEOID'] = df['geoid20'].map(lambda raw: f"05{str(raw).zfill(13)}")

Read in block level shapefile for Arkansas 

In [3]:
# Read the 2020 block shapefile and keep only the fields used downstream:
# county code, block GEOID and the polygon geometry.
block_cols = ['COUNTYFP20', 'GEOID20', 'geometry']
b = gp.read_file(os.path.join(wd, 'ar_b_2020_bound.shp'))[block_cols]
b.head()

Unnamed: 0,COUNTYFP20,GEOID20,geometry
0,51,50510117021003,"POLYGON ((-93.05256 34.48813, -93.05255 34.488..."
1,51,50510120011005,"POLYGON ((-93.00858 34.68780, -93.00845 34.687..."
2,99,50990902001031,"POLYGON ((-93.38250 33.80097, -93.38165 33.801..."
3,51,50510108002011,"POLYGON ((-93.07098 34.51329, -93.07096 34.513..."
4,51,50510110021007,"POLYGON ((-93.14066 34.52243, -93.14063 34.522..."


Read in block to VTD assignment file and make unique field that combines county and VTD

In [4]:
# Read every column as text. FIPS-style identifiers carry meaningful leading
# zeros, and letting pandas infer integer dtypes would strip them (and would make
# the string concatenation below fail if DISTRICT happened to be all-numeric).
baf = pd.read_csv(os.path.join(wd,'BlockAssign_ST05_AR_VTD.txt'),delimiter='|',dtype=str)
# Zero-padding is kept as a safeguard so block ids are always 15 characters and
# county codes always 3, matching the keys used in the block shapefile.
baf['GEOID20'] = baf['BLOCKID'].str.zfill(15)
baf['COUNTYFP20'] = baf['COUNTYFP'].str.zfill(3)
# VTD codes are combined with the county code to form a single key,
# formatted as '<county>-<district>'.
baf['VTD20'] = baf['COUNTYFP20']+'-'+baf['DISTRICT']
baf = baf[['GEOID20','VTD20']]
baf.head()

Unnamed: 0,GEOID20,VTD20
0,50014801001000,001-000013
1,50014801001001,001-000013
2,50014801001002,001-000013
3,50014801001003,001-000013
4,50014801001004,001-000013


Make point shapefile out of lat,long fields in individual level voter file and then join block shapefile

In [5]:
# Coordinates must be numeric before they can be turned into point geometries.
for coord in ('lat', 'lon'):
    df[coord] = df[coord].astype(float)

# Build one point per voter. The points are simply labelled with the block
# layer's CRS (no reprojection is performed) -- this assumes the voter file's
# lat/lon are already expressed in that coordinate system; TODO confirm.
voter_geom = gp.points_from_xy(df['lon'], df['lat'], crs=b.crs)
points = gp.GeoDataFrame(df, geometry=voter_geom)

# Left join keeps every voter row, including those that match no block.
# A voter matching several blocks yields one row per matched block.
unused_cols = ['lat', 'lon', 'state', 'geoid20', 'index_right']
joined = points.sjoin(b, how='left').drop(columns=unused_cols)

Determine the number of voters who receive multiple block assignments (e.g. falling exactly on a line between two or more blocks)

In [6]:
# Tally how many spatial-join rows each voter id produced.
d = Counter(joined['lalvoterid'])
# A voter id appearing on more than one row matched more than one block.
dup_ids = [voter_id for voter_id, n_rows in d.items() if n_rows != 1]
print('Number of voters with duplicate assignments: ', len(dup_ids))

Number of voters with duplicate assignments:  324


Determine how many voters could potentially be aggregated to different VTDs (e.g. they have multiple block assignments and one or more of those blocks is in a different VTD)

In [7]:
# All spatial-join rows belonging to voters that matched more than one block.
dups = joined[joined['lalvoterid'].isin(dup_ids)]

# voter id -> list of every block GEOID that voter matched.
# A single groupby pass replaces re-filtering `dups` once per voter;
# sort=False keeps voter ids in order of first appearance.
dups_dict = {
    voter_id: blocks.tolist()
    for voter_id, blocks in dups.groupby('lalvoterid', sort=False)['GEOID20']
}

# Narrow the BAF to the candidate blocks once, so the per-voter lookup below
# scans a handful of rows instead of the whole statewide assignment file.
candidate_blocks = {block for blocks in dups_dict.values() for block in blocks}
baf_candidates = baf[baf['GEOID20'].isin(candidate_blocks)]

# Count voters whose candidate blocks map to more than one distinct VTD.
counter = 0
for v in dups_dict.values():
    hold = set(baf_candidates.loc[baf_candidates['GEOID20'].isin(v), 'VTD20'])
    if len(hold) > 1:
        counter += 1

print('Number of voters who could be aggregated to a different VTD: ', str(counter))

Number of voters who could be aggregated to a different VTD:  33


Assign voters to the block that comes first when sorted (for the purposes of replicability and standardization) 

In [8]:
# Order rows by block GEOID so that, for a voter matched to several blocks,
# the lowest GEOID comes first.
joined = joined.sort_values(by='GEOID20')
# Collapse to one row per voter. Note that GroupBy.first() takes the first
# non-null value of each column within a group; the result is indexed by
# lalvoterid.
no_dups = joined.groupby('lalvoterid').first()

Verify that all voters with at least one block assignment are kept in the file

In [9]:
# Total number of spatial-join rows that belong to multi-block voters.
tot = sum(len(blocks) for blocks in dups_dict.values())
# Rows that disappeared when collapsing to one row per voter.
rows_removed = len(joined) - len(no_dups)
# If only the surplus rows were removed, what remains of `tot` is exactly one
# row per multi-block voter.
print('Are all voters with at least one block assignment maintained? ', str(tot - rows_removed == len(dup_ids)))

Are all voters with at least one block assignment maintained?  True


Clean the voter file and merge with the VTD block assignment file

In [10]:
# Move lalvoterid out of the index and back into a regular column.
no_dups = no_dups.reset_index(drop=False)
# Only party, primary ballot type and the assigned block are needed from here on.
keep_cols = ['parties_description', 'ball_type', 'GEOID20']
no_dups_cleaned = no_dups[keep_cols]
# Inner join: voters whose block is missing from the BAF (including voters
# with no block assignment at all) are dropped at this step.
merged = no_dups_cleaned.merge(baf, on='GEOID20', how='inner')

Create pivot table for ballot type selected in the primary (REP or DEM)

In [11]:
# Give every voter row a weight of 1 so that summing yields head counts.
merged['counts'] = 1
# Rows with a missing ballot type are labelled so they land in their own
# pivot column (dropped again below) instead of being silently excluded.
merged['ball_type'] = merged['ball_type'].fillna('No Primary Vote')

# One row per VTD, one column per ballot type. The aggregation is named with
# the string 'sum' because passing the Python builtin `sum` is deprecated in
# recent pandas releases. VTD/ballot-type combinations with no voters come back
# as NaN and are filled with 0.
ball_type = (
    merged.pivot_table(index='VTD20', columns=['ball_type'], values=['counts'], aggfunc='sum')
    .fillna(0)
    .reset_index(drop=False)
)
# Drop the outer 'counts'/'VTD20' level; the VTD column is left with an empty
# label, which the rename below restores by position.
ball_type.columns = ball_type.columns.droplevel(0)
ball_type = ball_type.rename(columns = {ball_type.columns[0]:"VTD20",'D':'DEM_PRIM_VOTE22','R':'REP_PRIM_VOTE22','O':'OTH_PRIM_VOTE22'})
ball_type = ball_type.drop(columns = 'No Primary Vote')
ball_type.head()

ball_type,VTD20,DEM_PRIM_VOTE22,OTH_PRIM_VOTE22,REP_PRIM_VOTE22
0,001-000011,3.0,0.0,25.0
1,001-000012,2.0,0.0,17.0
2,001-000013,20.0,0.0,129.0
3,001-000014,2.0,0.0,50.0
4,001-000015,3.0,0.0,44.0


Create pivot table for modeled registered voters

In [12]:
# One row per VTD, one column per party description, summing the per-voter
# 'counts' column of ones already present on `merged`. The aggregation is named
# with the string 'sum' because passing the Python builtin `sum` is deprecated
# in recent pandas releases. Empty VTD/party combinations are filled with 0.
parties = (
    merged.pivot_table(index='VTD20', columns=['parties_description'], values=['counts'], aggfunc='sum')
    .fillna(0)
    .reset_index(drop=False)
)
# Drop the outer 'counts'/'VTD20' level; the VTD column is left with an empty
# label, which the rename below restores by position.
parties.columns = parties.columns.droplevel(0)
parties = parties.rename(columns = {parties.columns[0]:"VTD20",'Democratic':'DEM_PRIM_REG22','Republican':'REP_PRIM_REG22','Green':'GRE_PRIM_REG22','Libertarian':'LIB_PRIM_REG22','Non-Partisan':'NON_PRIM_REG22','Reform':'REF_PRIM_REG22'})
parties.head()

parties_description,VTD20,DEM_PRIM_REG22,GRE_PRIM_REG22,LIB_PRIM_REG22,NON_PRIM_REG22,REF_PRIM_REG22,REP_PRIM_REG22
0,001-000011,9.0,0.0,0.0,17.0,0.0,27.0
1,001-000012,13.0,0.0,0.0,17.0,0.0,25.0
2,001-000013,141.0,0.0,0.0,108.0,0.0,173.0
3,001-000014,21.0,0.0,0.0,43.0,0.0,78.0
4,001-000015,22.0,0.0,0.0,39.0,0.0,68.0


Join the modeled party and ballot type dataframes together

In [13]:
# Outer join with an indicator column so any VTD present in only one of the
# two tables is detectable.
joined_vtd = ball_type.merge(parties, on='VTD20', how='outer', indicator=True)
unmatched = joined_vtd[joined_vtd['_merge'] != 'both']
print('VTD join successful? ', str(len(unmatched) == 0))
# The indicator has served its purpose; keep it out of the exported table.
joined_vtd = joined_vtd.drop(columns='_merge')
joined_vtd.head()

VTD join successful?  True


Unnamed: 0,VTD20,DEM_PRIM_VOTE22,OTH_PRIM_VOTE22,REP_PRIM_VOTE22,DEM_PRIM_REG22,GRE_PRIM_REG22,LIB_PRIM_REG22,NON_PRIM_REG22,REF_PRIM_REG22,REP_PRIM_REG22
0,001-000011,3.0,0.0,25.0,9.0,0.0,0.0,17.0,0.0,27.0
1,001-000012,2.0,0.0,17.0,13.0,0.0,0.0,17.0,0.0,25.0
2,001-000013,20.0,0.0,129.0,141.0,0.0,0.0,108.0,0.0,173.0
3,001-000014,2.0,0.0,50.0,21.0,0.0,0.0,43.0,0.0,78.0
4,001-000015,3.0,0.0,44.0,22.0,0.0,0.0,39.0,0.0,68.0


In [14]:
# Compare the voter file's row count with the rows that survived the join to VTDs.
n_l2_voters = len(df)
n_aggregated = len(merged)
print('Original number of registered voters in L2 file: ', n_l2_voters)
print('Number of registered voters that get aggregated to VTDs: ', n_aggregated)
print('Number of registered voters in L2 file that did not get aggregated to VTDs: ', n_l2_voters - n_aggregated)

Original number of registered voters in L2 file:  1593326
Number of registered voters that get aggregated to VTDs:  1574512
Number of registered voters in L2 file that did not get aggregated to VTDs:  18814


In [15]:
# Rows in the voter file with ballot type 'R'.
original_rep_prim_voters = int(df['ball_type'].eq('R').sum())
# Total of the per-VTD 'R' ballot counts after aggregation.
rep_voters_agg = joined_vtd['REP_PRIM_VOTE22'].sum()

In [16]:
# Report how many 'R' ballot voters were lost between the voter file and the VTD table.
rep_not_aggregated = original_rep_prim_voters - rep_voters_agg
print('Original number of Republican primary voters in L2 file: ', original_rep_prim_voters)
print('Number of Republican primary voters that get aggregated to VTDs: ', rep_voters_agg)
print('Number of Republican primary voters in L2 file that did not get aggregated to VTDs: ', rep_not_aggregated)

Original number of Republican primary voters in L2 file:  343639
Number of Republican primary voters that get aggregated to VTDs:  338999.0
Number of Republican primary voters in L2 file that did not get aggregated to VTDs:  4640.0


In [17]:
# Rows in the voter file with ballot type 'D'.
original_dem_prim_voters = int(df['ball_type'].eq('D').sum())
# Total of the per-VTD 'D' ballot counts after aggregation.
dem_voters_agg = joined_vtd['DEM_PRIM_VOTE22'].sum()

In [18]:
# Report how many 'D' ballot voters were lost between the voter file and the VTD table.
# (The first label previously misspelled "Democratic".)
print('Original number of Democratic primary voters in L2 file: ', str(original_dem_prim_voters))
print('Number of Democratic primary voters that get aggregated to VTDs: ', str(dem_voters_agg))
print('Number of Democratic primary voters in L2 file that did not get aggregated to VTDs: ', str(original_dem_prim_voters-dem_voters_agg))

Original number of Democractic primary voters in L2 file:  94054
Number of Democratic primary voters that get aggregated to VTDs:  93269.0
Number of Democratic primary voters in L2 file that did not get aggregated to VTDs:  785.0


Extract data as a CSV

In [19]:
# Write the VTD-level table to the current directory, without the row index.
output_csv = './ar_prim_turnout_2022_2020_vtd.csv'
joined_vtd.to_csv(output_csv, index=False)