In [198]:
import geopandas as gp
import pandas as pd
from collections import Counter
import numpy as np
from zipfile import ZipFile
import zipfile
import os

# Alabama_Precincts_VF_06_15_23

## Background:
- Received a request for a block assignment file linking census blocks to Alabama's post-redistricting precincts.

## Approach:
- Load in a geocoded voterfile.
- Remove voters without a geocoded block or a precinct assignment, and those for which the county of the geocoded block does not match the county tied to their record (2,443 individuals).
- Prepend the 3-digit county FIPS code to the precinct name.
- Group the voterfile by block geoid20, recording all the different precinct names, and the number of times each occurs, among the individuals geocoded to a particular block.
- For every block, iterate over the precinct names and counts values and assign the "best_prec" to the precinct that occurs most frequently. Ties are broken by using the first occurring precinct.
- Add in remaining block geoid20s for the state, fill other fields for these blocks as "NO VOTERS IN VOTERFILE".
- Clean column names and export to csv.

## Links to datasets used:
- L2 Individual Level Voter File for Alabama (2023)
- [Alabama Block-Level PL File](https://redistrictingdatahub.org/dataset/alabama-block-pl-94171-2020-by-table/)

Contact info@redistrictingdatahub.org with any questions. Individual level voter file data cannot be provided.

In [190]:
# Machine-specific absolute paths to the two inputs; edit these before re-running elsewhere.
# VF_PATH: gzip-compressed voter file CSV (loaded with compression='gzip').
VF_PATH = "/Users/peterhorton/Downloads/AL_individual_file000.gz"
# BLOCK_PATH: block-level CSV whose GEOCODE column supplies the full list of block identifiers.
BLOCK_PATH = "/Users/peterhorton/Documents/RDH/raw_data/census/2020_PL_csv/al_pl2020_b/al_pl2020_b.csv"

## Load and prepare voterfile data

In [192]:
# Read in Voter File. geoid20 is forced to string so block identifiers are never
# partially parsed as integers (the column mixes numeric-looking ids with the
# "NO ASSIGNMENT" marker); an integer parse would drop the leading zero and break
# both the county slice below and the later merge on geoid20.
alabama_vf = pd.read_csv(VF_PATH, compression='gzip', dtype={"geoid20": str})

# Filter out individuals without a geoid or precinct assignment, printing the
# shape of each dropped subset as a record of how many rows were removed
no_geoid = alabama_vf["geoid20"] == "NO ASSIGNMENT"
print(alabama_vf[no_geoid].shape)
alabama_vf = alabama_vf[~no_geoid]

no_precinct = alabama_vf["precinct"].isna()
print(alabama_vf[no_precinct].shape)
# .copy() so the column assignments below act on an independent frame rather
# than on a filtered slice (avoids SettingWithCopyWarning)
alabama_vf = alabama_vf[~no_precinct].copy()

# Get the county assigned from the geoid (characters 3-5 of the block id)
alabama_vf["Assigned_CNTY"] = alabama_vf["geoid20"].astype(str).str[2:5]

# Clean the county data from the voter file itself (zero-pad to 3 characters)
alabama_vf["co_fips"] = alabama_vf["co_fips"].astype(str).str.zfill(3)

# Remove individuals for whom these county assignments don't match (good sign that the geocode isn't accurate)
county_matches = alabama_vf["Assigned_CNTY"] == alabama_vf["co_fips"]
alabama_vf = alabama_vf[county_matches].copy()

# Prefix the precinct name with the county so precinct names are distinguished by county
alabama_vf["precinct"] = alabama_vf["Assigned_CNTY"] + "-" + alabama_vf["precinct"]

(33356, 13)
(6291, 13)


## Go from individual level to block level

In [193]:
def best_assign(counter):
    '''
    Return the first key with the highest (positive) count in a counter.

    counter: mapping of key -> count (e.g. a collections.Counter); iterated in
        its own order, so ties go to the key that appears first.
    Returns the winning key, or "" when the mapping is empty or has no
    positive counts.
    '''
    max_val = 0
    most = ""
    for key, val in counter.items():
        # Strict ">" keeps the earliest key when several share the top count
        if val > max_val:
            max_val = val
            most = key
    return most

# Collapse the voter file to one row per block: for every geoid20, tally how many
# of its voters list each precinct (stored as a Counter in the 'counter' column)
alabama_blocks = (
    alabama_vf
    .groupby("geoid20")["precinct"]
    .agg(Counter)
    .reset_index(name='counter')
)

# Number of distinct precincts seen among the voters of each block
alabama_blocks["num_precs"] = alabama_blocks["counter"].apply(len)

# Pick one precinct per block by handing that block's tally to best_assign
alabama_blocks["best_prec"] = alabama_blocks["counter"].apply(best_assign)

## Add remaining blocks for which there is no data

In [194]:
# Load in block csv to add in remaining blocks; only the GEOCODE column is used,
# so only that column is read
al_blocks = pd.read_csv(BLOCK_PATH, dtype = str, usecols = ["GEOCODE"])

# Zero-pad the identifier to 15 characters and rename it to match the voterfile key
al_blocks = (
    al_blocks
    .assign(GEOCODE = al_blocks["GEOCODE"].astype(str).str.zfill(15))
    .rename(columns = {"GEOCODE":"geoid20"})
)

# Left-merge so every block in the state is kept, order the columns, label the
# blocks that had no voters, and rename columns for export
# NOTE(review): the left merge leaves num_precs missing for unmatched blocks, which
# presumably makes it float so counts export as e.g. "1.0" - confirm if integers are wanted
joined = (
    pd.merge(al_blocks, alabama_blocks, how = "left", on = "geoid20")
    [["geoid20","best_prec","num_precs","counter"]]
    .fillna("NO VOTERS IN VOTERFILE")
    .rename(columns = {"num_precs":"unq_precs", "counter":"prec_count"})
)

# Export to csv, creating the output folder first so a fresh checkout does not fail
OUTPUT_CSV = "./al_20230403_vf_precs/al_20230403_vf_precs.csv"
os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok = True)
joined.to_csv(OUTPUT_CSV, index = False)

## Zip the File

In [202]:
def zip_folder(name, readme_path, list_of_file_paths):
    '''
    Write a README plus a list of files into a new deflate-compressed zip
    archive, then print the README's contents.

    name: path of the zip archive to create (opened in 'w' mode).
    readme_path: path to the README; stored under its path relative to its own
        resolved directory (normally just its file name).
    list_of_file_paths: non-empty list of files to add; each is stored under its
        path relative to the resolved directory of the first file in the list.
    '''
    print(name)
    # Archive member names are made relative to the folder holding the first file
    root = os.path.dirname(os.path.realpath(list_of_file_paths[0]))
    print(root)
    readme_root = os.path.dirname(os.path.realpath(readme_path))

    # Context manager guarantees the archive is finalized and closed even if a write fails
    with ZipFile(name, 'w', zipfile.ZIP_DEFLATED) as archive:
        archive.write(readme_path, os.path.relpath(readme_path, readme_root))
        for file_path in list_of_file_paths:
            archive.write(file_path, os.path.relpath(file_path, root))

    # Echo the README so the notebook output records what was shipped
    with open(readme_path, 'r') as readme_file:
        print(readme_file.read())
    
# Bundle the csv and its README into the deliverable zip
zip_folder(
    name="al_20230403_vf_precs.zip",
    readme_path="./al_20230403_vf_precs/README.txt",
    list_of_file_paths=["./al_20230403_vf_precs/al_20230403_vf_precs.csv"],
)

al_20230403_vf_precs.zip
/Users/peterhorton/Documents/RDH/Support/Processing-Requests/Alabama_Precincts_VF_06_15_23/al_20230403_vf_precs
Alabama 2022 Precinct Estimates Using April 2023 Voterfile

## RDH Date Retrieval
Voterfile from 04/03/2023 Processed June 2023

## Sources
L2

## Fields:
Field Name Description                                                                               
geoid20    Block Identifier                                                                          
best_prec  Best precinct assignment                                                                  
unq_precs  Number of unique precincts for which voters geocoded to the block have listed             
prec_count Count of voters listed for each precinct in which a person has been geocoded to this block

## Processing Steps
Visit the RDH GitHub and the processing script for this code [here](https://github.com/nonpartisan-redistricting-datahub/Processing-Requests)

Load in a geocoded voterfile.
Remo