In [1]:
import pandas as pd
import geopandas as gp
import numpy as np 
import os
import fiona
from statistics import mean, median
import string
import re
from collections import Counter
from matplotlib.lines import Line2D

In [2]:
# Allow up to 500 columns to be shown when a DataFrame is displayed
pd.set_option("display.max_columns", 500)

In [3]:
'''Following Docstring Convention: https://www.python.org/dev/peps/pep-0257/'''

import pandas as pd
import os
# Per the original note: these settings are essential for the fields table
# to show up correctly in the readme, so all three display limits are lifted.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None

def create_fields_table(fields_dict):
    '''Build the two-column fields table used in the readme.

    fields_dict = mapping whose keys are field names and whose values are the field descriptions
    Returns a DataFrame with a 'Field Name' and a 'Description' column, one row per entry,
    in the iteration order of fields_dict.
    '''
    field_names = [*fields_dict.keys()]
    descriptions = [*fields_dict.values()]
    return pd.DataFrame({'Field Name': field_names, 'Description': descriptions})

def erj_readme_template(fields_dict, stateabrv, state, year, election_type, additional_races, retrieval_date, upload_date, vest_file_link,
                        raw_data_source, state_erj_repo, office_codes, additional_notes=' '):
    '''Assemble the complete readme text for an ERJ file and return it as one string.

    The readme is built from three parts: a generalized header, the fields table
    (one row per entry of fields_dict) and a closing section with processing notes.

    erj_readme_template variable explanations:
    fields_dict = used to create the fields table for the non-standardized/race fields.
                Key is the field/value is the field description
    stateabrv = two character state abbreviation capitalized,
    state = state name, first letter capitalized,
    year = election year (XXXX),
    election_type = general, primary, special or runoff,
    additional_races = the races that RDH added to the original vest file (not in VEST's og file),
    retrieval_date = date RDH retrieved VEST file,
    upload_date = date RDH uploads ERJ file,
    vest_file_link = link to dataverse page for VEST's precinct boundary and election results file,
    raw_data_source = site description and link,
    state_erj_repo = link to erj github repository for given state
    office_codes = codes used broken off of field names for easy viewing
    additional_notes = defaults to a single space, but fill in with string where applicable.

    Returns the full readme as a str.
    '''
    # Generalized readme text. The template is written line by line so that
    # trailing whitespace is explicit. It ends right after the "## Fields:" line:
    # any indentation left here would be glued in front of the table's header row
    # and push it out of line with the rows below it.
    readme_p1 = (
        "{year} {stateabrv} {election_type} election results\n"
        "## RDH Date Retrieval\n"
        "{retrieval_date}\n"
        "## RDH Upload Date\n"
        "{upload_date}\n"
        "## Sources\n"
        "The RDH retrieved the VEST {year} {election_type} precinct boundary and election results shapefile from [VEST's Harvard Dataverse]({vest_file_link})\n"
        "The RDH retrieved raw {year} {election_type} election results from {raw_data_source}\n"
        "## Notes on Field Names (adapted from VEST):\n"
        "Columns reporting votes generally follow the pattern: \n"
        "One example is:\n"
        "G16PREDCLI\n"
        "The first character is G for a general election, P for a primary, S for a special, and R for a runoff.\n"
        "Characters 2 and 3 are the year of the election.*\n"
        "Characters 4-6 represent the office type (see list below).\n"
        "Character 7 represents the party of the candidate.\n"
        "Characters 8-10 are the first three letters of the candidate's last name.\n"
        "To fit within the GIS 10 character limit for field names, the naming convention is slightly different for the State Legislature and \n"
        "US House of Representatives. All fields are listed below with definitions.\n"
        "Office Codes Used:\n"
        "{office_codes}\n"
        "## Fields:\n"
    ).format(stateabrv=stateabrv, year=year, election_type=election_type, retrieval_date=retrieval_date,
             upload_date=upload_date, vest_file_link=vest_file_link, raw_data_source=raw_data_source,
             office_codes=office_codes)

    # Plain-text rendering of the fields table, without the DataFrame index
    fields_table = create_fields_table(fields_dict)
    readme_p2 = str(fields_table.to_string(index=False))

    # Closing section; the leading newline terminates the last row of the table
    readme_p3 = (
        "\n"
        "    \n"
        "## Processing Steps\n"
        "    \n"
        "The RDH joined additional election results to VEST's existing precinct shapefile, including {additional_races} using Python.\n"
        "For more information on the processing completed, visit our [Github repository]({state_erj_repo}) for Election Result Joins (ERJ) for {state}.\n"
        "## Additional Notes\n"
        "{additional_notes}\n"
        "Please contact info@redistrictingdatahub.org for more information.\n"
        "    "
    ).format(state=state, additional_races=additional_races, state_erj_repo=state_erj_repo,
             additional_notes=additional_notes)

    full_readme = readme_p1 + readme_p2 + readme_p3
    return full_readme

def export_readme(readme_name, state, election_type, full_readme_text):
    '''Write the readme text to disk and print a confirmation line.

    readme_name must include file path to readme within erj folder
    ex:
    readme_name = './az_gen_20_prec/readme.txt'

    state and election_type are only used in the printed confirmation.
    An existing file at readme_name is overwritten.
    '''
    # 'w' truncates an existing file, so re-running the notebook does not stop
    # with FileExistsError the way exclusive-create mode ('x') does.
    # The encoding is pinned so non-ASCII names are written the same way on every platform.
    with open(readme_name, 'w', encoding='utf-8') as tf:
        tf.write(full_readme_text)
    print(state, election_type, " readme moved to folder")

In [4]:
def export_readme(readme_name, state, election_type, full_readme_text):
    '''Write the readme text to disk, replacing any earlier copy, and print a confirmation line.

    readme_name must include file path to readme within erj folder
    ex:
    readme_name = './az_gen_20_prec/readme.txt'

    state and election_type are only used in the printed confirmation.
    '''
    # Opening with 'w' truncates an existing file in a single step; deleting the
    # old file first and then creating it exclusively could leave no readme at all
    # if the write failed after the delete.
    # The encoding is pinned so non-ASCII names are written the same way on every platform.
    with open(readme_name, 'w', encoding='utf-8') as tf:
        tf.write(full_readme_text)
    print(state, election_type, " readme moved to folder")

In [5]:
# Empty field name -> description mapping, filled in by the cells below
fields_dict = dict()

In [8]:
import os

# Reset the mapping first so nothing from an earlier run survives
fields_dict = {}

# Pair every value of the "Field_Name" column with the "Cand_Name" value of the same row
field_csv = pd.read_csv("./primary_fields_dict.csv")
fields_dict = {
    field_name: cand_name
    for field_name, cand_name in zip(field_csv["Field_Name"], field_csv["Cand_Name"])
}

In [9]:
# Display the mapping (field name -> candidate name) to check it by eye
fields_dict

{'P20PREDBEN': 'Michael Bennet',
 'P20PREDBID': 'Joseph R. Biden',
 'P20PREDBLO': 'Michael R. Bloomberg',
 'P20PREDBUT': 'Pete Buttigieg',
 'P20PREDDEL': 'John K. Delaney',
 'P20PREDGAB': 'Tulsi Gabbard',
 'P20PREDKLO': 'Amy Klobuchar',
 'P20PREDPAT': 'Deval Patrick',
 'P20PREDSAN': 'Bernie Sanders',
 'P20PREDSTE': 'Tom Steyer',
 'P20PREDWAR': 'Elizabeth Warren',
 'P20PREDYAN': 'Andrew Yang',
 'P20PRERTRU': 'Donald J. Trump (I)',
 'P20PSCDBLA': 'Daniel Blackman',
 'P20PSCDBRY': 'Robert G. Bryant',
 'P20PSCDNOE': 'John Noel',
 'P20PSCRMCD': 'Lauren Bubba McDonald, Jr. (I)',
 'P20PSCRSHA': 'Jason Shaw (I)',
 'P20USSDAMI': 'Sarah Riggs Amico',
 'P20USSDDEJ': 'Marckeith DeJesus',
 'P20USSDKNO': 'James Knox',
 'P20USSDMCC': 'Tricia Carpenter McCracken',
 'P20USSDOSS': 'Jon Ossoff',
 'P20USSDSMI': 'Maya Dillard Smith',
 'P20USSDTOM': 'Teresa Pike Tomlinson',
 'P20USSRPER': 'David A. Perdue (I)',
 'PCON01DGRI': 'Joyce Marie Griggs',
 'PCON01DRIN': 'Lisa M. Ring',
 'PCON01DSEI': 'Barbara Seidm

In [10]:
# Descriptions for the identifier / geography columns, added on top of the
# entries already present in fields_dict
fields_dict.update({
    "UNIQUE_ID": 'Unique Precinct Identifier',
    'COUNTYFP': 'County FIPs Code',
    'DISTRICT': 'Precinct ID Combining FIPs and Identifier',
    'CTYSOSID': 'Precinct ID Combining FIPs and Identifier',
    'PRECINCT_I': 'Precinct Identifier',
    'PRECINCT_N': 'Precinct Name',
    'CTYNAME': 'County Name',
    'CTYNUMBER': 'County Number',
    'CTYNUMBER2': 'County Number',
    'FIPS2': 'County FIPs Code',
    'CONG_DIST': 'Congressional District',
    'SLDL_DIST': 'State House District',
    'SLDU_DIST': 'State Senate District',
})

In [12]:
# Values substituted into the readme template (see erj_readme_template's docstring
# for the meaning of each one)
stateabrv = "GA"
state = "Georgia"
year = "2020"
election_type = "primary / primary runoff"
additional_races = '''President, U.S. Senate, Public Service Commission, U.S. House, State House, State Senate'''
retrieval_date = "01/24/2022"
upload_date = "06/03/2022"
vest_file_link = "https://dataverse.harvard.edu/file.xhtml?fileId=4863164&version=35.0"
# Adjacent literals are concatenated into one string; every piece carries its own
# trailing space so the sentence breaks stay visible.
raw_data_source = (
    "Election results from the Georgia Secretary of State Elections Division "
    "(https://sos.ga.gov/index.php/Elections/current_and_past_elections_results) "
    "and precinct shapefiles from VEST's Georgia 2020 file for all counties but Bibb County, "
    "where a shapefile from their GIS office is used "
    "(https://www.arcgis.com/home/item.html?id=a7f37f9db50d4948b147144e4ac46f6b). "
    "Clayton County uses a portion of the 2018 redistricting shapefile from the "
    "Georgia General Assembly Reapportionment Office (https://www.legis.ga.gov/joint-office/reapportionment). "
    "Sources used to explain precinct shapefile changes are referenced in the Jupyter notebook on Github."
)
state_erj_repo = "https://github.com/nonpartisan-redistricting-datahub/erj-ga"
office_codes = '''
PRE - President of the United States
USS - U.S. Senate
CON - U.S. House
PSC - Public Service Commission
CON## - U.S. Congress
SU##  - State Legislative Upper
SL###  - State Legislative Lower
'''

additional_notes = '''
~all files~

'''


In [13]:
# Fill the readme template with the values defined in the cells above
full_readme = erj_readme_template(
    fields_dict=fields_dict,
    stateabrv=stateabrv,
    state=state,
    year=year,
    election_type=election_type,
    additional_races=additional_races,
    retrieval_date=retrieval_date,
    upload_date=upload_date,
    vest_file_link=vest_file_link,
    raw_data_source=raw_data_source,
    state_erj_repo=state_erj_repo,
    office_codes=office_codes,
    additional_notes=additional_notes,
)

In [15]:
# Write the assembled readme into the Georgia primary ERJ folder
export_readme(
    readme_name="./ga_prim_20_prec/README.txt",
    state="Georgia",
    election_type="primary",
    full_readme_text=full_readme,
)

Georgia primary  readme moved to folder


In [9]:
from zipfile import ZipFile

# Bundle every entry of the ERJ folder into a zip archive, one write per entry.
# NOTE(review): this cell targets "oh_gen_20_prec" while the rest of the notebook
# works on "ga_prim_20_prec" - confirm this is the folder that should be zipped.
with ZipFile("oh_gen_20_prec.zip", mode="w") as archive:
    for entry in os.listdir("./oh_gen_20_prec/"):
        archive.write("./oh_gen_20_prec/" + entry)


In [16]:
import geopandas as gp
import pandas as pd
import os
import re

In [17]:
# District assignment column -> prefix used for that district type in vote column names
level_race_name_dict = dict(
    CONG_DIST="CON",
    SLDL_DIST="SL",
    SLDU_DIST="SU",
)

In [20]:
import geopandas as gp
import pandas as pd
import os
import re
# District assignment column -> prefix used for that district type in vote column names
level_race_name_dict = dict(
    CONG_DIST="CON",
    SLDL_DIST="SL",
    SLDU_DIST="SU",
)
def district_assignment_errors(election_results_df, level):
    '''
    This function will check whether the votes in a precinct match the district assignment

    election_results_df : DataFrame holding a "UNIQUE_ID" column, the `level` column and the vote columns
    level : name of the district assignment column; must be a key of level_race_name_dict

    Returns True if at least one mismatch was found (each one is printed), False otherwise.

    Note: As written, this will only work for fully numeric district assignments.
    The digits taken from a column name are compared to the value in the `level` column as-is,
    so an assignment stored as a number or with different zero padding is reported as a mismatch.
    '''

    # Convert from the name of the district assignment column, to how it is referred to in column names
    finding = level_race_name_dict[level]
    assignment_columns = ["SLDU_DIST", "SLDL_DIST", "CONG_DIST"]
    # Raw string: '\d' inside a normal string literal is an invalid escape sequence
    district_pattern = re.compile(re.escape(finding) + r'\d*')

    # Which columns belong to this district type, and to which district, depends only on
    # the column names, so it is worked out once instead of once per row
    district_by_column = {}
    for col in election_results_df.columns:
        if finding not in col or col == level or col in assignment_columns:
            continue
        # Makes sure that "CON", "SL", or "SU" wasn't found in a name (more than 5 characters into the col name)
        if col.find(finding) >= 5:
            continue
        # Slicing off len(finding) leaves just the digits; there can be more than one
        district_by_column[col] = district_pattern.search(col).group(0)[len(finding):]

    # This boolean is used to help clean up the print statements
    any_error = False
    # Iterate over the dataframe row-by-row
    for _, row in election_results_df.iterrows():
        # Get the district assignment for that row
        district_assignment = row[level]
        for col, col_district in district_by_column.items():
            # Non-zero votes in a column whose district differs from the row's assignment
            if row[col] != 0 and col_district != district_assignment:
                if not any_error:
                    print("***ERROR SPOTTED***")
                print("District Assignment: ", district_assignment)
                print("Value" , col_district)
                print("Column", col)
                print("Number of votes", row[col])
                print(row["UNIQUE_ID"])
                print(" ")
                any_error = True
    return any_error
def run_validation(election_results_df):
    '''
    This code validates a handful of things in a given ERJ file:
    1) Whether the values in the "UNIQUE_ID" column are indeed unique
    2) That there is a "COUNTYFP" column
    3) That "UNIQUE_ID" and "COUNTYFP" are in the right order within the columns
    4) That the actual votes assigned in cases of districts match the stated district assignments.
        In other words, if we say a precinct is in CONG_DIST 4, does it only receive votes in CONG_DIST 4
        This function calls the "district_assignment_errors" function above to perform this check

    election_results_df : DataFrame loaded from an ERJ file; it is not modified.

    Raises AssertionError on the first check that fails and prints a "CONFIRMED" line
    for every check that passes. Returns None.
    '''
    columns = list(election_results_df.columns)

    # Confirm that UNIQUE_ID column is unique (missing values count as values; an empty file passes)
    assert election_results_df["UNIQUE_ID"].is_unique, "Non-unique UNIQUE_ID"
    print("CONFIRMED: UNIQUE_ID is unique")

    # Confirm that COUNTYFP column has been added
    assert 'COUNTYFP' in columns, "No 'COUNTYFP' column"
    print("CONFIRMED: COUNTYFP column added")

    # Check order of columns
    assert columns[:2] == ["UNIQUE_ID", "COUNTYFP"], "Incorrect column placement"
    print("CONFIRMED: UNIQUE_ID and COUNTYFP columns in the right position")

    # Confirm that stated district assignment, matches actual district assignment.
    # The congressional column may be called "CONG_DIST" or "CON_DIST", but the district
    # check looks the assignment up under "CONG_DIST", so a "CON_DIST"-only file is
    # checked through a renamed copy.
    if "CONG_DIST" in columns:
        cong_df = election_results_df
    elif "CON_DIST" in columns:
        cong_df = election_results_df.rename(columns={"CON_DIST": "CONG_DIST"})
    else:
        cong_df = None
    if cong_df is not None:
        assert not district_assignment_errors(cong_df, "CONG_DIST"), "Bad CONG_DIST assignment"
        print("CONFIRMED: Actual congressional votes match CONG_DIST assignment")

    # The two state legislative checks only differ in column name and label
    for level, label in (("SLDU_DIST", "su"), ("SLDL_DIST", "sl")):
        if level in columns:
            assert not district_assignment_errors(election_results_df, level), "Bad " + level + " assignment"
            print("CONFIRMED: Actual " + label + " votes match " + level + " assignment")
def validate_file(file_name):
    '''
    This is the function that will call the other needed functions

    file_name : Should be a string, something like "az_gen_20_prec"
                (a folder located next to the notebook)

    Every shapefile in that folder is loaded and passed to run_validation.
    Returns None.
    '''
    folder = "./" + file_name + "/"

    # Iterate over the files within that filepath, in a fixed (sorted) order
    for val in sorted(os.listdir(folder)):
        # Only real shapefiles: a name such as "x.shp.xml" contains ".shp" but is not one
        if not val.lower().endswith(".shp"):
            continue
        print("Running check on:", val)
        file_load = gp.read_file(folder + val)
        run_validation(file_load)
        print("")
# Name of the ERJ folder to validate - change this to your own folder
file_name = "ga_prim_20_prec"

# Kick off the validation for that folder
validate_file(file_name)

# Usage example with another folder:
# file_name = "az_gen_20_prec"
# validate_file(file_name)

Running check on: ga_prim_20_sldu_prec.shp
CONFIRMED: UNIQUE_ID is unique
CONFIRMED: COUNTYFP column added
CONFIRMED: UNIQUE_ID and COUNTYFP columns in the right position
CONFIRMED: Actual su votes match SLDU_DIST assignment

Running check on: ga_prim_20_sldl_prec.shp
CONFIRMED: UNIQUE_ID is unique
CONFIRMED: COUNTYFP column added
CONFIRMED: UNIQUE_ID and COUNTYFP columns in the right position
CONFIRMED: Actual sl votes match SLDL_DIST assignment

Running check on: ga_prim_20_st_prec.shp
CONFIRMED: UNIQUE_ID is unique
CONFIRMED: COUNTYFP column added
CONFIRMED: UNIQUE_ID and COUNTYFP columns in the right position

Running check on: ga_prim_20_cong_prec.shp
CONFIRMED: UNIQUE_ID is unique
CONFIRMED: COUNTYFP column added
CONFIRMED: UNIQUE_ID and COUNTYFP columns in the right position
CONFIRMED: Actual congressional votes match CONG_DIST assignment

