In [81]:
import geopandas as gp
import pandas as pd
import os
import re

In [82]:
level_race_name_dict = {"CONG_DIST":"CON",
    "SLDL_DIST":"SL",
    "SLDU_DIST":"SU"}

In [83]:
def district_assignment_errors(election_results_df, level):
    '''
    This function will check whether the votes in a precinct match the district assignment
    Note: As written, this will only work for fully numeric district assignments
    '''
    
    # This boolean is used to help clean up the print statements
    any_error = False
    # Convert from the name of the district assignment column, to how it is referred to in column names
    finding = level_race_name_dict[level]
    # Iterate over the dataframe row-by-row
    for index,row in election_results_df.iterrows():
        # Get the district assignment for that row
        district_assignment = row[level]
        # Iterate over every row column by column
        for val in row.index:
            # If the name for the type of district is in the column name and there are non-zero votes
            if finding in val and row[val] != 0 and val!= level and val not in ["SLDU_DIST","SLDL_DIST","CONG_DIST"]:
                # Grab the numbers for district assignments
                regex_string = finding+'\d*'
                # The len(finding) part is needed here as sometimes there is more than one digit to the district
                col_district = re.findall(regex_string, val)[0][len(finding):]
                # Makes sure that "CON", "SL", or "SU" wasn't found in a name (more than 5 characters into the col name)
                if (val.find(finding) < 5):
                    # If the district number in the column name doesn't equal the district name in the assignment column
                    if (col_district != district_assignment):
                        print(val.find(finding))
                        print(re.findall(regex_string, val))
                        if not(any_error):
                            print("***ERROR SPOTTED***")
                        print("District Assignment: ", district_assignment)
                        print("Value" , col_district)
                        print("Column", val)
                        print("Number of votes", row[val])
                        print(row["UNIQUE_ID"])
                        print(" ")
                        any_error = True
    return any_error

In [96]:
def run_validation(election_results_df):
    '''
    This code validates a handful of things in a given ERJ file:
    1) Whether the values in the "UNIQUE_ID" column are indeed unique
    2) That there is a "COUNTYFP" column
    3) That "UNIQUE_ID" and "COUNTYFP" are in the right order within the columns
    4) That the actual votes assigned in cases of districts match the stated district assignments.
        In other words, if we say a precinct is in CONG_DIST 4, does it only receive votes in CONG_DIST 4
        This function calls the "district_assignment_errors" function above to perform this check
    '''
    
    # Confirm that UNIQUE_ID column is unique
    assert(max(election_results_df["UNIQUE_ID"].value_counts(dropna=False)) == 1), "Non-unique UNIQUE_ID"
    print("CONFIRMED: UNIQUE_ID is unique")
    
    # Confirm that COUNTYFP column has been added
    assert('COUNTYFP' in list(election_results_df.columns)), "No 'COUNTYFP' column"
    print("CONFIRMED: COUNTYFP column added")
    
    #Check order of columns
    assert((list(election_results_df.columns)[0]=="UNIQUE_ID") and (list(election_results_df.columns)[1]=="COUNTYFP")), "Incorrect column placement"
    print("CONFIRMED: UNIQUE_ID and COUNTYFP columns in the right position")
    
    # Confirm that stated district assignment, matches actual district assignment
    contains_cong_assignment = "CONG_DIST" in list(election_results_df.columns) or "CON_DIST" in list(election_results_df.columns)
    if(contains_cong_assignment):
        assert(not(district_assignment_errors(election_results_df, "CONG_DIST"))), "Bad CONG_DIST assignment"
        print("CONFIRMED: Actual congressional votes match CONG_DIST assignment")
        
    contains_sldu_assignment = "SLDU_DIST" in list(election_results_df.columns)
    if(contains_sldu_assignment):
        assert(not(district_assignment_errors(election_results_df, "SLDU_DIST"))), "Bad SLDU_DIST assignment"
        print("CONFIRMED: Actual su votes match SLDU_DIST assignment")
        
    contains_sldl_assignment = "SLDL_DIST" in list(election_results_df.columns)
    if(contains_sldl_assignment):
        assert(not(district_assignment_errors(election_results_df, "SLDL_DIST"))), "Bad SLDL_DIST assignment"
        print("CONFIRMED: Actual sl votes match SLDL_DIST assignment")    

In [97]:
def validate_file(file_name):
    '''
    This is the function that will call the other needed functions
    
    file_name : Should be a string, something like "az_gen_20_prec"
    '''
    files = os.listdir("./"+file_name+"/")
    
    # Iterate over the files within that filepath
    for val in files:
        if ".shp" in val:
            print("Running check on:", val)
            file_load = gp.read_file("./"+file_name+"/"+val)
            run_validation(file_load)
            print("")

In [99]:
# Fill this in with your file name
file_name = ""

# Run the below to call the validation
validate_file(file_name)

# Example Call
# file_name = "az_gen_20_prec"
# validate_file(file_name)

Running check on: az_gen_20_prec.shp
CONFIRMED: UNIQUE_ID is unique
CONFIRMED: COUNTYFP column added
CONFIRMED: UNIQUE_ID and COUNTYFP columns in the right position
CONFIRMED: Actual congressional votes match CONG_DIST assignment
CONFIRMED: Actual su votes match SLDU_DIST assignment
CONFIRMED: Actual sl votes match SLDL_DIST assignment

Running check on: az_prim_20_prec.shp
CONFIRMED: UNIQUE_ID is unique
CONFIRMED: COUNTYFP column added
CONFIRMED: UNIQUE_ID and COUNTYFP columns in the right position
CONFIRMED: Actual congressional votes match CONG_DIST assignment
CONFIRMED: Actual su votes match SLDU_DIST assignment
CONFIRMED: Actual sl votes match SLDL_DIST assignment

Running check on: fl_gen_20_sldu_prec.shp
CONFIRMED: UNIQUE_ID is unique
CONFIRMED: COUNTYFP column added
CONFIRMED: UNIQUE_ID and COUNTYFP columns in the right position
CONFIRMED: Actual su votes match SLDU_DIST assignment

Running check on: fl_gen_20_sldl_prec.shp
CONFIRMED: UNIQUE_ID is unique
CONFIRMED: COUNTYFP col

AssertionError: Bad SLDL_DIST assignment