In [1]:
import geopandas as gp
import pandas as pd
import os
import re

# These display settings are essential for the fields table to show up correctly in the README
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [2]:
# Maps each district-assignment column to the office code used for it in column names
level_race_name_dict = dict(
    CONG_DIST="CON",
    SLDL_DIST="SL",
    SLDU_DIST="SU",
    SCONG_DIST="SCON",
)

In [9]:
def _district_label(value):
    '''
    Return a district assignment as a string, whatever type it was stored as.
    Whole-number floats (e.g. 4.0) are rendered without the fractional part so
    they can be compared with the digits embedded in a column name.
    '''
    if isinstance(value, float) and value.is_integer():
        value = int(value)
    return str(value).strip()


def district_assignment_errors(election_results_df, level):
    '''
    This function will check whether the votes in a precinct match the district assignment

    election_results_df : dataframe with one row per precinct, a district assignment
        column named `level`, and vote columns whose names embed the office code and
        district number (e.g. "GCON04DMCE")
    level : name of the district assignment column; must be a key of level_race_name_dict

    Returns True if any precinct has non-zero votes in a district other than the one
    it is assigned to (details are printed), otherwise False.

    Notes:
    - District numbers are compared ignoring leading zeros, so "4", "04" and 4 all
      match a "CON04" column.
    - Columns are only checked when the office code starts within the first 5
      characters of the name and is followed by at least one digit. Assignment
      columns (e.g. "SCONG_DIST") and columns belonging to a longer office code
      (e.g. "SCON" when checking "CON") are skipped.
    '''

    # Convert from the name of the district assignment column, to how it is referred to in column names
    finding = level_race_name_dict[level]
    # Assignment columns hold district labels, not votes, so they are never checked
    assignment_columns = set(level_race_name_dict)
    # Office codes that contain this one (e.g. "SCON" contains "CON") must not be mistaken for it
    longer_codes = [code for code in level_race_name_dict.values()
                    if code != finding and finding in code]
    district_pattern = re.compile(re.escape(finding) + r'(\d+)')

    # Work out once, per column, which district (if any) the column reports votes for
    columns_to_check = {}
    for col in election_results_df.columns:
        if not isinstance(col, str) or col in assignment_columns:
            continue
        position = col.find(finding)
        # Makes sure that "CON", "SL", or "SU" wasn't found in a name (5 or more characters into the col name)
        if position < 0 or position >= 5:
            continue
        part_of_longer_code = any(
            col.startswith(code, position - code.find(finding))
            for code in longer_codes
            if position >= code.find(finding)
        )
        if part_of_longer_code:
            continue
        match = district_pattern.match(col, position)
        # No digits after the office code means there is no district number to compare
        if match:
            columns_to_check[col] = match.group(1)

    # This boolean is used to help clean up the print statements
    any_error = False
    # Iterate over the dataframe row-by-row
    for index, row in election_results_df.iterrows():
        district_assignment = _district_label(row[level])
        for col, col_district in columns_to_check.items():
            votes = row[col]
            # Only columns with non-zero votes can contradict the assignment
            if votes == 0:
                continue
            if col_district.lstrip("0") == district_assignment.lstrip("0"):
                continue
            if not any_error:
                print("***ERROR SPOTTED***")
            print("District Assignment: ", district_assignment)
            print("Value", col_district)
            print("Column", col)
            print("Number of votes", votes)
            print(row.get("UNIQUE_ID", index))
            print(" ")
            any_error = True
    return any_error

In [10]:
def run_validation(election_results_df):
    '''
    This code validates a handful of things in a given ERJ file:
    1) Whether the values in the "UNIQUE_ID" column are indeed unique
    2) That there is a "COUNTYFP" column
    3) That "UNIQUE_ID" and "COUNTYFP" are in the right order within the columns
    4) That the actual votes assigned in cases of districts match the stated district assignments.
        In other words, if we say a precinct is in CONG_DIST 4, does it only receive votes in CONG_DIST 4
        This function calls the "district_assignment_errors" function to perform this check
        for every assignment column present (CONG_DIST, SLDU_DIST, SLDL_DIST, SCONG_DIST)

    election_results_df : dataframe loaded from the ERJ file

    Prints a "CONFIRMED" line for each check that passes and returns None.
    Raises AssertionError with a short description as soon as a check fails.
    '''
    columns = list(election_results_df.columns)

    # Confirm that UNIQUE_ID column is present and unique (missing values count as duplicates of each other)
    if "UNIQUE_ID" not in columns:
        raise AssertionError("No 'UNIQUE_ID' column")
    if not election_results_df["UNIQUE_ID"].is_unique:
        raise AssertionError("Non-unique UNIQUE_ID")
    print("CONFIRMED: UNIQUE_ID is unique")

    # Confirm that COUNTYFP column has been added
    if 'COUNTYFP' not in columns:
        raise AssertionError("No 'COUNTYFP' column")
    print("CONFIRMED: COUNTYFP column added")

    # Check order of columns
    if columns[:2] != ["UNIQUE_ID", "COUNTYFP"]:
        raise AssertionError("Incorrect column placement")
    print("CONFIRMED: UNIQUE_ID and COUNTYFP columns in the right position")

    # The district check looks the assignment up by the name "CONG_DIST"; fail with a
    # clear message rather than a KeyError when only the "CON_DIST" spelling is present
    if "CON_DIST" in columns and "CONG_DIST" not in columns:
        raise AssertionError("Found 'CON_DIST' column; rename it to 'CONG_DIST' to validate congressional assignments")

    # Confirm that stated district assignment, matches actual district assignment.
    # A "CONFIRMED" line is only printed after the corresponding check has actually run.
    district_checks = (
        ("CONG_DIST", "congressional"),
        ("SLDU_DIST", "su"),
        ("SLDL_DIST", "sl"),
        ("SCONG_DIST", "scong"),
    )
    for level, label in district_checks:
        if level not in columns:
            continue
        if district_assignment_errors(election_results_df, level):
            raise AssertionError("Bad " + level + " assignment")
        print("CONFIRMED: Actual " + label + " votes match " + level + " assignment")

In [11]:
def validate_file(file_name):
    '''
    This is the function that will call the other needed functions

    file_name : Should be a string, something like "az_gen_20_prec"; it is the name of
        a folder, looked up relative to the current working directory

    Every file in that folder whose name ends in ".shp" (case-insensitive) is loaded
    and passed to run_validation, in alphabetical order. Sidecar files such as
    "name.shp.xml" are ignored. Returns None.
    '''
    folder = os.path.join(".", file_name)

    # Match on the extension itself so that e.g. ".shp.xml" metadata files are not read as shapefiles
    shapefiles = sorted(
        entry for entry in os.listdir(folder)
        if entry.lower().endswith(".shp")
    )

    # Iterate over the shapefiles within that filepath
    for shapefile in shapefiles:
        print("Running check on:", shapefile)
        file_load = gp.read_file(os.path.join(folder, shapefile))
        print(file_load.columns)
        run_validation(file_load)
        print("")

In [12]:
# Name of the ERJ folder to validate; validate_file looks for it under the current working directory
file_name = "va_gen_20_prec"

# Validate every shapefile found in that folder
validate_file(file_name)

# Example call for another state:
# file_name = "az_gen_20_prec"
# validate_file(file_name)

Running check on: va_gen_20_st_cong_prec.shp
Index(['UNIQUE_ID', 'COUNTYFP', 'LOCALITY', 'VTDST', 'PRECINCT', 'CONG_DIST',
       'G20PREDBID', 'G20PRELJOR', 'G20PREOWRI', 'G20PRERTRU', 'G20USSDWAR',
       'G20USSOWRI', 'G20USSRGAD', 'GCON01DRAS', 'GCON01OWRI', 'GCON01RWIT',
       'GCON02DLUR', 'GCON02IFOS', 'GCON02OWRI', 'GCON02RTAY', 'GCON03DSCO',
       'GCON03OWRI', 'GCON03RCOL', 'GCON04DMCE', 'GCON04OWRI', 'GCON04RBEN',
       'GCON05DWEB', 'GCON05OWRI', 'GCON05RGOO', 'GCON06DBET', 'GCON06OWRI',
       'GCON06RCLI', 'GCON07DSPA', 'GCON07OWRI', 'GCON07RFRE', 'GCON08DBEY',
       'GCON08OWRI', 'GCON08RJOR', 'GCON09OWRI', 'GCON09RGRI', 'GCON10DWEX',
       'GCON10OWRI', 'GCON10RAND', 'GCON11DCON', 'GCON11OWRI', 'GCON11RANA',
       'geometry'],
      dtype='object')
CONFIRMED: UNIQUE_ID is unique
CONFIRMED: COUNTYFP column added
CONFIRMED: UNIQUE_ID and COUNTYFP columns in the right position
CONFIRMED: Actual congressional votes match CONG_DIST assignment



In [13]:
'''Following Docstring Convention: https://www.python.org/dev/peps/pep-0257/'''

import pandas as pd
import os
# These display settings are essential for the fields table to show up correctly in the README
# (they repeat the settings already applied in the first cell)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)


def create_fields_table(race_field_header_0, fields_dict_0, 
                        add_race_field_header_1 = '', fields_dict_1 = {}, 
                        add_race_field_header_2 = '', fields_dict_2 = {}, 
                        add_race_field_header_3 = '', fields_dict_3 = {}):
    '''Purpose: Build the two-column fields table shown in the readme from up to four field dictionaries
    Arguments:
        race_field_header_0: heading for the first group of fields - include asterisks, "***text***"
        fields_dict_0: field name -> description for the primary (statewide) file
        add_race_field_header_1, _2, _3: optional headings for further groups - ex: "***additional_race_file_name_fields***"
        fields_dict_1, _2, _3: field name -> description for the group under the matching heading
    Returns:
        DataFrame with 'Field Name' and 'Description' columns. Each of the four groups
        contributes a blank row, a heading row (blank description), then one row per field;
        unused groups still contribute their blank row and (empty) heading row.
    '''
    sections = (
        (race_field_header_0, fields_dict_0),
        (add_race_field_header_1, fields_dict_1),
        (add_race_field_header_2, fields_dict_2),
        (add_race_field_header_3, fields_dict_3),
    )
    names = []
    descriptions = []
    for header, fields in sections:
        # Spacer row followed by the heading row, then the group's fields
        names.extend(['', header])
        names.extend(fields.keys())
        descriptions.extend(['', ''])
        descriptions.extend(fields.values())
    return pd.DataFrame({'Field Name': names, 'Description': descriptions})

def erj_readme_template(stateabrv, state, year, election_type, additional_races, retrieval_date, vest_file_link, 
                        raw_data_source, state_erj_repo, office_codes, 
                        race_field_header_0, fields_dict_0, add_race_field_header_1 = '', fields_dict_1 = {}, add_race_field_header_2 = '', fields_dict_2 = {}, 
                        add_race_field_header_3 = '', fields_dict_3 = {},
                       additional_notes=' '):
    '''Purpose: standardize language in ERJ file README.txt
    Arguments:
        stateabrv = two character state abbreviation capitalized, 
        state = state name, first letter capitalized, 
        year = election year (XXXX), 
        election_type = general, primary, special or runoff, 
        additional_races = the races that RDH added to the original vest file (not in VEST's og file), 
        retrieval_date = date RDH retrieved VEST file, 
        vest_file_link = link to dataverse page for VEST's precinct boundary and election results file, 
        raw_data_source = site description and link, 
        state_erj_repo = link to erj github repository for given state
        office_codes = codes used broken off of field names for easy viewing. 
            For SU/SL/CON, include ##, so SU## for office code.
            May be a ready-made string (inserted as is) or a dict of code -> office name,
            which is written out as one "CODE - Office name" line per entry.
        race_field_header_0: include asterisks "***text***" and label first set of fields
        fields_dict_0: the default dictionary for the primary file (statewide).
                    Key is the field/value is the field description
        add_race_field_header_1: include asterisks to draw attention to section - ex: "***additional_race_file_name_fields***"
        fields_dict_1: additional fields to go under add_race_field_header_1 header
        add_race_field_header_2 and _3: same use as add_race_field_header_1 - include as needed
        fields_dict_2 and _3: same use as fields_dict_1 - include as needed associated with corresponding add_race_field_header section
        additional_notes = default set to empty, but fill in with string where applicable.
    Returns:
        The full README text as a single string.
    '''
    # A dict would otherwise land in the README as its Python repr ("{'PRE': 'President', ...}")
    if isinstance(office_codes, dict):
        office_codes = '\n'.join('{} - {}'.format(code, office) for code, office in office_codes.items())

    #First section of README
    readme_p1 = '''{year} {stateabrv} {election_type} Precinct Boundary and Election Results
## RDH Date Retrieval
{retrieval_date}
## Sources
The RDH retrieved the VEST {year} {election_type} precinct boundary and election results shapefile from [VEST's Harvard Dataverse]({vest_file_link})
The RDH retrieved raw {year} {election_type} election results from {raw_data_source}
## Notes on Field Names (adapted from VEST):
Columns reporting votes generally follow the pattern: 
The first character is G for a general election, P for a primary, S for a special, and R for a runoff.
Characters 2 and 3 are the year of the election.*
Characters 4-6 represent the office type (see list below).
Character 7 represents the party of the candidate.
Characters 8-10 are the first three letters of the candidate's last name.
One example is:
G16PREDCLI
To fit within the GIS 10 character limit for field names, the naming convention is slightly different for the State Legislature and 
US House of Representatives. All fields are listed below with definitions.
Office Codes Used:
{office_codes}
## Fields:
'''.format(stateabrv= stateabrv, state= state, year=year, election_type=election_type, additional_races=additional_races,retrieval_date=retrieval_date, vest_file_link=vest_file_link, raw_data_source=raw_data_source, state_erj_repo=state_erj_repo, office_codes=office_codes)

    #Second section of README
    fields_table = create_fields_table(race_field_header_0, fields_dict_0, add_race_field_header_1, fields_dict_1, add_race_field_header_2, fields_dict_2, 
                        add_race_field_header_3, fields_dict_3)
    # Left-justify both columns, padding each to the width of its longest entry
    description_width = fields_table['Description'].str.len().max()
    field_name_width = fields_table['Field Name'].str.len().max()
    readme_p2 = fields_table.to_string(
        formatters={'Description': '{{:<{}s}}'.format(description_width).format,
                    'Field Name': '{{:<{}s}}'.format(field_name_width).format},
        index=False)

    #Third section of README
    readme_p3 = '''
## Processing Steps
    
The RDH joined additional election results to VEST's existing precinct shapefile, including {additional_races} using Python.
For more information on the processing completed, visit our [Github repository]({state_erj_repo}) for Election Result Joins (ERJ) for {state}.
Where possible, the RDH validated the election results we processed against VEST's election results. For additional races the RDH manually checked state totals. For more information on this comparison, please see our processing on Github ({state_erj_repo}).
## Additional Notes
{additional_notes}
Please contact info@redistrictingdatahub.org for more information.
'''.format(stateabrv=stateabrv, state=state, year=year, election_type=election_type, additional_races=additional_races, state_erj_repo=state_erj_repo, office_codes=office_codes, additional_notes = additional_notes)
    
    full_readme = str(readme_p1)+str(readme_p2)+str(readme_p3)
    return full_readme

def export_readme(readme_name, state, election_type, full_readme_text):
    
    '''Purpose: Turn README string into a txt file in the ERJ folder
    Arguments:
        readme_name = path of the file to create; must include file path to readme within erj folder
            ex: 
            readme_name = './az_gen_20_prec/README.txt'
        state, election_type = only used in the confirmation message printed at the end
        full_readme_text = the README contents to write
    Notes:
        The file is written as UTF-8 so names with non-ASCII characters are saved
        the same way on every platform.
        The file is opened in exclusive-creation mode, so an existing README is never
        overwritten: FileExistsError is raised instead.
    '''
    with open(readme_name, 'x', encoding='utf-8') as tf:
        tf.write(full_readme_text)
    print(state, election_type, " readme moved to folder")

In [14]:
# Values substituted into the README template; see the erj_readme_template docstring
# for what each one means
stateabrv = "VA"
state = "Virginia"
year = "2020"
election_type = "General"
additional_races = "President, U.S. Senator, U.S. Congressional, U.S. Congressional Special Election"
retrieval_date = "08/22/2022"
# NOTE(review): erj_readme_template as defined in this notebook has no upload_date
# parameter - confirm whether this value is still needed
upload_date = "12/08/2022" 
vest_file_link = "https://dataverse.harvard.edu/file.xhtml?fileId=6174181&version=40.0"
raw_data_source = "Election results from Open Elections (https://historical.elections.virginia.gov/) and precinct shapefiles from VEST's VA 2020 file.\n" 
state_erj_repo = "https://github.com/nonpartisan-redistricting-datahub/erj-va"
                
# Free-text notes for the "Additional Notes" section of the README
additional_notes = '''
~all files~

'''


In [15]:
# Maps "<candidate>-:-<party>-:-<contest>" labels ("All Others" entries carry no party)
# to the 10-character field name used for that candidate's votes
va_cand_dict = {
    "All Others-:-Election11": "GCON11OWRI",
    "Elaine Goodman Luria-:-Democratic-:-Election2": "GCON02DLUR",
    "Scott William Taylor-:-Republican-:-Election2": "GCON02RTAY",
    "David Bruce Foster-:-Independent-:-Election2": "GCON02IFOS",
    "All Others-:-Election2": "GCON02OWRI",
    "Robert Joseph Wittman-:-Republican-:-Election1": "GCON01RWIT",
    "Qasim Rashid-:-Democratic-:-Election1": "GCON01DRAS",
    "All Others-:-Election1": "GCON01OWRI",
    "Robert George Good-:-Republican-:-Election5": "GCON05RGOO",
    "Bryant Cameron Webb-:-Democratic-:-Election5": "GCON05DWEB",
    "All Others-:-Election5": "GCON05OWRI",
    "Robert Cortez Scott-:-Democratic-:-Election3": "GCON03DSCO",
    "John William Collick, Jr-:-Republican-:-Election3": "GCON03RCOL",
    "All Others-:-Election3": "GCON03OWRI",
    "Jennifer Tosini Wexton-:-Democratic-:-Election10": "GCON10DWEX",
    "Aliscia Nicole Andrews-:-Republican-:-Election10": "GCON10RAND",
    "All Others-:-Election10": "GCON10OWRI",
    "Abigail Anne Davis Spanberger-:-Democratic-:-Election7": "GCON07DSPA",
    "Nicholas J. Freitas-:-Republican-:-Election7": "GCON07RFRE",
    "All Others-:-Election7": "GCON07OWRI",
    "Donald Sternoff Beyer, Jr.-:-Democratic-:-Election8": "GCON08DBEY",
    "Jeffrey Alan Jordan-:-Republican-:-Election8": "GCON08RJOR",
    "All Others-:-Election8": "GCON08OWRI",
    "A. Donald McEachin-:-Democratic-:-Election4": "GCON04DMCE",
    "Leon Benjamin, Sr-:-Republican-:-Election4": "GCON04RBEN",
    "All Others-:-Election4": "GCON04OWRI",
    "Howard Morgan Griffith-:-Republican-:-Election9": "GCON09RGRI",
    "All Others-:-Election9": "GCON09OWRI",
    "Benjamin Lee Cline-:-Republican-:-Election6": "GCON06RCLI",
    "Nicholas Anthony Betts-:-Democratic-:-Election6": "GCON06DBET",
    "All Others-:-Election6": "GCON06OWRI",
    "Gerald Edward Connolly-:-Democratic-:-Election11": "GCON11DCON",
    "Manga Alamelu Anantatmula-:-Republican-:-Election11": "GCON11RANA",
    "Joseph Robinette Biden, Jr-:-Democratic-:-President": "G20PREDBID",
    "Donald J. Trump-:-Republican-:-President": "G20PRERTRU",
    "Jo Jorgensen-:-Libertarian-:-President": "G20PRELJOR",
    "All Others-:-President": "G20PREOWRI",
    "Mark Robert Warner-:-Democratic-:-Senate": "G20USSDWAR",
    "Daniel Mac Arthur Gade-:-Republican-:-Senate": "G20USSRGAD",
    "All Others-:-Senate": "G20USSOWRI",
}

In [22]:
# Vote field names in the order they should appear in the README fields table
races_list = [
    # President and U.S. Senate
    'G20PREDBID', 'G20PRELJOR', 'G20PREOWRI', 'G20PRERTRU',
    'G20USSDWAR', 'G20USSOWRI', 'G20USSRGAD',
    # U.S. House, one group per district number
    'GCON01DRAS', 'GCON01OWRI', 'GCON01RWIT',
    'GCON02DLUR', 'GCON02IFOS', 'GCON02OWRI', 'GCON02RTAY',
    'GCON03DSCO', 'GCON03OWRI', 'GCON03RCOL',
    'GCON04DMCE', 'GCON04OWRI', 'GCON04RBEN',
    'GCON05DWEB', 'GCON05OWRI', 'GCON05RGOO',
    'GCON06DBET', 'GCON06OWRI', 'GCON06RCLI',
    'GCON07DSPA', 'GCON07OWRI', 'GCON07RFRE',
    'GCON08DBEY', 'GCON08OWRI', 'GCON08RJOR',
    'GCON09OWRI', 'GCON09RGRI',
    'GCON10DWEX', 'GCON10OWRI', 'GCON10RAND',
    'GCON11DCON', 'GCON11OWRI', 'GCON11RANA',
]

In [23]:
# Invert va_cand_dict: field name -> full candidate label
cleaner_cand_dict = {field: label for label, field in va_cand_dict.items()}

In [28]:
# Field name -> candidate name only (the text before the first "-:-"), in races_list order
sorted_cand_dict = {}
for field in races_list:
    sorted_cand_dict[field] = cleaner_cand_dict[field].partition("-:-")[0]

In [29]:
sorted_cand_dict

{'G20PREDBID': 'Joseph Robinette Biden, Jr',
 'G20PRELJOR': 'Jo Jorgensen',
 'G20PREOWRI': 'All Others',
 'G20PRERTRU': 'Donald J. Trump',
 'G20USSDWAR': 'Mark Robert Warner',
 'G20USSOWRI': 'All Others',
 'G20USSRGAD': 'Daniel Mac Arthur Gade',
 'GCON01DRAS': 'Qasim Rashid',
 'GCON01OWRI': 'All Others',
 'GCON01RWIT': 'Robert Joseph Wittman',
 'GCON02DLUR': 'Elaine Goodman Luria',
 'GCON02IFOS': 'David Bruce Foster',
 'GCON02OWRI': 'All Others',
 'GCON02RTAY': 'Scott William Taylor',
 'GCON03DSCO': 'Robert Cortez Scott',
 'GCON03OWRI': 'All Others',
 'GCON03RCOL': 'John William Collick, Jr',
 'GCON04DMCE': 'A. Donald McEachin',
 'GCON04OWRI': 'All Others',
 'GCON04RBEN': 'Leon Benjamin, Sr',
 'GCON05DWEB': 'Bryant Cameron Webb',
 'GCON05OWRI': 'All Others',
 'GCON05RGOO': 'Robert George Good',
 'GCON06DBET': 'Nicholas Anthony Betts',
 'GCON06OWRI': 'All Others',
 'GCON06RCLI': 'Benjamin Lee Cline',
 'GCON07DSPA': 'Abigail Anne Davis Spanberger',
 'GCON07OWRI': 'All Others',
 'GCON07RF

In [31]:
import os

# Start the README fields dictionary from a copy of the candidate fields, so that
# entries added to fields_dict_0 later do not also end up in sorted_cand_dict
fields_dict_0 = dict(sorted_cand_dict)
# Heading shown above this file's fields in the README table
race_field_header_0 = "***va_gen_20_st_cong_prec.zip***"

In [32]:
# Descriptions for the non-vote columns of the shapefile
fields_dict_0.update({
    "UNIQUE_ID": "Unique Identifier",
    'COUNTYFP': "County FIP",
    # Previously labelled "State FIP", which describes STATEFP rather than this column.
    # NOTE(review): confirm the wording against the shapefile's LOCALITY values
    'LOCALITY': "Locality Name",
    "VTDST": "VTD Code",
    'PRECINCT': "Precinct Name",
    'CONG_DIST': "Congressional District",
})

In [33]:
# Field name -> description for the non-vote columns
fields_dict = {
    "UNIQUE_ID": "Unique Identifier",
    'COUNTYFP': "County FIP",
    'STATEFP': "State FIP",
    "VTDST": "VTD Code",
    "NAME": "VTD Name",
    "COUNTY": "County Name",
    "PREC_NAME": "Precinct Name",
    "CONG_DIST": "U.S. Congressional District",
}

In [34]:
# Office code (as it appears in field names) -> office name
office_dict = {
    'PRE': 'President',
    "USS": 'U.S. Senator',
    "CONXX": 'U.S. Congress',
}




In [35]:
# Office-code mapping handed to erj_readme_template as its office_codes argument
# (this is the same dict object as office_dict, not a copy)
office_codes = office_dict

In [36]:
# Build the README text, including the additional notes defined above.
# Arguments are passed by keyword so each value lands on the parameter it is meant for;
# the earlier positional call did not line up with erj_readme_template's signature
# (it started with fields_dict and included upload_date, neither of which is a parameter).
full_readme = erj_readme_template(
    stateabrv=stateabrv,
    state=state,
    year=year,
    election_type=election_type,
    additional_races=additional_races,
    retrieval_date=retrieval_date,
    vest_file_link=vest_file_link,
    raw_data_source=raw_data_source,
    state_erj_repo=state_erj_repo,
    office_codes=office_codes,
    race_field_header_0=race_field_header_0,
    fields_dict_0=fields_dict_0,
    additional_notes=additional_notes)

In [37]:
# Build the README text with a blank "Additional Notes" section
full_readme = erj_readme_template(
    stateabrv=stateabrv,
    state=state,
    year=year,
    election_type=election_type,
    additional_races=additional_races,
    retrieval_date=retrieval_date,
    vest_file_link=vest_file_link,
    raw_data_source=raw_data_source,
    state_erj_repo=state_erj_repo,
    office_codes=office_codes,
    race_field_header_0=race_field_header_0,
    fields_dict_0=fields_dict_0,
    additional_notes=' ',
)

In [38]:
# Write the README into the ERJ folder. export_readme opens the file in 'x' mode,
# so this raises FileExistsError if README.txt is already there.
export_readme("./va_gen_20_prec/README.txt", "Virginia", "general", full_readme)

Virginia general  readme moved to folder
