## Precinct boundary file creation

In [1]:
import os
import numpy as np 
import geopandas as gp
import pandas as pd
import maup
from op_verification import reference_data
import time
import warnings
warnings.filterwarnings('ignore')

import re

#2022 precinct-level election results (tabular only, no geospatial data)
er22 = pd.read_csv("./raw-from-source/oh_2022_gen_prec/oh_2022_gen_prec.csv")
#Counties with new precinct boundaries since 2020 --> their 2022 precinct names must be matched to new shapefiles.
#"madison", "huron" and "allen" were initially listed but removed after confirming the 2022 pdf maps against the 2020 geospatial data.
prec_counties_new22_set = {
    "brown", "butler", "clark", "clermont", "columbiana", "cuyahoga", "delaware",
    "erie", "geauga", "hamilton", "hardin", "hocking", "lake", "lorain", "lucas",
    "marion", "medina", "mercer", "miami", "montgomery", "muskingum", "pickaway",
    "portage", "stark", "tuscarawas", "wood",
}


#2020 precinct boundaries + election results file, trimmed to the boundary/identifier columns only.
#(2022 boundaries are handled further down, in the precinct name matching section.)
bound20_columns = ['UNIQUE_ID', "STATEFP20", "COUNTYFP20", "PRECINCT20", "NAME20", "geometry"]
bound20rdh = gp.read_file("./raw-from-source/oh_gen_20_prec/oh_gen_20_st_prec.shp")[bound20_columns]
#Format county/county name columns to match ER22 for comparison:
#COUNTY is looked up from the "39"-prefixed county FIPS, COUNTYNM is that name lower-cased with its last 7 characters removed
bound20rdh['COUNTY'] = ("39" + bound20rdh['COUNTYFP20']).map(reference_data.geoid_to_county_name)
bound20rdh['COUNTYNM'] = bound20rdh['COUNTY'].str.lower().str[:-7]
county_name_to_fips_dict = bound20rdh.set_index('COUNTYNM')['COUNTYFP20'].to_dict()
print("Compare all precinct data between 2020 and 2022:")
#Confirm county names match up in both directions
bound20_county_names = set(bound20rdh['COUNTYNM'])
er22_county_names = set(er22["County"].str.lower())
assert not (bound20_county_names - er22_county_names)
assert not (er22_county_names - bound20_county_names)
#Summary counts for the 2020/2022 comparison
print(
    "2022 \nER unique_id nunique: ", er22["UNIQUE_ID"].nunique(),
    "\nshape: ", er22["UNIQUE_ID"].shape,
    "\nprec name nunique: ", er22["PRECNAME"].nunique(),
    "\nprec name w county nunique", (er22["PRECCODE"] + er22["County"]).nunique(),
)
print(
    "\n2020 \nPB NAME20 nunique: ", bound20rdh["NAME20"].nunique(),
    "\nshape:", bound20rdh.shape,
    "\nprec name nunique: ", bound20rdh["PRECINCT20"].nunique(),
    "\nprec name w county nunique", (bound20rdh["PRECINCT20"] + bound20rdh["COUNTYNM"]).nunique(),
)

print("\nCompare subset of counties that we expect to match between 2020 and 2022:")
#Counties not listed as having new 2022 boundaries: their 2020 geospatial data is reused
counties_reused22_set = set(er22["County"].str.lower())-prec_counties_new22_set
#2022 ER rows for the reused counties. Explicit copy: a column is added below, and writing to a
#filtered slice is only "silent" here because warnings are globally suppressed.
er22_reused_counties_df = er22[er22["County"].str.lower().isin(counties_reused22_set)].copy()
#2020 boundary rows for the reused counties, plus every "ZZZ" precinct regardless of county (explicit copy, same reason)
bound20_reused_counties_gdf = bound20rdh[(bound20rdh["COUNTYNM"].isin(counties_reused22_set))|(bound20rdh["PRECINCT20"]=="ZZZ")].copy()
#Check if precincts line up/what mismatches - expected only for ZZZ 0 voter precincts.
#Keys are precinct code + lower-case county name; COUNTYNM is already lower-case.
pb_prec_county_keys = set(bound20_reused_counties_gdf["PRECINCT20"]+bound20_reused_counties_gdf["COUNTYNM"].str.lower())
er_prec_county_keys = set(er22_reused_counties_df["PRECCODE"]+er22_reused_counties_df["County"].str.lower())
print("prec w/county diff: pb comp er len: ", len(pb_prec_county_keys-er_prec_county_keys))
print("prec w/county diff: er comp pb len: ", len(er_prec_county_keys-pb_prec_county_keys))
print("prec county diff: er comp pb names: ", er_prec_county_keys-pb_prec_county_keys)
print("prec county diff: pb comp er names: ", pb_prec_county_keys-er_prec_county_keys)

#Create common column to join on: "<COUNTY UPPER>-<precinct code>"
er22_reused_counties_df["UNIQUE_ID_code"] = er22_reused_counties_df["County"].str.upper() + "-"+er22_reused_counties_df["PRECCODE"]
bound20_reused_counties_gdf["UNIQUE_ID_code"] = bound20_reused_counties_gdf["COUNTYNM"].str.upper()+"-"+bound20_reused_counties_gdf["PRECINCT20"]
#Outer join of 2020 bounds with 2022 ER; the _merge indicator column records which side each row came from
pber_prec_reused_gdf = pd.merge(er22_reused_counties_df, bound20_reused_counties_gdf, on = "UNIQUE_ID_code", how = "outer", indicator=True)
print("\nCompare post-merge:")
print("shape merged gdf: ",pber_prec_reused_gdf.shape, "\nunmatched shape: ",pber_prec_reused_gdf[pber_prec_reused_gdf["_merge"]!="both"].shape)
print("county from er not matching: ",pber_prec_reused_gdf["County"][pber_prec_reused_gdf["_merge"]!="both"].unique())
print("county from bound not matching, not zzz: ",pber_prec_reused_gdf["COUNTYNM"][(pber_prec_reused_gdf["_merge"]!="both")&(pber_prec_reused_gdf["PRECINCT20"]!="ZZZ")].unique())


def check_precid_uniqueness(county_gdf_list, er_df=None):
    '''
    Check that each county's "prec_id" column is unique within the county and matches the election results.

    For every frame in county_gdf_list, the county name is read from the first row of its "county" column.
    The set of "prec_id" values must then equal the set of "PRECNAME" values of the election-result rows
    whose upper-cased "County" equals that name.

    Parameter county_gdf_list: list of dataframes, one per county, each with "county" and "prec_id" columns
    Parameter er_df: election results dataframe with "County" and "PRECNAME" columns;
        defaults to the module-level er22 dataframe
    Returns: the string "all counties pass check" when every county passes
    Raises: AssertionError describing the first county that is empty, has non-unique prec_id values,
        or does not match the election results
    '''
    if er_df is None:
        er_df = er22
    print("number of counties in list: ",len(county_gdf_list))
    # Upper-case the ER county names once instead of once (twice) per county
    er_county_upper = er_df["County"].str.upper()
    for county in county_gdf_list:
        if len(county) == 0:
            raise AssertionError("county dataframe is empty, cannot check prec_id")
        # Positional access: the frame's index is not guaranteed to contain the label 0
        county_name = county["county"].iloc[0]
        print("check prec_id for: ", county_name)
        # nunique() ignores missing values, so a missing prec_id also fails this check
        if county["prec_id"].nunique() != len(county):
            repeated = county["prec_id"][county["prec_id"].duplicated(keep=False)]
            raise AssertionError(
                f"{county_name}: prec_id is not unique or has missing values; "
                f"repeated values: {sorted(set(repeated), key=str)}"
            )
        boundary_ids = set(county["prec_id"])
        results_ids = set(er_df["PRECNAME"][er_county_upper == county_name])
        only_in_boundaries = boundary_ids - results_ids
        only_in_results = results_ids - boundary_ids
        if only_in_boundaries or only_in_results:
            raise AssertionError(
                f"{county_name}: prec_id does not match election results; "
                f"only in boundaries: {sorted(only_in_boundaries, key=str)}; "
                f"only in election results: {sorted(only_in_results, key=str)}"
            )

    return "all counties pass check"


print("Number of counties new to 2022//need precinct name matching: ",len(prec_counties_new22_set))


#Clark: Full match
clark = gp.read_file("./raw-from-source/boundaries/clark_precincts_2023/ClarkCountyPrecinctData.shp")
clark['prec_id'] = "PRECINCT "+clark['NAME']
#Shapefile spellings -> election-result spellings. Applied in this order, each one only to rows
#whose original NAME contains the left-hand text.
clark_renames = [
    ("BETHEL", "BETH"),
    (" 0", " "),
    ("MR ", "MR-"),
    ("HARMONY", "HARM"),
    ("T-0", "T-"),
    ("GERMAN", "GERM"),
    ("GREEN", "GREE"),
]
for clark_old, clark_new in clark_renames:
    clark_rows = clark['NAME'].str.contains(clark_old)
    clark.loc[clark_rows, "prec_id"] = clark['prec_id'].str.replace(clark_old, clark_new)
clark['county'] = 'CLARK'
clark['UNIQUE_ID'] = clark['county']+"-"+clark["prec_id"]
#Reproject to the CRS of the reused 2020 boundaries
clark = clark.to_crs(bound20_reused_counties_gdf.crs)


#Columbiana: Full match
columbiana = gp.read_file("./raw-from-source/boundaries/columbiana_precincts_2022/Voting_Precincts_2022.shp")
#Shapefile PRECNAME values that need a hand-written election-result name
columbiana_dict = {
    "HANOVER TWP NW": 'PRECINCT HANOVER TWP NORTH',
    "WELLSVILLE 2": 'PRECINCT WELLSVILLE  2',
    "PERRY SW": 'PRECINCT PERRY TWP SW',
    "PERRY NORTH": 'PRECINCT PERRY TWP NORTH',
    "LEETONIA": 'PRECINCT LEETONIA VIL',
    "NEGLEY": 'PRECINCT MIDDLETON TWP NEGLEY',
    "ROGERS": 'PRECINCT MIDDLETON TWP ROGERS',
    "ST CLAIR SOUTH": 'PRECINCT ST CLAIR TWP SOUTH',
    "WEST TWP E ROCHESTER": 'PRECINCT WEST TWP EAST ROCHESTER',
    "WELLSVILLE 1": 'PRECINCT WELLSVILLE  1',
}
#Default rule: upper-case name with TOWNSHIP shortened to TWP
columbiana_default_id = "PRECINCT " + columbiana["PRECNAME"].str.upper().str.replace("TOWNSHIP", "TWP")
columbiana["prec_id"] = columbiana_default_id
#Overrides are keyed on the raw PRECNAME
columbiana_override_rows = columbiana["PRECNAME"].isin(columbiana_dict.keys())
columbiana.loc[columbiana_override_rows, "prec_id"] = columbiana["PRECNAME"].map(columbiana_dict)
columbiana["county"] = "COLUMBIANA"
columbiana["UNIQUE_ID"] = columbiana["county"] + "-" + columbiana["prec_id"]
columbiana = columbiana.to_crs(bound20_reused_counties_gdf.crs)


#Delaware: Full match
delaware = gp.read_file("./raw-from-source/boundaries/delaware_precincts_2022/Precinct.shp")
delaware['prec_id'] = "PRECINCT " + delaware['PREC_NAME'].str.upper()
delaware["county"] = "DELAWARE"
#Rows naming one of these places get " CITY" inserted after the place name.
#Each override is rebuilt from the raw PREC_NAME, so a later place wins if a row matches more than one.
for delaware_city in ("COLUMBUS", "DUBLIN", "POWELL", "SUNBURY", "WESTERVILLE"):
    delaware_city_rows = delaware["PREC_NAME"].str.contains(delaware_city)
    delaware.loc[delaware_city_rows, "prec_id"] = "PRECINCT "+delaware["PREC_NAME"].str.replace(delaware_city+" ", delaware_city+" CITY ")
delaware["UNIQUE_ID"] = delaware["county"]+"-"+delaware["prec_id"]
delaware = delaware.to_crs(bound20_reused_counties_gdf.crs)


#Geauga: Full Match
geauga = gp.read_file("./raw-from-source/boundaries/geauga_precincts_2022/Precincts.shp")
#Hand-built lookup from the shapefile's PRECINCT2 code to the precinct name used in the election results
geauga_prec_dict = {"B-76":"PRECINCT THOMPSON TWP B","B-51":"PRECINCT MONTVILLE TWP B", "C-42":"PRECINCT HAMBDEN TWP C", "A-23":"PRECINCT CHARDON TWP A","D-22":"PRECINCT CHARDON CITY D", "A-44":"PRECINCT HUNTSBURG TWP A", "B-38":"PRECINCT CLARIDON TWP B", "E-56":"PRECINCT MUNSON TWP E",
"D-30":"PRECINCT CHESTER TWP D","AQV-36":"PRECINCT AQUILLA VILLAGE", "B-49":"PRECINCT MIDDLEFIELD TWP B", "C-18":"PRECINCT BURTON TWP C", "A-58":"PRECINCT NEWBURY TWP A", "A-70":"PRECINCT RUSSELL TWP A", "HVV-69":"PRECINCT HUNTING VALLEY VILL", 
"B-47":"PRECINCT MIDDLEFIELD VILL B",
"BUR V-15":"PRECINCT BURTON VILLAGE", "B-64":"PRECINCT PARKMAN TWP B", "A-77":"PRECINCT TROY TWP A", "E-5":"PRECINCT AUBURN TWP E","F-11":"PRECINCT BAINBRIDGE TWP F", "A-75":"PRECINCT THOMPSON TWP A", "A-50":"PRECINCT MONTVILLE TWP A", "B-45":"PRECINCT HUNTSBURG TWP B", 
"A-46":"PRECINCT MIDDLEFIELD VILL A", "A-63":"PRECINCT PARKMAN TWP A", "B-78":"PRECINCT TROY TWP B", "A-16":"PRECINCT BURTON TWP A", "B-17":"PRECINCT BURTON TWP B", "A-37":"PRECINCT CLARIDON TWP A", "B-41":"PRECINCT HAMBDEN TWP B", "A-40":"PRECINCT HAMBDEN TWP A",
"D-43":"PRECINCT HAMBDEN TWP D", "C-25":"PRECINCT CHARDON TWP C","D-26":"PRECINCT CHARDON TWP D", "B-24":"PRECINCT CHARDON TWP B", "A-19":"PRECINCT CHARDON CITY A", "C-21":"PRECINCT CHARDON CITY C", "B-20":"PRECINCT CHARDON CITY B", "B-53":"PRECINCT MUNSON TWP B", 
"C-54":"PRECINCT MUNSON TWP C","F-57":"PRECINCT MUNSON TWP F", "B-59":"PRECINCT NEWBURY TWP B", "C-60":"PRECINCT NEWBURY TWP C", "E-62":"PRECINCT NEWBURY TWP E","D-61":"PRECINCT NEWBURY TWP D", "C-3":"PRECINCT AUBURN TWP C", "B-2":"PRECINCT AUBURN TWP B", 
"A-1":"PRECINCT AUBURN TWP A","D-4":"PRECINCT AUBURN TWP D", "A-6":"PRECINCT BAINBRIDGE TWP A", "C-8":"PRECINCT BAINBRIDGE TWP C", "E-10":"PRECINCT BAINBRIDGE TWP E","H-13":"PRECINCT BAINBRIDGE TWP H","I-14":"PRECINCT BAINBRIDGE TWP I","D-9":"PRECINCT BAINBRIDGE TWP D",
"D-68":"PRECINCT SOUTH RUSSELL VILL D", "B-66":"PRECINCT SOUTH RUSSELL VILL B", "A-65":"PRECINCT SOUTH RUSSELL VILL A", "C-67":"PRECINCT SOUTH RUSSELL VILL C", "E-74":"PRECINCT RUSSELL TWP E", "C-72":"PRECINCT RUSSELL TWP C", "B-71":"PRECINCT RUSSELL TWP B",
"D-73":"PRECINCT RUSSELL TWP D", "C-29":"PRECINCT CHESTER TWP C","G-33":"PRECINCT CHESTER TWP G","H-34":"PRECINCT CHESTER TWP H","I-35":"PRECINCT CHESTER TWP I", "A-27":"PRECINCT CHESTER TWP A", "B-28":"PRECINCT CHESTER TWP B", "E-31":"PRECINCT CHESTER TWP E",
"F-32":"PRECINCT CHESTER TWP F", "B-7":"PRECINCT BAINBRIDGE TWP B","G-12":"PRECINCT BAINBRIDGE TWP G","D-55":"PRECINCT MUNSON TWP D", "A-52":"PRECINCT MUNSON TWP A", "C-39":"PRECINCT CLARIDON TWP C", "A-48":"PRECINCT MIDDLEFIELD TWP A"}
#Rows whose PRECINCT2 code is missing from the lookup keep the placeholder "na"
geauga["prec_id"] = "na"
geauga.loc[geauga['PRECINCT2'].isin(geauga_prec_dict.keys()), "prec_id"] = geauga["PRECINCT2"].map(geauga_prec_dict)
geauga["county"] = "GEAUGA"
#UNIQUE_ID is "<COUNTY>-<prec_id>"
geauga["UNIQUE_ID"] = geauga["county"]+"-"+geauga["prec_id"]
#Reproject to the CRS of the reused 2020 boundaries
geauga = geauga.to_crs(bound20_reused_counties_gdf.crs)


#Hamilton: Full Match
hamilton = gp.read_file("./raw-from-source/boundaries/hamilton_precincts_2022/PRECINT2021_0311.shp")
#PRC_NAME is used verbatim as the precinct identifier
hamilton = hamilton.assign(prec_id=hamilton["PRC_NAME"], county="HAMILTON")
hamilton["UNIQUE_ID"] = hamilton["county"] + "-" + hamilton["prec_id"]
hamilton = hamilton.to_crs(bound20_reused_counties_gdf.crs)


#Hardin: Full Match
hardin = gp.read_file("./raw-from-source/boundaries/hardin_precincts_2022/Hardin_Co_Precincts_2023-04.shp")
#Precinct identifier is the upper-cased "precinct" attribute
hardin = hardin.assign(prec_id=hardin["precinct"].str.upper(), county="HARDIN")
hardin["UNIQUE_ID"] = hardin["county"] + "-" + hardin["prec_id"]
hardin = hardin.to_crs(bound20_reused_counties_gdf.crs)


#Hocking: Full Match
hocking = gp.read_file("./raw-from-source/boundaries/hocking_precincts_2022/HOCKING_COUNTY_PRECINCTS_2022.shp")
hocking["prec_id"] = "PRECINCT " + hocking["PRECINCT"]
#Single spelling difference: "SALT CREEK" is written as one word in the target name
hocking_salt_creek_rows = hocking["PRECINCT"] == "SALT CREEK"
hocking.loc[hocking_salt_creek_rows, "prec_id"] = "PRECINCT SALTCREEK"
hocking["county"] = "HOCKING"
hocking["UNIQUE_ID"] = hocking["county"] + "-" + hocking["prec_id"]
hocking = hocking.to_crs(bound20_reused_counties_gdf.crs)


#Lake: Full Match
lake = gp.read_file("./raw-from-source/boundaries/lake_precincts_2021/Lake_County_Voting_Precincts_(2021).shp")
lake_upper_name = lake["NAME"].str.upper()
lake_mentor_on_rows = lake_upper_name.str.contains("MENTOR-ON")
lake["prec_id"] = lake_upper_name
#Every precinct except the MENTOR-ON ones gets the "PRECINCT " prefix
lake.loc[~lake_mentor_on_rows, "prec_id"] = "PRECINCT "+lake_upper_name
#MENTOR-ON precincts: no prefix, and "CITY " is inserted before the last two characters of the name
lake.loc[lake_mentor_on_rows, "prec_id"] = lake_upper_name.str.slice(stop=-2)+"CITY "+lake_upper_name.str.slice(start=-2)
lake["county"] = "LAKE"
lake["UNIQUE_ID"] = lake["county"]+"-"+lake["prec_id"]
lake = lake.to_crs(bound20_reused_counties_gdf.crs)


#Marion: Full Match
#Two source layers are combined: the consolidated county precincts (Aug 2022) and the city council wards (Dec 2022)
marion_aug=gp.read_file("./raw-from-source/boundaries/marion_precincts_2022/MARION_COUNTY_PRECINCTS_CONSOLIDATED_AUG2022.shp")
marion_dec = gp.read_file("./raw-from-source/boundaries/marion_precincts_2022/City_Council_Wards_December2022.shp")
#The ward layer has no PRCT_NAME; build it as "MARION <ward>-<precinct>"
marion_dec["PRCT_NAME"] = "MARION " + marion_dec["WARD_ID"].astype(str)+"-"+marion_dec["Prect"]
#Bring both layers into the 2020 boundary CRS BEFORE concatenating. Passing crs= to the GeoDataFrame
#constructor only labels the coordinates, it does not transform them, so the final to_crs below
#could never correct a layer that was read in a different CRS.
#A layer with no CRS at all is labelled rather than transformed, which is what the constructor did before.
marion_target_crs = bound20_reused_counties_gdf.crs
marion_layers = [
    layer.set_crs(marion_target_crs) if layer.crs is None else layer.to_crs(marion_target_crs)
    for layer in (marion_aug, marion_dec)
]
marion = gp.GeoDataFrame(pd.concat(marion_layers, ignore_index=True), crs=marion_target_crs)
marion["prec_id"] = "PRECINCT "+marion["PRCT_NAME"]
#Shapefile-derived names that differ from the election-result names
marion_dict = {'PRECINCT BIG ISLAND':'PRECINCT BIG ISLAND TWP','PRECINCT BOWLING GREEN':'PRECINCT BOWLING GREEN TWP',
               'PRECINCT CLARIDON A':'PRECINCT CLARIDON TWP A','PRECINCT CLARIDON B':'PRECINCT CLARIDON TWP B',
               'PRECINCT GRAND PRAIRIE':'PRECINCT GRAND PRAIRIE TWP','PRECINCT GRAND-SALT':'PRECINCT GRAND SALT',
               'PRECINCT GREEN CAMP':'PRECINCT GREEN CAMP TWP','PRECINCT MARION A':'PRECINCT MARION TWP A','PRECINCT MARION B':'PRECINCT MARION TWP B',
               'PRECINCT MARION C':'PRECINCT MARION TWP C','PRECINCT MARION D':'PRECINCT MARION TWP D','PRECINCT MARION E':'PRECINCT MARION TWP E',
               'PRECINCT MARION F':'PRECINCT MARION TWP F','PRECINCT MONTGOMERY':'PRECINCT MONTGOMERY TWP','PRECINCT PLEASANT A':'PRECINCT PLEASANT TWP A',
               'PRECINCT PLEASANT B':'PRECINCT PLEASANT TWP B','PRECINCT PLEASANT C':'PRECINCT PLEASANT TWP C','PRECINCT PLEASANT D':'PRECINCT PLEASANT TWP D',
               'PRECINCT PROSPECT':'PRECINCT PROSPECT TWP','PRECINCT RICHLAND A':'PRECINCT RICHLAND TWP A','PRECINCT RICHLAND B':'PRECINCT RICHLAND TWP B',
               'PRECINCT SCOTT-TULLY':'PRECINCT SCOTT TULLY','PRECINCT WALDO':'PRECINCT WALDO TWP'}
marion.loc[marion["prec_id"].isin(marion_dict), "prec_id"] = marion["prec_id"].map(marion_dict)
marion["county"] = "MARION"
marion["UNIQUE_ID"] = marion["county"]+"-"+marion["prec_id"]
#Already in the target CRS; kept so every county section ends with the same step
marion = marion.to_crs(bound20_reused_counties_gdf.crs)


#Mercer: Full Match
mercer = gp.read_file("./raw-from-source/boundaries/mercer_precincts_2023/2023_03_votingprecincts.shp")
#NAME is used verbatim as the precinct identifier
mercer = mercer.assign(prec_id=mercer["NAME"], county="MERCER")
mercer["UNIQUE_ID"] = mercer["county"] + "-" + mercer["prec_id"]
mercer = mercer.to_crs(bound20_reused_counties_gdf.crs)


#Muskingum: Full match
muskingum = gp.read_file("./raw-from-source/boundaries/muskingum_precincts_2022/VOTING_PRECINCTS.shp")
#Shapefile PRECINCT values that need a hand-written election-result name
muskingum_dict = {
    '1A': 'PRECINCT ZANESVILLE 1-A',
    '1B': 'PRECINCT ZANESVILLE 1-B',
    '2A': 'PRECINCT ZANESVILLE 2-A',
    '2B': 'PRECINCT ZANESVILLE 2-B',
    '3A': 'PRECINCT ZANESVILLE 3-A',
    '3B': 'PRECINCT ZANESVILLE 3-B',
    '4A': 'PRECINCT ZANESVILLE 4-A',
    '4B': 'PRECINCT ZANESVILLE 4-B',
    '5A': 'PRECINCT ZANESVILLE 5-A',
    '5B': 'PRECINCT ZANESVILLE 5-B',
    '5C': 'PRECINCT ZANESVILLE 5-C',
    '6A': 'PRECINCT ZANESVILLE 6-A',
    '6B': 'PRECINCT ZANESVILLE 6-B',
    '6C': 'PRECINCT ZANESVILLE 6-C',
    'NEWTON FULTONHAM': 'PRECINCT FULTONHAM',
    'NEWTON IRONSPT': 'PRECINCT IRONSPOT',
    'NEWTON MOXAHALA': 'PRECINCT MOXAHALA',
    'NEWTON ROLLING PLAINS': 'PRECINCT ROLLING PLAINS',
    'NEWTON WHITE COTTAGE': 'PRECINCT WHITE COTTAGE',
    'UNION UNION': 'PRECINCT UNION',
    'WAYNE DUNCAN FALLS': 'PRECINCT DUNCAN FALLS',
}
#Default rule first, then the overrides keyed on the raw PRECINCT value
muskingum["prec_id"] = "PRECINCT " + muskingum["PRECINCT"]
muskingum_override_rows = muskingum["PRECINCT"].isin(muskingum_dict.keys())
muskingum.loc[muskingum_override_rows, "prec_id"] = muskingum["PRECINCT"].map(muskingum_dict)
muskingum["county"] = "MUSKINGUM"
muskingum["UNIQUE_ID"] = muskingum["county"] + "-" + muskingum["prec_id"]
muskingum = muskingum.to_crs(bound20_reused_counties_gdf.crs)


#Tuscarawas: Full match
tuscarawas = gp.read_file("./raw-from-source/boundaries/tuscarawas_precincts_2020_2022/BOE_Precincts_2022/Tusc157_BOE_PctBnd.shp")
tuscarawas_dict_df = pd.read_csv("./raw-from-source/boundaries/tuscarawas_precincts_2020_2022/BOE_Precincts_2022/prec_matching.csv")
#Clean both matching columns the same way: drop apostrophes and commas, then drop the first character
tuscarawas_er_names = tuscarawas_dict_df["ER"].str.replace("'","").str.replace(",","").str.slice(start=1)
tuscarawas_pb_names = tuscarawas_dict_df["PB"].str.replace("'","").str.replace(",","").str.slice(start=1)
#Lookup: cleaned boundary (PB) name -> cleaned election-result (ER) name
tuscarawas_dict = pd.Series(tuscarawas_er_names.values, index = tuscarawas_pb_names).to_dict()
tuscarawas["prec_id"] = "PRECINCT "+tuscarawas["PRECT_NA"]
tuscarawas_override_rows = tuscarawas["prec_id"].isin(tuscarawas_dict.keys())
tuscarawas.loc[tuscarawas_override_rows, "prec_id"] = tuscarawas["prec_id"].map(tuscarawas_dict)
tuscarawas["county"] = "TUSCARAWAS"
tuscarawas["UNIQUE_ID"] = tuscarawas["county"]+"-"+tuscarawas["prec_id"]
tuscarawas = tuscarawas.to_crs(bound20_reused_counties_gdf.crs)


#Validate prec_id uniqueness and election-result matching for the counties built above
full_match_22_bound = [
    clark, columbiana, delaware, geauga, hamilton, hardin,
    hocking, lake, marion, mercer, muskingum, tuscarawas,
]
check_precid_uniqueness(full_match_22_bound)


#Brown: Full match
brown = gp.read_file("./raw-from-source/boundaries/brown_precincts_2023/VOTING-PRECINCT.shp") #do the overlap labels have overlapping shapes also?
brown['prec_id'] = brown['NAME']
brown["county"] = "BROWN"
#Shapefile NAME values whose election-result spelling differs
brown_renames = {
    "ABERDEEN": "ABERDEEN VILLAGE",
    "LEWIS HIGGINSPORT": "LEWIS/HIGGINSPORT",
    "MT ORAB STERLING": "MOUNT ORAB VILLAGE WEST",
    "PERRY LAKE LORELEI": "PERRY / LAKE LORELEI",
    "PERRY TWP VILLAGES": "PERRY TWP-VILLAGES",
    "SARDINIA": "SARDINIA VILLAGE",
}
brown_rename_rows = brown['NAME'].isin(brown_renames.keys())
brown.loc[brown_rename_rows, "prec_id"] = brown['NAME'].map(brown_renames)
brown["UNIQUE_ID"] = brown["county"] + "-" + brown['prec_id']
#Merge polygons that share a UNIQUE_ID into a single shape
brown = brown.dissolve(by = "UNIQUE_ID").reset_index()
brown = brown.to_crs(bound20_reused_counties_gdf.crs)


#Butler: Full match, combine prec
butler = gp.read_file("./raw-from-source/boundaries/butler_precincts_2023/2023Precincts.shp")
butler_dict_df = pd.read_csv("./raw-from-source/boundaries/butler_precincts_2023/prec_name_matching.csv")
#Expand the shapefile's abbreviations, in this order, on the upper-cased name
butler_expansions = [
    ("FFTWP", "FAIRFIELD TWP "),
    ("HANOVER", "HANOVER TWP "),
    ("LEMON", "LEMON TWP "),
    ("LIBERTY", "LIBERTY TWP "),
    ("WC", "WEST CHESTER TWP "),
]
butler_names = butler['NEW_PREC_N'].str.upper()
for butler_short, butler_long in butler_expansions:
    butler_names = butler_names.str.replace(butler_short, butler_long)
butler['prec_id'] = butler_names
#Clean both matching columns the same way: drop apostrophes and commas, then drop the first character
butler_er_names = butler_dict_df["ER"].str.replace("'","").str.replace(",","").str.slice(start=1)
butler_pb_names = butler_dict_df["PB"].str.replace("'","").str.replace(",","").str.slice(start=1)
butler_dict = pd.Series(butler_er_names.values, index = butler_pb_names.values).to_dict()
butler_override_rows = butler["prec_id"].isin(butler_dict.keys())
butler.loc[butler_override_rows, "prec_id"] = butler["prec_id"].map(butler_dict)
#Dissolve: polygons that now share a prec_id become one shape
butler = butler.dissolve(by="prec_id").reset_index()
butler["county"] = "BUTLER"
butler["UNIQUE_ID"] = butler["county"]+"-"+butler["prec_id"]
butler = butler.to_crs(bound20_reused_counties_gdf.crs)


#Clermont: Full match
clermont = gp.read_file("./raw-from-source/boundaries/clermont_precincts_2023/VotingPrecincts.shp")
#Upper-case the name and spell TWP out as TOWNSHIP
clermont_names = clermont["PRECINCT"].str.upper()
clermont['prec_id'] = clermont_names.str.replace("TWP","TOWNSHIP")
clermont['county'] = "CLERMONT"
clermont["UNIQUE_ID"] = clermont['county']+"-"+clermont["prec_id"]
#Merge polygons that share a UNIQUE_ID into a single shape
clermont = clermont.dissolve(by="UNIQUE_ID").reset_index()
clermont = clermont.to_crs(bound20_reused_counties_gdf.crs)


#Cuyahoga: Full match, combine prec
cuyahoga = gp.read_file("./raw-from-source/boundaries/cuyahoga_precincts_2022/Precincts May 2022_region.shp")
cuyahoga_dict_df = pd.read_csv("./raw-from-source/boundaries/cuyahoga_precincts_2022/prec_matching.csv")
#Clean both matching columns the same way: drop apostrophes and commas, then drop the first character
cuyahoga_er_names = cuyahoga_dict_df["ER"].str.replace("'","").str.replace(",","").str.slice(start=1)
cuyahoga_pb_names = cuyahoga_dict_df["PB"].str.replace("'","").str.replace(",","").str.slice(start=1)
cuyahoga_dict = pd.Series(cuyahoga_er_names.values, index=cuyahoga_pb_names).to_dict()
#prec_id is "<CITY>-<label without its last character, zero-padded to 2>-<last character of label>"
cuyahoga_label_head = cuyahoga['Label'].str.slice(stop=-1).str.zfill(2)
cuyahoga_label_tail = cuyahoga['Label'].str.slice(start=-1)
cuyahoga['prec_id'] = cuyahoga["City"].str.upper()+"-"+cuyahoga_label_head+"-"+cuyahoga_label_tail
cuyahoga_override_rows = cuyahoga["prec_id"].isin(cuyahoga_dict.keys())
cuyahoga.loc[cuyahoga_override_rows, "prec_id"] = cuyahoga["prec_id"].map(cuyahoga_dict)
#Restore the leading "B" on one truncated name
cuyahoga.loc[cuyahoga["prec_id"]=='EACHWOOD-00-E', "prec_id"] = 'BEACHWOOD-00-E'
#Buffer small area, otherwise dissolve fails
cuyahoga["geometry"] = cuyahoga["geometry"].buffer(0.01)
cuyahoga = cuyahoga.dissolve(by="prec_id").reset_index()
cuyahoga['county'] = "CUYAHOGA"
cuyahoga["UNIQUE_ID"] = cuyahoga["county"]+"-"+cuyahoga["prec_id"]
cuyahoga = cuyahoga.to_crs(bound20_reused_counties_gdf.crs)
#Diagnostics used while matching (left disabled):
#print("# prec in PB not in ER: ",len(set(cuyahoga["prec_id"])-set(er22["PRECNAME"][er22["County"]=='Cuyahoga'])))
#print("# prec in ER not in PB: ",len(set(er22["PRECNAME"][er22["County"]=='Cuyahoga'])-set(cuyahoga["prec_id"])))
#print("nunique pb: ", cuyahoga["prec_id"].nunique(),"\nshape PB: ", cuyahoga.shape)
#print("nunique er: ",er22["PRECNAME"][er22["County"]=='Cuyahoga'].nunique())


#Erie: Full match except  {'Bellevue City-Annexed-Refer to Board of Elections'}
erie = gp.read_file("./raw-from-source/boundaries/erie_precincts_2022/Erie_County_Voting_Precincts.shp")
#Start from the raw Precinct_I label; rows not covered by the lookup below keep it
erie["prec_id"] = erie["Precinct_I"]
#Hand-built lookup from the shapefile's Precinct_I label to the precinct name used in the election results
erie_prec_dict = {'Berlin Twp 1': 'PRECINCT BER TWP #1','Berlin Twp 2': 'PRECINCT BER TWP #2','Berlin Heights Village': 'PRECINCT BER VILL',
                  'Bay View Village': 'PRECINCT BV VILL','Castalia Village': 'PRECINCT CAST VILL','Florence Twp 1': 'PRECINCT FLO TWP #1',
                  'Florence Twp 2': 'PRECINCT FLO TWP #2','Groton Twp': 'PRECINCT GRO TWP','HUR-A': 'PRECINCT HUR A','HUR-B': 'PRECINCT HUR B',
                  'HUR-C': 'PRECINCT HUR C','HUR-D': 'PRECINCT HUR D','HUR-E': 'PRECINCT HUR E','HUR-F': 'PRECINCT HUR F','Huron Twp 1': 'PRECINCT HUR TWP #1',
                  'Huron Twp 2': 'PRECINCT HUR TWP #2','Huron Twp 3': 'PRECINCT HUR TWP #3','Kelleys Island Village': 'PRECINCT KI VILL',
                  'Margaretta Twp 1': 'PRECINCT MAR TWP #1','Margaretta Twp 2': 'PRECINCT MAR TWP #2','Margaretta Twp 3': 'PRECINCT MAR TWP #3',
                  'Margaretta Twp 4': 'PRECINCT MAR TWP #4','Milan Twp 1': 'PRECINCT MIL TWP #1','Milan Twp 2': 'PRECINCT MIL TWP #2',
                  'Milan Village': 'PRECINCT MIL VILL','Oxford Twp': 'PRECINCT OX TWP','Perkins Twp 1': 'PRECINCT PER TWP #1',
                  'Perkins Twp 10': 'PRECINCT PER TWP #10','Perkins Twp 2': 'PRECINCT PER TWP #2','Perkins Twp 3': 'PRECINCT PER TWP #3',
                  'Perkins Twp 4': 'PRECINCT PER TWP #4','Perkins Twp 5': 'PRECINCT PER TWP #5','Perkins Twp 6': 'PRECINCT PER TWP #6',
                  'Perkins Twp 7': 'PRECINCT PER TWP #7','Perkins Twp 8': 'PRECINCT PER TWP #8','Perkins Twp 9': 'PRECINCT PER TWP #9',
                  'SAN-A': 'PRECINCT SAN A','SAN-B': 'PRECINCT SAN B','SAN-C': 'PRECINCT SAN C','SAN-D': 'PRECINCT SAN D','SAN-E': 'PRECINCT SAN E',
                  'SAN-F': 'PRECINCT SAN F','SAN-G': 'PRECINCT SAN G','SAN-H': 'PRECINCT SAN H','SAN-I': 'PRECINCT SAN I','SAN-J': 'PRECINCT SAN J',
                  'SAN-K': 'PRECINCT SAN K','SAN-L': 'PRECINCT SAN L','SAN-M': 'PRECINCT SAN M','SAN-N': 'PRECINCT SAN N','SAN-O': 'PRECINCT SAN O',
                  'SAN-P': 'PRECINCT SAN P','Vermilion 1-A': 'PRECINCT VER 1-A','Vermilion 2-A': 'PRECINCT VER 2-A','Vermilion 2-B': 'PRECINCT VER 2-B',
                  'Vermilion 3-A': 'PRECINCT VER 3-A','Vermilion 3-B': 'PRECINCT VER 3-B','Vermilion Twp 1': 'PRECINCT VER TWP #1',
                  'Vermilion Twp 2': 'PRECINCT VER TWP #2','Vermilion Twp 3': 'PRECINCT VER TWP #3','Vermilion Twp 4': 'PRECINCT VER TWP #4',
                  'Vermilion Twp 5': 'PRECINCT VER TWP #5'}
#Apply the lookup to every row whose Precinct_I label is one of its keys
erie.loc[erie['Precinct_I'].isin(erie_prec_dict.keys()), "prec_id"] = erie["Precinct_I"].map(erie_prec_dict)
#Disabled plotting diagnostics from the original investigation of the Groton/Bellevue shapes:
#print("Not sure why the plot with everything is missing the shape in the bottom left present when plot without Groton...")
#erie[erie["Precinct_I"].str.contains("Groton")].plot()
#erie[~erie["Precinct_I"].str.contains("Groton")].plot()
#erie.plot()
#erie[erie["Precinct_I"].str.contains("Bellevue")].plot()
#erie[~erie["Precinct_I"].str.contains("Bellevue")].plot()
#Give the annexed Bellevue polygon Groton Twp's prec_id so the dissolve below merges the two shapes
erie.loc[erie["Precinct_I"]=="Bellevue City-Annexed-Refer to Board of Elections","prec_id"] = "PRECINCT GRO TWP"
erie = erie.dissolve(by="prec_id").reset_index()
#Standard formatting: county label, "<COUNTY>-<prec_id>" identifier, reprojection to the 2020 boundary CRS
erie["county"] = "ERIE"
erie["UNIQUE_ID"] = erie["county"]+"-"+erie["prec_id"]
erie = erie.to_crs(bound20_reused_counties_gdf.crs)


#Lorain: Full match
lorain = gp.read_file("./raw-from-source/boundaries/lorain_precincts_2023/full_lorain_county_06302023/full_lorain_county_06302023.shp")
lorain_dict_df = pd.read_csv("./raw-from-source/boundaries/lorain_precincts_2023/Lorain County Precinct Matching.xlsx - Lorain Co. Shapefile Precincts.csv")
#Lookup from the shapefile PRECINCT code to the SOS precinct name, taken from the matching sheet
lorain_dict = pd.Series(lorain_dict_df["SOS PRECINCT NAME"].values, index = lorain_dict_df["PRECINCT"]).to_dict()
#Entries set by hand on top of the matching sheet
lorain_dict.update({
    'LOC 2-F': 'PRECINCT LORAIN CITY 2-F',
    'LOC 8-G': 'PRECINCT LORAIN CITY 8-G',
    'LOC 8-H': 'PRECINCT LORAIN CITY 8-H',
    'NRC 1-F': 'PRECINCT N. RIDGEVILLE 1-F',
    'NRC 3-H': 'PRECINCT N. RIDGEVILLE 3-H',
})

lorain["prec_id"] = lorain["PRECINCT"].map(lorain_dict)
#Polygons sharing a prec_id are merged into one shape
lorain = lorain.dissolve(by="prec_id").reset_index()
lorain["county"] = "LORAIN"
lorain["UNIQUE_ID"] = lorain["county"]+"-"+lorain["prec_id"]
lorain = lorain.to_crs(bound20_reused_counties_gdf.crs)


#Lucas: Full match
lucas = gp.read_file("./raw-from-source/boundaries/lucas_precincts_2022/full_lucas_fixed/full_lucas_fixed.shp")
lucas_dict_df = pd.read_csv("./raw-from-source/boundaries/lucas_precincts_2022/Lucas_precinctMatching.csv")
#Where the sheet carries a mismatch comment, use its "mismatch W___P" code as the shapefile code
lucas_commented_rows = lucas_dict_df["mismatch comment"].notna()
lucas_dict_df.loc[lucas_commented_rows, "shapefile W___P"] = lucas_dict_df["mismatch W___P"]
#Lookup from shapefile W___P code to SoS precinct name
lucas_dict = lucas_dict_df.set_index("shapefile W___P")["SoS precinct Name"].to_dict()
lucas["prec_id"] = lucas["W___P"].map(lucas_dict)
#One code is set by hand
lucas.loc[lucas["W___P"]=="Mo10", "prec_id"] = "PRECINCT MONCLOVA 10"
lucas["county"] = "LUCAS"
#Polygons sharing a prec_id are merged into one shape
lucas = lucas.dissolve(by="prec_id").reset_index()
lucas["UNIQUE_ID"] = lucas["county"]+"-"+lucas["prec_id"]
lucas = lucas.to_crs(bound20_reused_counties_gdf.crs)


#Miami: Full match except precinct names are not unique - 87 prec names but 104 geometries...
miami = gp.read_file("./raw-from-source/boundaries/miami_precincts_2022/All_Precincts.shp")
miami["prec_id"] = "PRECINCT "+miami["PrecName"].str.upper()
#Dissolve first, so polygons sharing an upper-cased name become one shape
miami = miami.dissolve(by = "prec_id").reset_index()
#Then rename: upper-cased shapefile names whose election-result spelling differs
miami_dict = {
    'BROWN-FLETCHER': 'PRECINCT BROWN/FLETCHER',
    'LAURA WEST': 'PRECINCT LAURA',
    'LOST CREEK-CASSTOWN': 'PRECINCT LOSTCREEK/CASSTOWN',
    'POTSDAM WEST': 'PRECINCT POTSDAM',
    'SPRING CREEK EAST': 'PRECINCT SPRINGCREEK EAST',
    'SPRING CREEK WEST': 'PRECINCT SPRINGCREEK WEST',
    'TROY 1A': 'PRECINCT TROY 1-A',
    'TROY 1B': 'PRECINCT TROY 1-B',
    'TROY 1C': 'PRECINCT TROY 1-C',
    'TROY 1D': 'PRECINCT TROY 1-D',
    'TROY 2A': 'PRECINCT TROY 2-A',
    'TROY 2B': 'PRECINCT TROY 2-B',
    'TROY 2C': 'PRECINCT TROY 2-C',
    'TROY 3A': 'PRECINCT TROY 3-A',
    'TROY 3B': 'PRECINCT TROY 3-B',
    'TROY 3C': 'PRECINCT TROY 3-C',
    'TROY 3D': 'PRECINCT TROY 3-D',
    'TROY 4A': 'PRECINCT TROY 4-A',
    'TROY 4B': 'PRECINCT TROY 4-B',
    'TROY 4C': 'PRECINCT TROY 4-C',
    'TROY 5A': 'PRECINCT TROY 5-A',
    'TROY 5B': 'PRECINCT TROY 5-B',
    'TROY 5C': 'PRECINCT TROY 5-C',
    'TROY 6A': 'PRECINCT TROY 6-A',
    'TROY 6B': 'PRECINCT TROY 6-B',
    'TROY 6C': 'PRECINCT TROY 6-C',
    'TROY 6D': 'PRECINCT TROY 6-D',
}
miami_upper_name = miami["PrecName"].str.upper()
miami_rename_rows = miami_upper_name.isin(miami_dict.keys())
miami.loc[miami_rename_rows, "prec_id"] = miami_upper_name.map(miami_dict)
#Standard formatting
miami["county"] = "MIAMI"
miami["UNIQUE_ID"] = miami["county"]+"-"+miami["prec_id"]
miami = miami.to_crs(bound20_reused_counties_gdf.crs)


#Montgomery: Full match//more geometries than there are precinct names/in election results... 
montgomery = gp.read_file("./raw-from-source/boundaries/montgomery_precincts_2022/precinct_2022_polygon.shp")
#MATCH PREC NAMES
#Clean both matching columns the same way: drop apostrophes and commas, then drop the first character
montgomery_dict_df = pd.read_csv("./raw-from-source/boundaries/montgomery_precincts_2022/prec_name_matching.csv")
montgomery_dict = pd.Series(montgomery_dict_df["ER"].str.replace("'","").str.replace(",","").str.slice(start=1).values, index=montgomery_dict_df["PB"].str.replace("'","").str.replace(",","").str.slice(start=1)).to_dict()
#Set prec id: upper-case name with TOWNSHIP shortened to TWP
montgomery["prec_id"] = montgomery["VNAME"].str.upper().str.replace("TOWNSHIP", "TWP")
#Select the rows to rename with the SAME series that is mapped (the normalized prec_id).
#Selecting on raw VNAME while mapping prec_id could assign NaN to a row whose normalized name is not a key
#(dissolve then drops it), or skip a row whose normalized name is a key.
montgomery_rename_rows = montgomery["prec_id"].isin(montgomery_dict.keys())
montgomery.loc[montgomery_rename_rows, "prec_id"] = montgomery["prec_id"].map(montgomery_dict)
#Hand-written names for the three New Lebanon precincts
montgomery.loc[montgomery["prec_id"]=="JACKSON TWP_NEW LEBANON A","prec_id"] = 'JACK/NEW LEBANON-A'
montgomery.loc[montgomery["prec_id"]=="JACKSON TWP_NEW LEBANON B", "prec_id"] = 'JACK/NEW LEBANON-B'
montgomery.loc[montgomery["prec_id"]=="PERRY TWP_NEW LEBANON", "prec_id"] = 'PER/NEW LEBANON'
#DISSOLVE BY PREC
montgomery = montgomery.dissolve(by="prec_id").reset_index()
#STANDARD FORMATTING
montgomery["county"] = "MONTGOMERY"
montgomery["UNIQUE_ID"] = montgomery["county"]+"-"+montgomery["prec_id"]
montgomery = montgomery.to_crs(bound20_reused_counties_gdf.crs)


#Medina: full match, but more polygons than precincts in ER
medina = gp.read_file("./raw-from-source/boundaries/medina_precincts_2022/precincp22.shp")
#Match prec names: replacements applied to PREC_NAME one after another, in this order
medina_replacements = [
    ("TWP", "TP"),
    ("BRUNSWICK HILLS", "BRHILLS"),
    ("HARRISVILLE TP A", 'HARRISVL TP A'),
    ("MEDINA CITY 3C", "MEDINA CITY 3-C"),
    ("MONTVILLE TP I", "MONTVILLE I"),
    ('HARRISVILLE TWP A', "HARRISVL TP A"),
]
medina_names = medina["PREC_NAME"]
for medina_old, medina_new in medina_replacements:
    medina_names = medina_names.str.replace(medina_old, medina_new)
medina["prec_id"] = "PRECINCT "+medina_names
#The one name that keeps TWP in the election results
medina.loc[medina["PREC_NAME"]=="HOMER TP", "prec_id"] = 'PRECINCT HOMER TWP'
#Dissolve by prec: polygons sharing a prec_id become one shape
medina = medina.dissolve(by="prec_id").reset_index()
#Standard formatting
medina["county"] = "MEDINA"
medina["UNIQUE_ID"] = medina["county"]+"-"+medina["prec_id"]
medina = medina.to_crs(bound20_reused_counties_gdf.crs)


#Pickaway: Full match
#Not clear initially which file in the folder to use - the precincts file has 77 rows whereas ER has 43.
#Circleville must be combined with the rest of the county and the extra shapes dissolved.
pickaway_target_crs = bound20_reused_counties_gdf.crs
pickaway_not_clcvle = gp.read_file("./raw-from-source/boundaries/pickaway_precincts_2022/Precincts_Pickaway_Co_2022.shp").to_crs(pickaway_target_crs)
pickaway_clcvle = gp.read_file("./raw-from-source/boundaries/pickaway_precincts_2022/Precincts_Circlevile_2022.shp").to_crs(pickaway_target_crs)
#Not needed for matching but good for info on shapes
pickaway_dict_df = pd.read_csv("./raw-from-source/boundaries/pickaway_precincts_2022/Pickaway_precinctMatching.csv")
#Stack the two layers, keeping only the shared columns
pickaway_shared_columns = ["NAME","STATE_CODE","geometry"]
pickaway_stacked = pd.concat(
    [pickaway_not_clcvle[pickaway_shared_columns], pickaway_clcvle[pickaway_shared_columns]],
    ignore_index=True,
)
pickaway = gp.GeoDataFrame(pickaway_stacked, crs=pickaway_target_crs)
pickaway["prec_id"] = pickaway["NAME"]
#Checked by plotting that the rows with a missing NAME do not leave holes in the map:
#pickaway.loc[pickaway["NAME"].isna()].plot()
#pickaway.loc[~pickaway["NAME"].isna()].plot()
#pickaway.plot()
#Drop rows without a NAME, then dissolve by prec
pickaway = pickaway.loc[pickaway["NAME"].notna()]
pickaway = pickaway.dissolve(by="prec_id").reset_index().to_crs(pickaway_target_crs)
pickaway["county"] = "PICKAWAY"
pickaway["UNIQUE_ID"] = pickaway["county"]+"-"+pickaway["prec_id"]


#Stark: Full Match
#Andrew instructs to use json labeled Nov 2022 instead of "Voting_Precincts zip"
stark = gp.read_file("./raw-from-source/boundaries/stark_precincts_2022/Stark County Precincts (Nov 2022).json")
#Strip "TWP" from the name and collapse the double space that removal leaves behind
stark["prec_id"] = stark["NAME"].str.replace("TWP","").str.replace("  "," ")
#Precincts that reduce to CANTON 1 ... CANTON 7 keep their full original NAME instead
stark.loc[stark["prec_id"].isin([f"CANTON {ward}" for ward in range(1, 8)]), "prec_id"]= stark["NAME"]
stark.loc[stark["NAME"]=="MEYERS LAKE VILLAGE A","prec_id"]='MEYERS LAKE A'
#STANDARD FORMATTING, then one row per UNIQUE_ID in the shared CRS
stark["county"] = "STARK"
stark["UNIQUE_ID"] = stark["county"] + "-"+stark["prec_id"]
stark = stark.dissolve(by ="UNIQUE_ID").reset_index().to_crs(bound20_reused_counties_gdf.crs)

#Check if uniqueness/match fixed for every county that had more polygons than precincts
too_many_polygons = [
    brown, butler, clermont, cuyahoga, erie, miami,
    montgomery, medina, lorain, lucas, pickaway, stark,
]
check_precid_uniqueness(too_many_polygons)


#2020 Wood County from VEST (county FIPS 173 rows of the 2020 boundary file)
wood20 = bound20rdh[bound20rdh["COUNTYFP20"]=="173"]


#Get 2022 new precincts drawn in DRA - Draw new precincts in DRA using shapes provided by OE from Wood County
dra = gp.read_file("./dra-district-shapes/POLYGON.shp")
#Only the DRA shapes named '58' and '65' are the new precincts
wood_new22 = dra[(dra["NAME"]=='58')|(dra["NAME"]=='65')]
wood_new22 = wood_new22.to_crs(wood20.crs)
#Placeholder values, overwritten for both rows just below
wood_new22["UNIQUE_ID"] = 0
wood_new22["prec_code"] = 0
wood_new22.loc[wood_new22["NAME"]=="58", "prec_code"] = 'AEL'
wood_new22.loc[wood_new22["NAME"]=="65", "prec_code"] = 'AEM'
wood_new22.loc[wood_new22["NAME"]=="58", "UNIQUE_ID"] = 'WOOD-PERRYSBURG J'
wood_new22.loc[wood_new22["NAME"]=="65", "UNIQUE_ID"] = 'WOOD-PERRYSBURG S'


#Step 1: remove new wood prec area from 2020 wood prec
#NOTE(review): symmetric_difference keeps the parts of either layer not covered by the other, so any piece of a new
#precinct lying outside the 2020 shapes is kept as well; those rows have no UNIQUE_ID_1 and fall back to UNIQUE_ID_2
wood20_min_new22 = wood20.overlay(wood_new22, how="symmetric_difference")
wood20_min_new22["UNIQUE_ID"] = wood20_min_new22["UNIQUE_ID_1"]
wood20_min_new22.loc[wood20_min_new22["UNIQUE_ID_1"].isna(), "UNIQUE_ID"]=wood20_min_new22["UNIQUE_ID_2"]

#Step 2: add new prec to 2020 shape
wood22 = gp.GeoDataFrame(pd.concat([wood20_min_new22, wood_new22], ignore_index=True), crs = wood20.crs)


wood = wood22.copy()


#Translate precinct codes to SoS precinct names using the matching csv ("SoS 2022 code" -> "SoS name")
wood_dict_df = pd.read_csv("./raw-from-source/boundaries/wood_precincts_2022/Wood_precinctMatching.csv")
wood_dict = pd.Series(wood_dict_df["SoS name"].values, index=wood_dict_df["SoS 2022 code"]).to_dict()
#Rows without a prec_code fall back to their 2020 precinct code
wood.loc[wood["prec_code"].isna(), "prec_code"]= wood["PRECINCT20"]
wood["prec_id"] = wood["prec_code"].map(wood_dict)
wood["county"] = "WOOD"
#Rebuild UNIQUE_ID from the mapped name; this replaces the values assigned in the steps above
wood["UNIQUE_ID"] = wood["county"]+"-"+wood["prec_id"]
#wood.loc[wood["prec_id"].isna(), "prec_id"] = "PERRYSBURG S"
#Dissolve so every UNIQUE_ID is a single row, then move to the shared CRS
wood = wood.dissolve(by="UNIQUE_ID").reset_index().to_crs(bound20_reused_counties_gdf.crs)

check_precid_uniqueness([wood])


#Portage: precinct ids come from a matching csv keyed on the shapefile's NAME + "-" + PRECINCT
portage = gp.read_file("./raw-from-source/boundaries/portage_precincts_2022/precinct_2022.shp")
portage["prec_id"] = portage["NAME"]+"-"+portage["PRECINCT"]
portage_prec_dict_df = pd.read_csv("./raw-from-source/boundaries/portage_precincts_2022/Portage County Precinct Matching_2022.xlsx - Portage Co. Shapefile Precincts.csv")
portage_dict = pd.Series(portage_prec_dict_df["SoS Precinct Match"].values, index=portage_prec_dict_df["NAME"]+"-"+portage_prec_dict_df["PRECINCT"]).to_dict()
#Only rows whose NAME-PRECINCT key is in the csv are renamed; the rest are handled by the manual rules below
portage.loc[portage["prec_id"].isin(portage_dict.keys()), "prec_id"] = portage["prec_id"].map(portage_dict)
#Villages are assigned by NAME alone
portage.loc[portage["NAME"].str.upper()=='HIRAM', "prec_id"] = 'PRECINCT HIRAM VILLAGE'
portage.loc[portage["NAME"].str.upper()=='MANTUA', "prec_id"]= 'PRECINCT MANTUA VILLAGE'
portage.loc[portage["NAME"].str.upper()=='MOGADORE', "prec_id"] = 'PRECINCT MOGADORE VILLAGE'
#Rows with a null NAME are assigned to Sugar Bush Knolls
portage.loc[portage["NAME"].isna(), "prec_id"] = 'PRECINCT SUGAR BUSH KNOLLS'

#Combine Tallmadge multipolygon with Brimfield precincts A and E
tallmadge_multi = portage[portage["NAME"]=="Tallmadge"]
#explode() yields one row per polygon part; after reset_index() the part number is in "level_1"
tallmadge_multi=tallmadge_multi.explode(index_parts=True).reset_index()
#Part 0 goes to Brimfield A and part 1 to Brimfield E
#NOTE(review): this relies on the order of the parts inside the multipolygon - re-check if the source file changes
tallmadge_multi.loc[(tallmadge_multi["NAME"]=="Tallmadge")&(tallmadge_multi["level_1"]==0), "prec_id"] = "PRECINCT BRIMFIELD A"
tallmadge_multi.loc[(tallmadge_multi["NAME"]=="Tallmadge")&(tallmadge_multi["level_1"]==1), "prec_id"] = "PRECINCT BRIMFIELD E"
tallmadge_multi = tallmadge_multi.to_crs(bound20_reused_counties_gdf.crs)
#filter out what will be added back in
portage = portage[~(portage["NAME"]=="Tallmadge")].to_crs(bound20_reused_counties_gdf.crs)
#combine dfs back together
portage = gp.GeoDataFrame(pd.concat([portage, tallmadge_multi], ignore_index=True), crs=bound20_reused_counties_gdf.crs)

#Combine Ravenna null with Ravenna G in accordance with email exchanges
portage.loc[(portage["PRECINCT"].isna())&(portage["NAME"].str.upper().str.contains("RAVENNA")), "prec_id"] = 'PRECINCT RAVENNA TWP G'
#Assign ZZZ to match other 0-pop precs in the file
portage.loc[portage["PRECINCT"]=="Arsenal", "prec_id"] = "ZZZ"

#Dissolve: one row per prec_id (this is where the Tallmadge parts join the existing Brimfield A/E shapes)
portage = portage.dissolve(by="prec_id").reset_index()

#STANDARD FORMATTING
portage["county"] = "PORTAGE"
portage["UNIQUE_ID"] = portage["county"]+"-"+portage["prec_id"]
portage = portage.to_crs(bound20_reused_counties_gdf.crs)

#Check if uniqueness/match fixed: print the names found on only one side (ER-only first, then shapefile-only)
zzz_inc_county = [portage]
print(set(er22["PRECNAME"][er22["County"]=="Portage"])-set(portage["prec_id"]))
print(set(portage["prec_id"])-set(er22["PRECNAME"][er22["County"]=="Portage"]))


#Add columns to 2020 gdf to enable join
#Keep the original identifier in UNIQUE_ID_og and switch UNIQUE_ID to the code-based identifier
bound20_reused_counties_gdf["UNIQUE_ID_og"] = bound20_reused_counties_gdf["UNIQUE_ID"].copy()
bound20_reused_counties_gdf["UNIQUE_ID"] = bound20_reused_counties_gdf["UNIQUE_ID_code"]
#Drop the 2020 PORTAGE-ZZZ row; the 2022 portage gdf built above carries its own ZZZ shape
bound20_reused_counties_gdf=bound20_reused_counties_gdf[bound20_reused_counties_gdf["UNIQUE_ID"]!="PORTAGE-ZZZ"]
bound20_reused_counties_gdf["county"] = bound20_reused_counties_gdf["COUNTYNM"].str.upper()


#All county gdfs that received new 2022 shapes or name matching, in the order they were prepared
gdfs_2022_w_mods = [clark, columbiana, delaware, geauga, hamilton, hardin, hocking, lake, marion,mercer, muskingum, tuscarawas]+[brown, butler, clermont, cuyahoga, erie, miami, montgomery, medina, lorain, lucas, pickaway, stark]+[wood]+[portage]
print(len(gdfs_2022_w_mods))
#Waiting on manual mod for LORAIN

#Stack the modified counties with the reused 2020 counties and keep only the columns needed for the join
wip_full_match_list = gdfs_2022_w_mods+[bound20_reused_counties_gdf]
wip_full_match_gdf = gp.GeoDataFrame(pd.concat(wip_full_match_list, ignore_index=True), crs=bound20_reused_counties_gdf.crs)
wip_full_match_gdf = wip_full_match_gdf[["UNIQUE_ID", "prec_id", "county", "COUNTYFP20", "COUNTYNM", "geometry"]]

Compare all precinct data between 2020 and 2022:
2022 
ER unique_id nunique:  8933 
shape:  (8933,) 
prec name nunique:  8760 
prec name w county nunique 8933

2020 
PB NAME20 nunique:  8673 
shape: (8941, 8) 
prec name nunique:  1550 
prec name w county nunique 8941

Compare subset of counties that we expect to match between 2020 and 2022:
prec w/county diff: pb comp er len:  8
prec w/county diff: er comp pb len:  0
prec county diff: er comp pb names:  set()
prec county diff: pb comp er names:  {'ZZZerie', 'ZZZottawa', 'ZZZashtabula', 'ZZZportage', 'ZZZlucas', 'ZZZlake', 'ZZZcuyahoga', 'ZZZlorain'}

Compare post-merge:
shape merged gdf:  (4375, 269) 
unmatched shape:  (8, 269)
county from er not matching:  [nan]
county from bound not matching, not zzz:  []
Number of counties new to 2022//need precinct name matching:  26
number of counties in list:  12
check prec_id for:  CLARK
check prec_id for:  COLUMBIANA
check prec_id for:  DELAWARE
check prec_id for:  GEAUGA
check prec_id for:  HA

## Combine PB and ER

In [2]:
#Counties that reuse their 2020 shapes are keyed by COUNTY-PRECCODE; all other rows keep the UNIQUE_ID read from the csv
er22.loc[er22["County"].str.lower().isin(counties_reused22_set), "UNIQUE_ID"] = er22["County"].str.upper() + "-"+er22["PRECCODE"]
er22["UNIQUE_ID"] = er22["UNIQUE_ID"].str.upper()
er22["county"] = er22["County"].str.upper()

#Outer join so that boundary-only and results-only rows both survive; "_merge" records which side each row came from
wip_pber_merge = pd.merge(wip_full_match_gdf, er22, on = ["UNIQUE_ID", "county"], how = "outer", indicator=True)


print("merge unique_id count: ",wip_full_match_gdf["UNIQUE_ID"].nunique(),"\npb_updated unique_id null count: ",len(wip_full_match_gdf["UNIQUE_ID"][wip_full_match_gdf["UNIQUE_ID"].isna()]))
print("merge shape: ",wip_pber_merge.shape)
print("er22 unique_id count: ",er22["UNIQUE_ID"].nunique(),"\ner22 shape: ",er22.shape)


merge = wip_pber_merge.copy()
#Election result columns are the ones whose name starts with "G"
election_cols = sorted(list(merge.columns[merge.columns.str.startswith("G")]))
merge["COUNTYNM"] = merge["county"]
merge["COUNTYFP"] = merge["county"].str.lower().map(county_name_to_fips_dict)
#Split on the FIRST hyphen only: precinct names may contain hyphens themselves (e.g. 'JACK/NEW LEBANON-A',
#'PRECINCT MEDINA CITY 3-C'), and an unlimited split would cut them off at their own hyphen
merge["PRECINCT"] = merge["UNIQUE_ID"].str.split(pat="-", n=1).str[1]
#Rows without election results get 0 votes in every contest
merge[election_cols]=merge[election_cols].fillna(0)
#No election column may contain a null (.any().any(): fail if ANY column has ANY null)
assert not merge[election_cols].isna().any().any()

merge = merge[["UNIQUE_ID","COUNTYFP","COUNTYNM","PRECINCT","PRECCODE",]+election_cols+["geometry"]]


#Check that no null values and that UNIQUE precinct identifier is in fact unique
def check_unique_id_unique(merged_gdf):
    """Assert that the UNIQUE_ID column has no nulls and no repeated values.

    Args:
      merged_gdf: DataFrame with a "UNIQUE_ID" column

    Returns:
      The string "unique_id is unique" when both assertions pass

    Raises:
      AssertionError: if any UNIQUE_ID is null or the ids are not all distinct
    """
    ids = merged_gdf["UNIQUE_ID"]
    assert not ids.isna().any()
    assert ids.nunique() == merged_gdf.shape[0]
    return "unique_id is unique"


#State, County, Precinct total vote checks adapted from pdv checks: https://github.com/nonpartisan-redistricting-datahub/pdv-resources/blob/main/pdv_functions.py
def statewide_totals_check(partner_df, partner_name, source_df, source_name, column_list):
    """Compares the totals of two election result dataframes at the statewide total level

    Args:
      partner_df: DataFrame of election results we are comparing against
      partner_name: String of what to call the partner in the print statement
      source_df: DataFrame of election results we are comparing to
      source_name: String of what to call the source in the print statement
      column_list: List of races that there are votes for; each must be a column of both dataframes

    Returns:
      List of the races whose statewide totals differ, in the order they appear in column_list
    """
    print("***Statewide Totals Check***")
    diff_races=[]
    for race in column_list:
        # Sum each side once and reuse the totals in the comparison and the report
        partner_total = partner_df[race].sum()
        source_total = source_df[race].sum()
        difference = partner_total - source_total
        if difference != 0:
            if race not in diff_races:
                diff_races.append(race)
            print(race+" has a difference of "+str(difference)+" votes")
            print("\t"+ partner_name + ": "+str(partner_total)+" votes")
            print("\t"+ source_name +": "+str(source_total)+" votes")

    if not diff_races:
        print("All contests match statewide!")
    else:
        print("Contests with differences: ")
        # The header used to be printed with nothing after it; list the contests it announces
        print("\t" + ", ".join(str(race) for race in diff_races))

    return diff_races


def county_totals_check(partner_df, partner_name, source_df, source_name, column_list,county_col,full_print=False):
    """Compares the totals of two election result dataframes at the county level

    Args:
      partner_df: DataFrame of election results we are comparing against
      partner_name: String of what to call the partner in the print statement
      source_df: DataFrame of election results we are comparing to
      source_name: String of what to call the source in the print statement
      column_list: List of races that there are votes for
      county_col: String of the column name that contains county information
      full_print: Boolean specifying whether to also print the matching counties of a race that has differences

    Returns:
      List of the counties that differ in at least one race, in the order they were first found
    """

    print("\n***Countywide Totals Check***")
    diff_counties=[]
    for race in column_list:
        # Aggregate only the race being compared. Summing the whole frame per race was slow and
        # breaks on non-numeric columns (e.g. geometry) in current pandas.
        partner_totals = partner_df.groupby(county_col)[race].sum()
        source_totals = source_df.groupby(county_col)[race].sum()
        diff = partner_totals - source_totals
        # A county present on one side only yields NaN here, which also counts as a difference
        differing = diff[diff != 0]
        for county in differing.index:
            if county not in diff_counties:
                diff_counties.append(county)
        if len(differing) > 0:
            print(race + " contains differences in these counties:")
            for county, delta in differing.items():
                print("\t"+county+" has a difference of "+str(delta)+" votes")
                # .get() so a county missing from one side is reported as nan instead of raising KeyError
                print("\t\t"+ partner_name + ": "+str(partner_totals.get(county, float("nan")))+" votes")
                print("\t\t"+ source_name +": "+str(source_totals.get(county, float("nan")))+" votes")
            if full_print:
                for county in diff[diff == 0].index:
                    print("\t"+county + ": "+ str(partner_totals.loc[county])+" votes")
        # Races that match in every county are intentionally not printed
    if not diff_counties:
        print("All contests in all counties match!")
    else:
        print("Counties with differences: ")
        # The header used to be printed with nothing after it; list the counties it announces
        print("\t" + ", ".join(str(county) for county in diff_counties))

    return diff_counties
        
    
def precinct_votes_check(merged_df,column_list,vest_on_left,name_col,print_level=0):
    """Compare two sets of election results row by row in an already merged dataframe

    For every race in column_list the dataframe must hold a "<race>_x" (left) and a "<race>_y" (right) column.

    Args:
      merged_df: DataFrame with one set of election results joined to another
      column_list: List of races that there are votes for
      vest_on_left: Boolean specifying whether VEST data is on the left side of merged_df (only changes the (V)/(S) labels)
      name_col: String of the column name to refer to precincts when a difference occurs
      print_level: Integer that specifies how large the vote difference in a precinct must be to be printed

    Returns:
      Sorted list of the name_col values of rows with at least one differing race
    """
    print("\n***Precinct Totals Check***")
    ordered = merged_df.sort_values(by=[name_col])
    # Label for the left column and label prefix for the right column
    if vest_on_left:
        left_tag, right_tag = "(V)", " (S)"
    else:
        left_tag, right_tag = "(S)", " (V)"

    diff_list = []
    diff_values = []
    matching_rows = 0
    max_diff = 0
    for _, rec in ordered.iterrows():
        row_matches = True
        for contest in column_list:
            left_val = rec[contest + "_x"]
            right_val = rec[contest + "_y"]

            # Missing values are only reported here; the comparison below still runs on them
            if left_val is None or right_val is None or np.isnan(right_val) or np.isnan(left_val):
                print("FIX NaN value at: ", rec[name_col])

            gap = abs(left_val - right_val)
            if gap > 0:
                row_matches = False
                diff_values.append(abs(gap))
                if gap > max_diff:
                    max_diff = gap
            if gap > print_level:
                print(contest, "{:.>72}".format(rec[name_col]), left_tag,"{:.>5}".format(int(left_val)),right_tag + "{:.>5}".format(int(right_val)),"(D):{:>5}".format(int(left_val-right_val)))
        if row_matches:
            matching_rows += 1
        else:
            diff_list.append(rec[name_col])

    print("\nThere are ", len(merged_df.index)," total rows")
    print(matching_rows," of these rows are the same")

    if diff_values:
        print("\nAll precincts containing differences:")
        print("The average difference is: ", str(sum(diff_values)/len(diff_values)))
        print("\nThe max difference between any one shared column in a row is: ", max_diff)
        count_big_diff = sum(1 for value in diff_values if value > 10)
        print("There are ", str(count_big_diff), "precinct results with a difference greater than 10")

    diff_list.sort()

    return diff_list


def run_all_checks(partner_df, partner_name, source_df, source_name, column_list,county_col,full_print=False, prec_check=True):
    """Run the unique-id, statewide, countywide and (optionally) precinct-level checks in one call

    Args:
      partner_df: DataFrame of election results we are comparing against
      partner_name: String of what to call the partner in the print statements
      source_df: DataFrame of election results we are comparing to; its UNIQUE_ID column must be unique and non-null
      source_name: String of what to call the source in the print statements
      column_list: List of races that there are votes for
      county_col: String of the column name that contains county information
      full_print: Boolean passed through to county_totals_check
      prec_check: Boolean specifying whether to run the precinct-level comparison

    Returns:
      None; every check reports by printing

    Raises:
      AssertionError: if source_df fails the unique-id check
    """
    print(check_unique_id_unique(source_df))
    #All matches statewide and county levels
    statewide_totals_check(partner_df, partner_name, source_df, source_name, column_list)
    # Honour the caller's full_print (it used to be hard-coded to False here)
    county_totals_check(partner_df, partner_name, source_df, source_name, column_list,county_col,full_print=full_print)
    if prec_check:
        #Running inner join because of expected nan value for ZZZ precincts
        # source_df columns get the _x suffix and partner_df columns the _y suffix
        merged_df = pd.merge(source_df, partner_df, on = ["UNIQUE_ID"], how = "inner", indicator=True)
        vest_on_left = False
        name_col = "UNIQUE_ID"
        precinct_votes_check(merged_df,column_list,vest_on_left,name_col,print_level=0)
    
    
#Compare the merged boundary + results file against the original election results
partner_df = er22.copy()
#Give the original results the same upper-case county column the merged file uses
partner_df["COUNTYNM"] = partner_df["County"].str.upper()
partner_name = "original ER 22"
source_df = merge.copy()
#NOTE(review): source_name is set here but the call below passes the literal "merge gdf" instead
source_name = "PBER 22"
county_col = "COUNTYNM"
#---
print("\nchecking statewide gdf:")
#Every column starting with "G" is treated as a race to compare
run_all_checks(partner_df, partner_name, source_df, "merge gdf", list(source_df.columns[source_df.columns.str.startswith("G")]),county_col,full_print=False, prec_check=True)
#-----
#-----

merge unique_id count:  8941 
pb_updated unique_id null count:  0
merge shape:  (8941, 265)
er22 unique_id count:  8933 
er22 shape:  (8933, 260)

checking statewide gdf:
unique_id is unique
***Statewide Totals Check***
All contests match statewide!

***Countywide Totals Check***
All contests in all counties match!

***Precinct Totals Check***

There are  8933  total rows
8933  of these rows are the same


## Split precinct boundaries by district

In [3]:
#Load in district boundary files, reprojected to the CRS of the merged precinct file
cong_shp = gp.read_file("./raw-from-source/2022_districts/oh_cong_adopted_2022/oh_cong_adopted_2022.shp").to_crs(merge.crs)
#Move the district id to CONG_DIST_2 so the name CONG_DIST is free for the precinct-side column created later
cong_shp["CONG_DIST_2"] = cong_shp["CONG_DIST"].copy()
cong_shp = cong_shp.drop("CONG_DIST", axis=1)
sl_shp = gp.read_file("./raw-from-source/2022_districts/oh_sldl_adopted_2022/FEB_24_2022_HD_SHP.shp").to_crs(merge.crs)
su_shp = gp.read_file("./raw-from-source/2022_districts/oh_sldu_adopted_2022/FEB_24_2022_SD_SHP.shp").to_crs(merge.crs)


#Create gdfs with appropriate elections
#One gdf per district type: the id columns, that type's contest columns (GCON / GSL / GSU prefix) and the geometry.
#reset_index() without drop=True also keeps the previous index as an "index" column.
merge_for_splits = merge.copy()
merge_cong = merge_for_splits[['UNIQUE_ID', 'PRECINCT', 'PRECCODE', 'COUNTYNM','COUNTYFP']+list(merge_for_splits.columns[merge_for_splits.columns.str.startswith("GCON")])+['geometry']].reset_index()
merge_sl = merge_for_splits[['UNIQUE_ID', 'PRECINCT', 'PRECCODE', 'COUNTYNM','COUNTYFP']+list(merge_for_splits.columns[merge_for_splits.columns.str.startswith("GSL")])+['geometry']].reset_index()
merge_su = merge_for_splits[['UNIQUE_ID', 'PRECINCT', 'PRECCODE', 'COUNTYNM','COUNTYFP']+list(merge_for_splits.columns[merge_for_splits.columns.str.startswith("GSU")])+['geometry']].reset_index()
#All layers must share one CRS before they are overlaid
assert cong_shp.crs == sl_shp.crs == su_shp.crs == merge.crs

In [4]:
#Find splits
def get_contest_dist_dict(df, contest):
    """Map every contest column of df to the district code embedded in its name.

    The district code is read from a fixed position after the prefix:
    characters 4-5 for "GCON", 3-5 for "GSL" and 3-4 for "GSU".

    Args:
      df: DataFrame whose column names include the contest columns
      contest: Column-name prefix, one of "GCON", "GSL" or "GSU"

    Returns:
      dict of {column name: district code string} for the columns starting with contest

    Raises:
      ValueError: if contest is not one of the supported prefixes
        (previously this surfaced as an UnboundLocalError on the return line)
    """
    # (start, stop) slice of the district code within the column name, per contest prefix
    district_slices = {"GCON": (4, 6), "GSL": (3, 6), "GSU": (3, 5)}
    if contest not in district_slices:
        raise ValueError("contest must be one of " + ", ".join(district_slices) + "; got " + repr(contest))
    start, stop = district_slices[contest]

    col_to_dist_dict = {
        col: col[start:stop]
        for col in df.columns
        if isinstance(col, str) and col.startswith(contest)
    }

    return col_to_dist_dict


def get_prec_to_dist_list_dict(df, contest):
    """List, for every precinct, the districts in which it recorded votes for the given contest type.

    A precinct counts as voting in a district when any contest column of that district is > 0.
    The result is written to df["DIST"] (in place) and returned keyed by UNIQUE_ID.

    Previously the label was built by repeated string concatenation on a column initialised to the
    integer 0, which repeated district codes (a single-district precinct came out as "01, 01") and
    failed when the first contest column had no positive votes. Each district is now listed once,
    in column order.

    Args:
      df: DataFrame with a "UNIQUE_ID" column and the contest columns; gains/overwrites a "DIST" column
      contest: Column-name prefix understood by get_contest_dist_dict ("GCON", "GSL" or "GSU")

    Returns:
      Series indexed by UNIQUE_ID whose values are 0 for precincts without votes, or the district
      codes joined by ", " (e.g. "01" or "01, 02")
    """
    col_dist_dict = get_contest_dist_dict(df, contest)
    contest_cols = list(col_dist_dict)
    # Boolean matrix: row = precinct, column = contest column, True where votes were recorded
    has_votes = df[contest_cols].gt(0).to_numpy()

    labels = []
    for row_flags in has_votes:
        # dict.fromkeys keeps first-seen order while dropping repeated districts
        districts = dict.fromkeys(
            col_dist_dict[col] for col, voted in zip(contest_cols, row_flags) if voted
        )
        labels.append(", ".join(districts) if districts else 0)
    df["DIST"] = labels

    dist_list_dict = pd.Series(df["DIST"].values, index=df["UNIQUE_ID"])

    return dist_list_dict


def get_df_with_split_prec_indicator(df, contest):
    """Flag precincts that recorded votes in more than one district of the given contest type.

    Adds these columns to df (in place):
      DIST: raw label from get_prec_to_dist_list_dict (0, or district codes joined by ", ")
      DIST_index: row position 0..n-1
      DIST_len_assignment: number of distinct districts (0 when the precinct has no votes)
      DIST_set: 0 when there are no votes, the district code when there is exactly one,
        otherwise the list of distinct district codes in first-seen order

    The previous row loop wrote DIST_index through chained assignment (df[col].loc[i] = ...),
    which is not guaranteed to reach df and does nothing under pandas copy-on-write. It also
    required the index to be exactly 0..n-1 and produced multi-district lists in arbitrary set
    order. The columns are now built in one pass over the DIST values.

    Args:
      df: DataFrame with a "UNIQUE_ID" column and the contest columns
      contest: Column-name prefix ("GCON", "GSL" or "GSU")

    Returns:
      Tuple (df, prec_to_dist_dict) where prec_to_dist_dict is a Series of DIST_set values indexed by UNIQUE_ID
    """
    t = time.process_time()

    df["DIST"] = df["UNIQUE_ID"].map(get_prec_to_dist_list_dict(df, contest))

    dist_sets = []
    dist_lens = []
    for raw in df["DIST"]:
        if not isinstance(raw, str):
            # 0 (or a missing value) means no votes were recorded in any district
            dist_sets.append(0)
            dist_lens.append(0)
            continue
        # Drop repeated codes but keep the order in which they appear
        districts = list(dict.fromkeys(raw.split(", ")))
        dist_lens.append(len(districts))
        dist_sets.append(districts[0] if len(districts) == 1 else districts)

    df["DIST_index"] = range(len(df))
    df["DIST_len_assignment"] = dist_lens
    # dtype=object keeps list entries as single cells
    df["DIST_set"] = pd.Series(dist_sets, index=df.index, dtype=object)
    prec_to_dist_dict = pd.Series(df["DIST_set"].values, index = df["UNIQUE_ID"])

    elapsed_time = time.process_time() - t
    print("time to run fxn: ", elapsed_time)

    return df, prec_to_dist_dict


#Grab splits
#For each district type: flag the split precincts, copy the per-precinct district assignment into a
#named column, and collect {UNIQUE_ID: district list} for the precincts assigned to more than one district
cong_pber = merge_cong.copy()
cong_pber_splits = get_df_with_split_prec_indicator(cong_pber, "GCON")
cong_pber_splits_gdf = cong_pber_splits[0]
cong_pber_splits_gdf["CONG_DIST"] = cong_pber_splits_gdf["UNIQUE_ID"].map(cong_pber_splits[1])
cong_splits_dict = cong_pber_splits_gdf.loc[cong_pber_splits_gdf["DIST_len_assignment"]>1].set_index("UNIQUE_ID")["CONG_DIST"].to_dict()


#State house (merge_sl is passed directly, so it receives the new columns itself)
sl_pber_splits = get_df_with_split_prec_indicator(merge_sl, "GSL")
sl_pber_splits_gdf = sl_pber_splits[0]
sl_pber_splits_gdf["SL_DIST"] = sl_pber_splits_gdf["UNIQUE_ID"].map(sl_pber_splits[1])
sl_splits_dict = sl_pber_splits_gdf.loc[sl_pber_splits_gdf["DIST_len_assignment"]>1].set_index("UNIQUE_ID")["SL_DIST"].to_dict()


#State senate (merge_su is passed directly as well)
su_pber_splits = get_df_with_split_prec_indicator(merge_su, "GSU")
su_pber_splits_gdf = su_pber_splits[0]
su_pber_splits_gdf["SU_DIST"] = su_pber_splits_gdf["UNIQUE_ID"].map(su_pber_splits[1])
su_splits_dict = su_pber_splits_gdf.loc[su_pber_splits_gdf["DIST_len_assignment"]>1].set_index("UNIQUE_ID")["SU_DIST"].to_dict()


time to run fxn:  0.5917090000000016
time to run fxn:  0.8777309999999972
time to run fxn:  0.6066339999999997


In [8]:
#Create gdfs with splits
#generates dataframes with the "lost votes" from splits
def generate_differences_df(df_compare_against, df_compare_to, unique_ID_col, races_list, drop_empty = False):
    """Subtract per-ID vote totals of one dataframe from those of another.

    Both inputs are reduced to the ID column plus races_list and summed per ID, then
    df_compare_to's totals are subtracted from df_compare_against's. IDs present on one
    side only come out as NaN.

    Args:
      df_compare_against: DataFrame whose totals are the minuend
      df_compare_to: DataFrame whose totals are subtracted
      unique_ID_col: String name of the ID column to group by
      races_list: List of vote columns to compare
      drop_empty: Boolean; when True, rows that are all zero and then columns that are all zero are removed

    Returns:
      DataFrame indexed by unique_ID_col with one column per race plus "Tot_Votes" (row sum of the differences)
    """
    wanted = [unique_ID_col] + races_list
    against_totals = df_compare_against[wanted].groupby(unique_ID_col).sum()
    to_totals = df_compare_to[wanted].groupby(unique_ID_col).sum()

    diffs = against_totals - to_totals
    diffs["Tot_Votes"] = diffs[races_list].sum(axis=1)

    if not drop_empty:
        return diffs

    # Keep only rows and columns holding at least one non-zero entry; the dropped rows are
    # all zero, so they cannot change which columns survive
    nonzero = diffs != 0
    return diffs.loc[nonzero.any(axis=1), nonzero.any(axis=0)]


def district_splits_comb(level, splits_dict, elections_gdf, district_gdf, unique_ID_col, district_gdf_ID, races_list, elections_gdf_dist_ID, fill_level = 2):
    '''
    Function to split precincts across districts that splits a precinct across the entire district map.
    Previous iterations of this code only split precincts by the districts in which votes were recorded.
    In some instances, that led to holes in the map, due to districts where no votes were recorded in a precinct, but where an intersection occurred.

    Args:
      level: String placed in the new unique ID suffix, which has the form "-(<level>-<district>)"
      splits_dict: Dict whose keys are the unique IDs of the precincts to split (only the keys are used)
      elections_gdf: GeoDataFrame of precincts with election results
      district_gdf: GeoDataFrame of district shapes
      unique_ID_col: String name of the precinct ID column in elections_gdf
      district_gdf_ID: String name of the district ID column in district_gdf; values must be strings (zfill is applied)
      races_list: List of vote columns to be zeroed where they do not belong to a piece's district
      elections_gdf_dist_ID: String name of the district assignment column in elections_gdf
      fill_level: Integer width the district ID is zero-padded to

    Returns:
      Tuple (elections_gdf, lost_votes_df): the precinct file with split precincts replaced by their
      per-district pieces, and the votes of the split precincts that no piece kept (see generate_differences_df)
    '''
    t = time.process_time()
    # Separate the precincts that need splitting from the ones kept as they are
    need_splits = elections_gdf[elections_gdf[unique_ID_col].isin(list(splits_dict.keys()))]
    others = elections_gdf[~elections_gdf[unique_ID_col].isin(list(splits_dict.keys()))]
    
    # Untouched copy of the precincts to split, used at the end to measure lost votes
    pre_splits_copy = need_splits.copy(deep = True)
    
    # Intersect the precincts to split with the district gdf
    test_join = gp.overlay(need_splits, district_gdf, how = "intersection")
    
    # Assign a district column, using the district shapefile
    test_join[elections_gdf_dist_ID] = test_join[district_gdf_ID]
    
    # Work on a copy so test_join keeps the unmodified intersection
    clean_votes = test_join.copy(deep = True)
    
    clean_votes[unique_ID_col+"_new"] = clean_votes[unique_ID_col]
    
    # For every piece: build its new unique ID and zero out the races of other districts
    for index, row in clean_votes.iterrows():
        clean_votes.at[index, unique_ID_col+"_new"] = row[unique_ID_col]+"-("+level + "-" + row[district_gdf_ID].zfill(fill_level) + ")" 
        for column in test_join:
            # A race belongs to the piece's district when the zero-padded district ID appears in its column name
            if column in races_list and row[elections_gdf_dist_ID].zfill(fill_level) not in column:
                clean_votes.at[index, column] = 0 
        
    # Votes the original precincts had that none of their pieces kept
    lost_votes_df = generate_differences_df(pre_splits_copy, clean_votes, unique_ID_col, races_list, True)
    
    # Swap in the new unique IDs and align the columns with the unsplit precincts
    clean_votes.drop(unique_ID_col, axis = 1, inplace = True)
    clean_votes.rename(columns = {unique_ID_col+"_new":unique_ID_col}, inplace = True)
    clean_votes = clean_votes[list(others.columns)]
        
    # Split pieces first, then the untouched precincts
    elections_gdf = gp.GeoDataFrame(pd.concat([clean_votes, others]), crs = elections_gdf.crs)
    elections_gdf.reset_index(drop=True,inplace=True)
    
    elapsed_time = time.process_time() - t
    print("time to run fxn: ", elapsed_time)
    
    return elections_gdf, lost_votes_df



def clean_na_dist_assignments(elections_gdf, district_gdf, unique_ID_col, elections_gdf_dist_ID, district_gdf_ID):
    """Give every precinct whose district assignment is 0 the district it overlaps most.

    Args:
      elections_gdf: GeoDataFrame of precincts; rows with elections_gdf_dist_ID == 0 are the unassigned ones
      district_gdf: GeoDataFrame of district shapes
      unique_ID_col: String name of the precinct ID column
      elections_gdf_dist_ID: String name of the district assignment column in elections_gdf
      district_gdf_ID: String name of the district ID column in district_gdf

    Returns:
      elections_gdf itself when nothing is unassigned (no timing line is printed in that case);
      otherwise a copy in the original CRS with the assignments filled in
    """
    t = time.process_time()
    if not (elections_gdf[elections_gdf_dist_ID]==0).any():
        return elections_gdf

    original_crs = elections_gdf.crs
    # Areas are compared in EPSG:3857
    projected_precincts = elections_gdf.to_crs(3857)
    projected_districts = district_gdf.to_crs(3857)

    unassigned = projected_precincts[projected_precincts[elections_gdf_dist_ID]==0]
    dist_clean = gp.overlay(unassigned, projected_districts, how = "intersection")
    dist_clean['area'] = dist_clean.area

    # For each unassigned precinct, the district of its largest intersection piece
    na_assignment_dict = {
        prec: dist_clean.loc[dist_clean[unique_ID_col] == prec].nlargest(1, 'area')[district_gdf_ID].values[0]
        for prec in dist_clean[unique_ID_col].unique()
    }

    # Precincts absent from the dict keep the assignment they already had
    filled = projected_precincts[unique_ID_col].map(na_assignment_dict).fillna(projected_precincts[elections_gdf_dist_ID])
    projected_precincts[elections_gdf_dist_ID] = filled
    restored = projected_precincts.to_crs(original_crs)

    elapsed_time = time.process_time() - t
    print("time to run fxn: ", elapsed_time)

    return restored


#Create gdfs with split geometries
#Each call returns (precinct gdf with split pieces, dataframe of votes lost in the split)
cong_splits = district_splits_comb(
    "CON", cong_splits_dict, cong_pber_splits_gdf, cong_shp, "UNIQUE_ID", "CONG_DIST_2",
    list(merge.columns[merge.columns.str.contains("GCON")]), "CONG_DIST", fill_level = 2)
cong_elections_gdf, cong_lost_votes = cong_splits

sl_dist_splits = district_splits_comb(
    "SL", sl_splits_dict, sl_pber_splits_gdf, sl_shp, "UNIQUE_ID", "DISTRICT",
    list(merge.columns[merge.columns.str.contains("GSL")]), "SL_DIST", fill_level = 3)
sl_elections_gdf, sl_lost_votes_gdf = sl_dist_splits

#Not all SLDU districts held races, so it makes sense that some did not get a district assignment
su_dist_splits = district_splits_comb(
    "SU", su_splits_dict, su_pber_splits_gdf, su_shp, "UNIQUE_ID", "DISTRICT",
    list(merge.columns[merge.columns.str.contains("GSU")]), "SU_DIST", fill_level = 2)
su_elections_gdf, su_lost_votes_gdf = su_dist_splits

time to run fxn:  0.18985299999999938
time to run fxn:  0.870508000000001
time to run fxn:  0.1820589999999953


In [9]:
#Note: the SL lost votes are not empty - districts 55 and 56 are assigned to the same precinct, so it was not split.
#The solution is to provide a file with no splits as well.


#Clean NA DIST assignments: precincts whose district assignment is still 0 get the district they overlap most
#NOTE(review): an earlier note here said CONG and SU are fully assigned and come back unchanged, but the recorded
#output shows three timing lines, and the function only prints one when it found rows to fill - confirm
cleaned_cong_gdf = clean_na_dist_assignments(cong_elections_gdf, cong_shp, "UNIQUE_ID", "CONG_DIST", "CONG_DIST_2")
cleaned_sl_gdf = clean_na_dist_assignments(sl_elections_gdf, sl_shp, "UNIQUE_ID", "SL_DIST", "DISTRICT")
cleaned_su_gdf = clean_na_dist_assignments(su_elections_gdf, su_shp, "UNIQUE_ID", "SU_DIST", "DISTRICT")


#Check results for split files against the original election results (every "G" column is compared)
print("\nchecking CONG w splits gdf:")
run_all_checks(partner_df, partner_name, cleaned_cong_gdf, "cong split", list(cleaned_cong_gdf.columns[cleaned_cong_gdf.columns.str.startswith("G")]),county_col,full_print=False, prec_check=True)
print("\nchecking SLDL w splits gdf:   - expect mismatch for dist 55 contests as identified above")
run_all_checks(partner_df, partner_name, cleaned_sl_gdf, "sl split", list(cleaned_sl_gdf.columns[cleaned_sl_gdf.columns.str.startswith("G")]),county_col,full_print=False, prec_check=True)
print("\nchecking SLDU w splits gdf")
run_all_checks(partner_df, partner_name, cleaned_su_gdf, "su split", list(cleaned_su_gdf.columns[cleaned_su_gdf.columns.str.startswith("G")]),county_col,full_print=False, prec_check=True)

time to run fxn:  5.059994000000003
time to run fxn:  5.480855000000005
time to run fxn:  17.569657999999997

checking CONG w splits gdf:
unique_id is unique
***Statewide Totals Check***
All contests match statewide!

***Countywide Totals Check***
All contests in all counties match!

***Precinct Totals Check***

There are  8898  total rows
8898  of these rows are the same

checking SLDL w splits gdf:   - expect mismatch for dist 55 contests as identified above
unique_id is unique
***Statewide Totals Check***
GSL055DZOR has a difference of 15.0 votes
	original ER 22: 13167 votes
	sl split: 13152.0 votes
GSL055RLIP has a difference of 58.0 votes
	original ER 22: 39170 votes
	sl split: 39112.0 votes
Contests with differences: 

***Countywide Totals Check***
GSL055DZOR contains differences in these counties:
	WARREN has a difference of 15.0 votes
		original ER 22: 13167 votes
		sl split: 13152.0 votes
GSL055RLIP contains differences in these counties:
	WARREN has a difference of 58.0 votes

## Finalize formatting for PBER GDFs

In [10]:
#Every output file below gets VTDST22 = COUNTYFP + PRECCODE and the same column order:
#identifiers first, then the "G" contest columns sorted by name, then the geometry

#Unsplit file, all elections
merge_all = merge[['UNIQUE_ID', 'PRECINCT', 'PRECCODE', 'COUNTYNM','COUNTYFP']+list(merge.columns[merge.columns.str.startswith("G")])+['geometry']].reset_index()
merge_all["VTDST22"] = merge_all["COUNTYFP"]+merge_all["PRECCODE"]
all_gdf = merge_all[['UNIQUE_ID', "VTDST22",'COUNTYFP', 'COUNTYNM', 'PRECINCT', 'PRECCODE']+sorted(merge_all.columns[merge_all.columns.str.startswith("G")])+["geometry"]]


#Unsplit file with only statewide results (every column except the GCON / GSL / GSU contests)
merge_st = merge[list(merge.columns[~(merge.columns.str.startswith("GCON"))&~(merge.columns.str.startswith("GSL"))&~(merge.columns.str.startswith("GSU"))])].reset_index(drop=True)
merge_st["VTDST22"] = merge_st["COUNTYFP"]+merge_st["PRECCODE"]
st_gdf = merge_st[['UNIQUE_ID', "VTDST22",'COUNTYFP', 'COUNTYNM', 'PRECINCT', 'PRECCODE']+sorted(merge_st.columns[merge_st.columns.str.startswith("G")])+["geometry"]]


#Cong splits
cleaned_cong_gdf["VTDST22"] = cleaned_cong_gdf["COUNTYFP"]+cleaned_cong_gdf["PRECCODE"]
cong_gdf = cleaned_cong_gdf[['UNIQUE_ID', "VTDST22",'CONG_DIST','COUNTYFP', 'COUNTYNM', 'PRECINCT', 'PRECCODE']+sorted(cleaned_cong_gdf.columns[cleaned_cong_gdf.columns.str.startswith("G")])+["geometry"]]


#SLDU splits (the district column is published as SLDU_DIST)
cleaned_su_gdf["SLDU_DIST"] = cleaned_su_gdf["SU_DIST"]
cleaned_su_gdf["VTDST22"] = cleaned_su_gdf["COUNTYFP"]+cleaned_su_gdf["PRECCODE"]
su_gdf = cleaned_su_gdf[['UNIQUE_ID', "VTDST22",'SLDU_DIST', 'COUNTYFP', 'COUNTYNM', 'PRECINCT', 'PRECCODE']+sorted(cleaned_su_gdf.columns[cleaned_su_gdf.columns.str.startswith("G")])+["geometry"]]


#SLDL splits (the district column is published as SLDL_DIST, zero-padded to three characters)
cleaned_sl_gdf["SLDL_DIST"] = cleaned_sl_gdf["SL_DIST"].str.zfill(3)
cleaned_sl_gdf["VTDST22"] = cleaned_sl_gdf["COUNTYFP"]+cleaned_sl_gdf["PRECCODE"]
sl_gdf = cleaned_sl_gdf[['UNIQUE_ID', "VTDST22", 'SLDL_DIST', 'COUNTYFP', 'COUNTYNM', 'PRECINCT', 'PRECCODE']+sorted(cleaned_sl_gdf.columns[cleaned_sl_gdf.columns.str.startswith("G")])+["geometry"]]

In [17]:
cong_gdf.shape

(8977, 38)

In [15]:
su_gdf.shape

(8970, 37)

In [16]:
sl_gdf.shape

(9069, 179)

In [11]:
# Maps a district-assignment column name to the office code embedded in vote column names.
# "SCONG_DIST" is included so this copy agrees with the later copy of this dict in the notebook.
level_race_name_dict = {"CONG_DIST":"CON",
    "SCONG_DIST":"CON",
    "SLDL_DIST":"SL",
    "SLDU_DIST":"SU"}


def district_assignment_errors(election_results_df, level):
    '''
    Check whether the votes recorded in each row match that row's district assignment.

    A vote column is checked when the office code for `level` (e.g. "CON") appears within
    the first five characters of its name; the digits that follow the code are taken as
    the column's district (e.g. "GSL055..." -> "055"). Any such column holding a non-zero
    value in a row assigned to a different district is printed and counted as an error.

    Parameters:
        election_results_df: DataFrame with a "UNIQUE_ID" column, the assignment column
            `level`, and the vote columns to check.
        level: name of the district assignment column; must be a key of level_race_name_dict.

    Returns:
        True if at least one mismatch was found, otherwise False.

    Note: As written, this will only work for fully numeric district assignments stored as
    strings. The assignment is zero-padded to the width of the digits in the column name
    (at least 2), so "55" matches a "GSL055..." column without pre-padding.
    '''

    # This boolean is used to help clean up the print statements
    any_error = False
    # Convert from the name of the district assignment column, to how it is referred to in column names
    finding = level_race_name_dict[level]
    # Raw string: '\d' in a normal string literal is an invalid escape sequence
    district_pattern = re.compile(finding + r'\d*')
    assignment_cols = {"SLDU_DIST", "SLDL_DIST", "CONG_DIST", "SCONG_DIST", level}

    # Column names do not change from row to row, so parse them once up front
    contest_columns = []
    for val in election_results_df.columns:
        if finding not in val or val in assignment_cols:
            continue
        code_position = val.find(finding)
        # Skip columns where "CON", "SL", or "SU" was only found in a name (5+ characters into the col name)
        if code_position >= 5:
            continue
        matches = district_pattern.findall(val)
        # Strip the office code so only the district digits remain
        contest_columns.append((val, code_position, matches, matches[0][len(finding):]))

    # Iterate over the dataframe row-by-row
    for _, row in election_results_df.iterrows():
        # Get the district assignment for that row
        district_assignment = row[level]
        for val, code_position, matches, col_district in contest_columns:
            # Only columns with non-zero votes can contradict the assignment
            if row[val] == 0:
                continue
            expected_district = district_assignment.zfill(max(2, len(col_district)))
            if col_district != expected_district:
                print(code_position)
                print(matches)
                if not any_error:
                    print("***ERROR SPOTTED***")
                print("District Assignment: ", district_assignment)
                print("Value" , col_district)
                print("Column", val)
                print("Number of votes", row[val])
                print(row["UNIQUE_ID"])
                print(" ")
                any_error = True
    return any_error


# Each entry: (split gdf, assignment column, assertion failure message, confirmation message)
assignment_checks = [
    (cong_gdf, "CONG_DIST", "Bad CONG_DIST assignment", "CONFIRMED: Actual cong votes match CONG_DIST assignment"),
    (sl_gdf, "SLDL_DIST", "Bad SLDL_DIST assignment", "CONFIRMED: Actual sl votes match SLDL_DIST assignment"),
    (su_gdf, "SLDU_DIST", "Bad SLDU_DIST assignment", "CONFIRMED: Actual su votes match SLDU_DIST assignment"),
]

# Stop at the first level whose recorded votes contradict its district assignment
for checked_gdf, dist_col, failure_msg, success_msg in assignment_checks:
    assert not district_assignment_errors(checked_gdf, dist_col), failure_msg
    print(success_msg)

CONFIRMED: Actual cong votes match CONG_DIST assignment
CONFIRMED: Actual sl votes match SLDL_DIST assignment
CONFIRMED: Actual su votes match SLDU_DIST assignment


In [12]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError (see this cell's output);
# presumably a deliberate stop so "Run All" halts before the export cell - confirm
break 


SyntaxError: 'break' outside loop (<ipython-input-12-a17de20832ec>, line 1)

## Export files

In [18]:
# Each finalized GeoDataFrame paired with the work-in-progress shapefile it is written to
export_targets = [
    (all_gdf, "./oh_2022_gen_prec_shp_wip/oh_2022_gen_prec_no_splits.shp"),
    (st_gdf, "./oh_2022_gen_prec_shp_wip/oh_2022_gen_prec_st.shp"),
    (cong_gdf, "./oh_2022_gen_prec_shp_wip/oh_2022_gen_prec_cong.shp"),
    (su_gdf, "./oh_2022_gen_prec_shp_wip/oh_2022_gen_prec_sldu.shp"),
    (sl_gdf, "./oh_2022_gen_prec_shp_wip/oh_2022_gen_prec_sldl.shp"),
]

for export_gdf, shp_path in export_targets:
    export_gdf.to_file(shp_path)

## Notes on what comes next for these files
- Load into a separate notebook to clean, re-export, and disaggregate

# Match vote/dist assignment fxn

In [None]:
# Maps a district-assignment column name to the office code embedded in vote column names
level_race_name_dict = {"CONG_DIST":"CON",
    "SCONG_DIST":"CON",
    "SLDL_DIST":"SL",
    "SLDU_DIST":"SU"}


def district_assignment_errors(election_results_df, level):
    '''
    Check whether the votes recorded in each row match that row's district assignment.

    A vote column is checked when the office code for `level` (e.g. "CON") appears within
    the first five characters of its name; the digits that follow the code are taken as
    the column's district (e.g. "GSL055..." -> "055"). Any such column holding a non-zero
    value in a row assigned to a different district is printed and counted as an error.

    Parameters:
        election_results_df: DataFrame with a "UNIQUE_ID" column, the assignment column
            `level`, and the vote columns to check.
        level: name of the district assignment column; must be a key of level_race_name_dict.

    Returns:
        True if at least one mismatch was found, otherwise False.

    Note: As written, this will only work for fully numeric district assignments stored as
    strings. The assignment is zero-padded to the width of the digits in the column name
    (at least 2), so "55" matches a "GSL055..." column without pre-padding.
    '''

    # This boolean is used to help clean up the print statements
    any_error = False
    # Convert from the name of the district assignment column, to how it is referred to in column names
    finding = level_race_name_dict[level]
    # Raw string: '\d' in a normal string literal is an invalid escape sequence
    district_pattern = re.compile(finding + r'\d*')
    assignment_cols = {"SLDU_DIST", "SLDL_DIST", "CONG_DIST", "SCONG_DIST", level}

    # Column names do not change from row to row, so parse them once up front
    contest_columns = []
    for val in election_results_df.columns:
        if finding not in val or val in assignment_cols:
            continue
        code_position = val.find(finding)
        # Skip columns where "CON", "SL", or "SU" was only found in a name (5+ characters into the col name)
        if code_position >= 5:
            continue
        matches = district_pattern.findall(val)
        # Strip the office code so only the district digits remain
        contest_columns.append((val, code_position, matches, matches[0][len(finding):]))

    # Iterate over the dataframe row-by-row
    for _, row in election_results_df.iterrows():
        # Get the district assignment for that row
        district_assignment = row[level]
        for val, code_position, matches, col_district in contest_columns:
            # Only columns with non-zero votes can contradict the assignment
            if row[val] == 0:
                continue
            expected_district = district_assignment.zfill(max(2, len(col_district)))
            if col_district != expected_district:
                print(code_position)
                print(matches)
                if not any_error:
                    print("***ERROR SPOTTED***")
                print("District Assignment: ", district_assignment)
                print("Value" , col_district)
                print("Column", val)
                print("Number of votes", row[val])
                print(row["UNIQUE_ID"])
                print(" ")
                any_error = True
    return any_error


# Fail loudly if any precinct records state senate votes outside its assigned district
assert not district_assignment_errors(su_gdf, "SLDU_DIST"), "Bad SLDU_DIST assignment"
print("CONFIRMED: Actual su votes match SLDU_DIST assignment")

In [None]:
# Fail loudly if any precinct records congressional votes outside its assigned district
assert not district_assignment_errors(cong_gdf, "CONG_DIST"), "Bad CONG_DIST assignment"
print("CONFIRMED: Actual cong votes match CONG assignment")

In [None]:
# Fail loudly if any precinct records state house votes outside its assigned district
assert not district_assignment_errors(sl_gdf, "SLDL_DIST"), "Bad SLDL_DIST assignment"
print("CONFIRMED: Actual sldl votes match SLDL assignment")

In [None]:
su_pber_splits_gdf[list(su_pber_splits_gdf.columns[su_pber_splits_gdf.columns.str.startswith("GSU03")|su_pber_splits_gdf.columns.str.startswith("GSU25")])][su_pber_splits_gdf["UNIQUE_ID"]=="FRANKLIN-BAV"]

In [None]:
merge_su[list(merge_su.columns[merge_su.columns.str.startswith("GSU03")|merge_su.columns.str.startswith("GSU25")])][merge_su["UNIQUE_ID"]=="FRANKLIN-BAV"]

In [None]:
su_gdf[(su_gdf["SLDU_DIST"]=="3")&(su_gdf["GSU25RWYS"]==6)]

In [None]:
cleaned_su_gdf.shape

In [None]:
cleaned_su_gdf[cleaned_su_gdf["UNIQUE_ID"]=="FRANKLIN-BAV"]

In [None]:
#|cleaned_su_gdf.columns.str.startswith("GSU25")
list(cleaned_su_gdf.columns[cleaned_su_gdf.columns.str.startswith("GSU03")|cleaned_su_gdf.columns.str.startswith("GSU03")])

In [None]:
cleaned_su_gdf[list(cleaned_su_gdf.columns[cleaned_su_gdf.columns.str.startswith("GSU03")|cleaned_su_gdf.columns.str.startswith("GSU25")])][cleaned_su_gdf["UNIQUE_ID"]=="FRANKLIN-BAV"]

In [None]:
su_pber_splits_gdf.shape

In [None]:
cleaned_su_gdf[list(cleaned_su_gdf.columns[cleaned_su_gdf.columns.str.startswith("GSU03")])][cleaned_su_gdf["UNIQUE_ID"]=="FRANKLIN-BAV"]

In [None]:
su_splits_dict

In [None]:
cleaned_su_gdf[(cleaned_su_gdf["DIST_len_assignment"]==0)&(cleaned_su_gdf["DIST"]!=0)]

In [None]:
su_splits_dict

In [None]:
raw_dict_su = get_prec_to_dist_list_dict(su_pber_splits_gdf, "GSU").to_dict()

In [None]:
raw_dict_su

In [None]:
# Tag every precinct with the state senate district(s) it recorded votes in
su_pber_splits = get_df_with_split_prec_indicator(merge_su, "GSU")
su_pber_splits_gdf = su_pber_splits[0]
su_pber_splits_gdf["SU_DIST"] = 0
su_pber_splits_gdf["SU_DIST"] = su_pber_splits_gdf["UNIQUE_ID"].map(su_pber_splits[1])

# Precincts with votes in more than one district: UNIQUE_ID -> assigned district value
is_split_prec = su_pber_splits_gdf["DIST_len_assignment"] > 1
split_dist_values = su_pber_splits_gdf["SU_DIST"][is_split_prec].values
split_prec_ids = su_pber_splits_gdf["UNIQUE_ID"][is_split_prec]
su_splits_dict = pd.Series(split_dist_values, index=split_prec_ids).to_dict()


In [None]:
su_pber_splits[1][su_pber_splits[1]!=0].value_counts()

In [None]:
df["DIST_index"].loc[5000]

# PICK BACK UP HERE FOR DISTRICT DEBUGGING

In [None]:
#Inline (non-function) copy of get_df_with_split_prec_indicator(df, contest), run at
#top level so the intermediate values can be inspected while debugging
df = merge_su.copy()
contest="GSU"
t = time.process_time()

df["DIST"] = df["UNIQUE_ID"].map(get_prec_to_dist_list_dict(df, contest))

df["DIST_index"]=0
df["DIST_len_assignment"]=0
index_to_set_dict = {}
index_to_len_dict = {}

#Visit every row (not only rows with a district) so DIST_index is assigned everywhere
for i in range(len(df)):

    #Write through df.loc in one step: the chained form df["DIST_index"].loc[i] = i
    #can assign into a temporary object and leave df itself unchanged
    df.loc[i, "DIST_index"] = i
    if df.loc[i,"DIST"]==0:
        #No votes recorded in any district for this precinct
        district_list = [0]
        index_to_len_dict[i] = 0
    else:
        district_list = list(set(df.loc[i,"DIST"].split(", ")))
        index_to_len_dict[i] = len(district_list)
    index_to_set_dict[i] = district_list

    #A single district is stored as a bare value rather than a one-item list
    if len(district_list)==1:
        district_item = district_list[0]
        index_to_set_dict[i] = district_item
df["DIST_set"] = df["DIST_index"].map(index_to_set_dict)

df["DIST_len_assignment"] = df["DIST_index"].map(index_to_len_dict)
prec_to_dist_dict = pd.Series(df["DIST_set"].values, index = df["UNIQUE_ID"])

elapsed_time = time.process_time() - t
print("time to run fxn: ", elapsed_time)

#The function version ends with: return df, prec_to_dist_dict

In [None]:
df["DIST_set"].value_counts()

In [None]:
df["SU_DIST"] = df["UNIQUE_ID"].map(prec_to_dist_dict)

In [None]:
df[df["DIST_len_assignment"]>1]

In [None]:
df["DIST_index"].value_counts()

In [None]:
# Split the SU precincts across the district map; the call returns the split
# elections gdf together with the table of votes lost in the split
gsu_race_cols = list(merge.columns[merge.columns.str.contains("GSU")])
df_splits = district_splits_comb("SU", prec_to_dist_dict, df, su_shp, "UNIQUE_ID", "DISTRICT", gsu_race_cols, "SU_DIST", fill_level = 2)
df_elections_gdf, df_lost_votes_gdf = df_splits

In [None]:
df_cleaned = clean_na_dist_assignments(df_elections_gdf, su_shp, "UNIQUE_ID", "SU_DIST", "DISTRICT")

In [None]:
# Carry the district over under its export name, then drop the working column
df_cleaned["SLDU_DIST"] = df_cleaned["SU_DIST"]
df_cleaned = df_cleaned.drop(columns=["SU_DIST"])
assert not district_assignment_errors(df_cleaned, "SLDU_DIST"), "Bad SLDU_DIST assignment"
print("CONFIRMED: Actual su votes match SLDU_DIST assignment")

In [None]:
df_cleaned.drop("SU_DIST", axis=1)

In [None]:
pd.Series(df["SU_DIST"][df["DIST_len_assignment"]>1].values, index = df["UNIQUE_ID"][df["DIST_len_assignment"]>1]).to_dict()

In [None]:
prec_to_dist_dict.to_dict()

In [None]:
len(su_pber_splits_gdf[su_pber_splits_gdf["DIST"]!=0])

In [None]:
def get_contest_dist_dict(df, contest):
    '''
    Map every vote column of `contest` to the district digits embedded in its name.

    Parameters:
        df: DataFrame whose column names are strings.
        contest: column prefix, one of "GCON", "GSL" or "GSU".

    Returns:
        dict of column name -> district string, e.g. {"GSL055RLIP": "055"}.

    Raises:
        ValueError: if `contest` is not one of the supported prefixes.
    '''
    # (start, stop) positions of the district digits within the column name
    district_slices = {"GCON": (4, 6), "GSL": (3, 6), "GSU": (3, 5)}
    if contest not in district_slices:
        raise ValueError("contest must be one of " + ", ".join(district_slices) + "; got " + repr(contest))
    start, stop = district_slices[contest]

    contest_cols = df.columns[df.columns.str.startswith(contest)]
    return {col: col[start:stop] for col in contest_cols}


def get_prec_to_dist_list_dict(df, contest):
    '''
    List, for every precinct, the districts of `contest` in which it recorded votes.

    Adds/overwrites a "DIST" column on `df` in place: 0 when the precinct has no votes in
    any `contest` column, otherwise the distinct districts joined by ", " in column order
    (e.g. "03, 25"). Each district is listed once per precinct.

    Returns:
        Series of the "DIST" values indexed by df["UNIQUE_ID"].
    '''
    col_dist_dict = get_contest_dist_dict(df, contest)

    # Group the vote columns by district so each district is considered once per precinct
    cols_by_district = {}
    for col, district in col_dist_dict.items():
        cols_by_district.setdefault(district, []).append(col)

    # Object dtype lets the 0 sentinel and the district strings share one column
    df["DIST"] = pd.Series(0, index=df.index, dtype=object)
    for district, cols in cols_by_district.items():
        has_votes = (df[cols] > 0).any(axis=1)
        # Evaluate both masks before writing so a newly set row is not appended to as well
        unassigned = df["DIST"] == 0
        needs_append = has_votes & ~unassigned
        if needs_append.any():
            df.loc[needs_append, "DIST"] = df.loc[needs_append, "DIST"] + ", " + district
        df.loc[has_votes & unassigned, "DIST"] = district

    dist_list_dict = pd.Series(df["DIST"].values, index=df["UNIQUE_ID"])

    return dist_list_dict



def get_df_with_split_prec_indicator(df, contest):
    '''
    Flag precincts that recorded votes in more than one district of `contest`.

    Adds these columns to `df` in place:
        DIST                 0 or the ", "-joined districts with votes
        DIST_index           positional row number
        DIST_len_assignment  number of distinct districts (0 when there are none)
        DIST_set             0, the single district string, or a sorted list of districts

    Every row is processed, whatever the index labels are and wherever the rows without
    votes sit in the frame.

    Returns:
        (df, prec_to_dist_dict) where prec_to_dist_dict is a Series of the DIST_set
        values indexed by df["UNIQUE_ID"].
    '''
    t = time.process_time()

    df["DIST"] = df["UNIQUE_ID"].map(get_prec_to_dist_list_dict(df, contest))

    index_to_set_dict = {}
    index_to_len_dict = {}
    for position, raw_dist in enumerate(df["DIST"]):
        if raw_dist == 0:
            # No votes recorded in any district for this precinct
            index_to_set_dict[position] = 0
            index_to_len_dict[position] = 0
            continue
        # Sorted so the result does not depend on set iteration order
        district_list = sorted(set(raw_dist.split(", ")))
        index_to_len_dict[position] = len(district_list)
        # A single district is stored as a bare value rather than a one-item list
        index_to_set_dict[position] = district_list[0] if len(district_list) == 1 else district_list

    df["DIST_index"] = range(len(df))
    df["DIST_len_assignment"] = df["DIST_index"].map(index_to_len_dict)
    df["DIST_set"] = df["DIST_index"].map(index_to_set_dict)
    prec_to_dist_dict = pd.Series(df["DIST_set"].values, index = df["UNIQUE_ID"])

    elapsed_time = time.process_time() - t
    print("time to run fxn: ", elapsed_time)

    return df, prec_to_dist_dict

In [None]:
#fxns of interest for this issue:
def get_df_with_split_prec_indicator(df, contest):
    '''
    Flag precincts that recorded votes in more than one district of `contest`.

    Adds these columns to `df` in place:
        DIST                 0 or the ", "-joined districts with votes
        DIST_index           positional row number
        DIST_len_assignment  number of distinct districts (0 when there are none)
        DIST_set             0, the single district string, or a sorted list of districts

    Every row is processed, whatever the index labels are and wherever the rows without
    votes sit in the frame.

    Returns:
        (df, prec_to_dist_dict) where prec_to_dist_dict is a Series of the DIST_set
        values indexed by df["UNIQUE_ID"].
    '''
    t = time.process_time()

    df["DIST"] = df["UNIQUE_ID"].map(get_prec_to_dist_list_dict(df, contest))

    index_to_set_dict = {}
    index_to_len_dict = {}
    for position, raw_dist in enumerate(df["DIST"]):
        if raw_dist == 0:
            # No votes recorded in any district for this precinct
            index_to_set_dict[position] = 0
            index_to_len_dict[position] = 0
            continue
        # De-duplicate repeated districts; sorted so the result does not depend on set iteration order
        district_list = sorted(set(raw_dist.split(", ")))
        index_to_len_dict[position] = len(district_list)
        # A single district is stored as a bare value rather than a one-item list
        index_to_set_dict[position] = district_list[0] if len(district_list) == 1 else district_list

    df["DIST_index"] = range(len(df))
    df["DIST_len_assignment"] = df["DIST_index"].map(index_to_len_dict)
    df["DIST_set"] = df["DIST_index"].map(index_to_set_dict)
    prec_to_dist_dict = pd.Series(df["DIST_set"].values, index = df["UNIQUE_ID"])

    elapsed_time = time.process_time() - t
    print("time to run fxn: ", elapsed_time)

    return df, prec_to_dist_dict


def generate_differences_df(df_compare_against, df_compare_to, unique_ID_col, races_list, drop_empty = False):
    '''
    Build a per-ID table of vote differences (df_compare_against minus df_compare_to).

    Both inputs are reduced to `unique_ID_col` plus `races_list`, summed per ID, and
    subtracted with alignment on the ID. A "Tot_Votes" column holds the row-wise sum of
    the race differences. With drop_empty=True, rows that are zero everywhere are dropped
    first, then columns that are zero in every remaining row.
    '''
    wanted_cols = [unique_ID_col] + races_list

    # Total the votes per ID on each side; the ID becomes the index of both tables
    totals_against = df_compare_against[wanted_cols].groupby(unique_ID_col).sum()
    totals_to = df_compare_to[wanted_cols].groupby(unique_ID_col).sum()

    diffs = totals_against - totals_to
    diffs["Tot_Votes"] = diffs[races_list].sum(axis=1)

    if not drop_empty:
        return diffs

    # Keep only the rows, then only the columns, that carry some non-zero difference
    row_has_diff = (diffs != 0).any(axis=1)
    diffs = diffs.loc[row_has_diff]
    col_has_diff = (diffs != 0).any(axis=0)
    return diffs.loc[:, col_has_diff]


def district_splits_comb(level, splits_dict, elections_gdf, district_gdf, unique_ID_col, district_gdf_ID, races_list, elections_gdf_dist_ID, fill_level = 2):
    '''
    Function to split precincts across districts that splits a precinct across the entire district map.
    Previous iterations of this code only split precincts by the districts in which votes were recorded.
    In some instances, that led to holes in the map, due to districts where no votes were recorded in a precinct, but where an intersection occurred.

    Parameters:
        level: label written into the new IDs of the split pieces (called with "SU" in this notebook).
        splits_dict: object with .keys(); only the keys are used, as the IDs of the precincts to split.
        elections_gdf: precinct GeoDataFrame holding unique_ID_col and the vote columns.
        district_gdf: district GeoDataFrame that the precincts to split are intersected with.
        unique_ID_col: name of the precinct ID column in elections_gdf.
        district_gdf_ID: name of the district ID column in district_gdf (values must be strings, .zfill is called on them).
        races_list: vote columns that belong to the districted contests.
        elections_gdf_dist_ID: name of the district assignment column written onto the split pieces.
        fill_level: width the district ID is zero-padded to before it is matched against column names.

    Returns:
        (elections_gdf, lost_votes_df): the split pieces concatenated with the untouched precincts
        (index reset, CRS taken from the input), and the table from generate_differences_df of
        per-precinct vote differences between the pre-split rows and their pieces.
    '''
    t = time.process_time()
    # Separate the precincts that need splitting from the ones passed through unchanged
    need_splits = elections_gdf[elections_gdf[unique_ID_col].isin(list(splits_dict.keys()))]
    others = elections_gdf[~elections_gdf[unique_ID_col].isin(list(splits_dict.keys()))]
    
    # Untouched copy of the precincts to split, used below to measure lost votes
    pre_splits_copy = need_splits.copy(deep = True)
    
    # Intersect the precincts to split with the district gdf
    test_join = gp.overlay(need_splits, district_gdf, how = "intersection")
    
    # Assign a district column, using the district shapefile
    test_join[elections_gdf_dist_ID] = test_join[district_gdf_ID]
    
    # Work on a copy of the intersection so test_join itself is left as produced
    clean_votes = test_join.copy(deep = True)
    
    # The new ID starts as the original ID and is overwritten per piece in the loop below
    clean_votes[unique_ID_col+"_new"] = clean_votes[unique_ID_col]
    
    # For every piece: build its new ID, then zero the votes of contests in other districts
    for index, row in clean_votes.iterrows():
        # New ID format: <original ID>-(<level>-<zero-padded district>)
        clean_votes.at[index, unique_ID_col+"_new"] = row[unique_ID_col]+"-("+level + "-" + row[district_gdf_ID].zfill(fill_level) + ")" 
        for column in test_join:
            # A race column is kept only if the padded district string occurs in its name (substring test)
            if column in races_list and row[elections_gdf_dist_ID].zfill(fill_level) not in column:
                clean_votes.at[index, column] = 0 
        
    # clean_votes still carries the original ID here, so pieces are summed back per source precinct
    lost_votes_df = generate_differences_df(pre_splits_copy, clean_votes, unique_ID_col, races_list, True)
    
    # Swap the original ID for the per-piece ID and match the column layout of the untouched precincts
    clean_votes.drop(unique_ID_col, axis = 1, inplace = True)
    clean_votes.rename(columns = {unique_ID_col+"_new":unique_ID_col}, inplace = True)
    clean_votes = clean_votes[list(others.columns)]
        
    # Recombine the split pieces with the untouched precincts
    elections_gdf = gp.GeoDataFrame(pd.concat([clean_votes, others]), crs = elections_gdf.crs)
    elections_gdf.reset_index(drop=True,inplace=True)
    
    elapsed_time = time.process_time() - t
    print("time to run fxn: ", elapsed_time)
    
    return elections_gdf, lost_votes_df