# OH 2022 PBER Cleaning

## On self overlay
The RDH received precinct shapefiles for every county in Ohio from the OpenElections team. The RDH standardized the shapefiles, made modifications where needed, and combined all of them into one statewide file to be joined with precinct-level election results. 

With the precinct boundary and election results geodataframe, the RDH runs maup.assign() to disaggregate the election results from the precincts down to the Census Block level, which makes the data more useful to map drawers. In some precincts within the original Ohio shapefile, the geometry contained an additional layer without its own geometry. This additional layer is not visible at first glance, but it causes a duplicate axis error when maup.assign() is run, because the affected blocks then have two precinct layers they could be assigned to when each block should have one and only one assignment. 

The following code cleans the overlapping geometries so that maup can be run and the shapefile is better set for any future analyses.

This code was used to develop and check the technique, but file production was completed in "pber_only_v1.ipynb" in order to keep the precinct processing together.

**Last updated 9/8/2023**

In [1]:
import os
import maup
import numpy as np
import pandas as pd
import geopandas as gp
from shapely.geometry import MultiPolygon, Polygon
from shapely.ops import unary_union


#Step 0: Load in relevant gdfs - 2020 blocks and the 2022 PBER drafts
# Blocks: only the block ID, the P0010001 field and the geometry are kept
oh_blocks = gp.read_file("./oh_pl2020_b/oh_pl2020_p1_b.shp")
oh_blocks = oh_blocks[['GEOID20','P0010001', 'geometry']]

# Draft precinct files: no splits, statewide, congressional and state house
pber_all = gp.read_file("./oh_2022_gen_prec_shp_wip/oh_2022_gen_prec_no_splits.shp")
pber_st = gp.read_file("./oh_2022_gen_prec_shp_wip/oh_2022_gen_prec_st.shp")
pber_cong = gp.read_file("./oh_2022_gen_prec_shp_wip/oh_2022_gen_prec_cong.shp")
pber_sl = gp.read_file("./oh_2022_gen_prec_shp_wip/oh_2022_gen_prec_sldl.shp")

# 'SUMMIT-AGT-(SL-031)' is dropped from the state house file, but only after
# confirming that every "G" column sums to zero over its rows
_agt_rows = pber_sl["UNIQUE_ID"]=='SUMMIT-AGT-(SL-031)'
_g_columns = pber_sl.columns[pber_sl.columns.str.startswith("G")]
_agt_totals = pber_sl[_g_columns][_agt_rows].sum()
assert set(_agt_totals[_agt_totals!=0]) == set()
pber_sl = pber_sl[~_agt_rows]

# Draft precinct file: state senate
pber_su = gp.read_file("./oh_2022_gen_prec_shp_wip/oh_2022_gen_prec_sldu.shp")




In [2]:
def fix_self_intersection(pber, oh_blocks, area_cutoff=0.00001):
    """Cut precinct-on-precinct overlaps out of a precinct GeoDataFrame.

    The precinct frame is overlaid with itself. Pieces shared by two different
    UNIQUE_IDs that also cover more than ``area_cutoff`` of some Census block
    are removed from the precinct frame with a symmetric-difference overlay,
    and the result is dissolved back to one row per UNIQUE_ID.

    Args:
      pber: GeoDataFrame of precincts with a "UNIQUE_ID" column
      oh_blocks: GeoDataFrame of Census blocks; must share a CRS with the
        overlap pieces derived from pber (asserted)
      area_cutoff: Area threshold, in the units ``.area`` reports for these
        frames; pieces at or below it are ignored. Defaults to the value the
        notebook has always used.

    Returns:
      GeoDataFrame with the overlaps removed, dissolved by "UNIQUE_ID"
    """
    #Step 1: Self intersect precinct gdf
    self_intersect_gdf = gp.overlay(pber, pber, how="intersection", keep_geom_type=False)


    #Step 2: Filter out tiny areas from intersection as leaving them would cause future overlay issues
    self_intersect_gdf = self_intersect_gdf[self_intersect_gdf.area>area_cutoff]


    #Step 3: Keep only the pieces whose two IDs differ - pieces with equal IDs are the same as the
    #original prec gdf, the rest are pieces that overlap as their own shapes
    id1_not_id2 = self_intersect_gdf[self_intersect_gdf["UNIQUE_ID_1"]!=self_intersect_gdf["UNIQUE_ID_2"]]


    #Step 4: Overlay tiny pieces with blocks to see which are actually problematic for maup
    assert oh_blocks.crs == id1_not_id2.crs
    block_tiny_id_overlay = gp.overlay(oh_blocks, id1_not_id2, how="intersection", keep_geom_type=False)


    #Step 5: Grab precinct names for area > area_cutoff from block tiny piece intersection as only area less than this will not cause problems
    set_prec_to_change = set(block_tiny_id_overlay["UNIQUE_ID_1"][block_tiny_id_overlay.area>area_cutoff])


    #Step 6: Filter id1_not_id2 to just include set_prec_to_change
    gdf_to_pull_out = id1_not_id2[id1_not_id2["UNIQUE_ID_1"].isin(set_prec_to_change)]


    #Step 7: Run symmetric difference of prec_gdf with gdf_to_pull_out
    assert pber.crs == gdf_to_pull_out.crs
    try:
        overlaps_removed = gp.overlay(pber, gdf_to_pull_out, how="symmetric_difference", keep_geom_type=False)
    except Exception:
        # The overlay can fail on the pieces being pulled out; retry once with
        # the GeometryCollection rows removed. `except Exception` (not a bare
        # except) so that Ctrl-C during this long step is not swallowed.
        gdf_to_pull_out_mod = remove_bad_geom(gdf_to_pull_out)
        overlaps_removed = gp.overlay(pber, gdf_to_pull_out_mod, how="symmetric_difference", keep_geom_type=False)


    #Step 8: Add missing precinct back in and cut out bad ones
    # NOTE(review): these two precincts are re-appended from the original frame -
    # presumably they are lost by the overlay above; confirm for other inputs.
    pber_updated = gp.GeoDataFrame(pd.concat([overlaps_removed, pber[pber["UNIQUE_ID"]=='ERIE-PRECINCT CAST VILL'],pber[pber["UNIQUE_ID"]=='WOOD-PERRYSBURG J']], ignore_index=True), crs = overlaps_removed.crs)
    # Rows without a UNIQUE_ID do not belong to any precinct and are dropped
    pber_updated = pber_updated[~pber_updated["UNIQUE_ID"].isna()]
    pber_updated = remove_bad_geom(pber_updated)
    # Dissolve so that every UNIQUE_ID ends up as exactly one row
    pber_updated = pber_updated.dissolve(by="UNIQUE_ID").reset_index()

    return pber_updated


def remove_bad_geom(gdf):
    """Return gdf without the rows whose geometry is a GeometryCollection.

    A row is dropped when the text form of its "geometry" value starts with
    "GEOMETRYCOLLECTION". Rows are matched by index label, so every row that
    shares a label with a flagged row is dropped as well.
    """
    #Step 9: Remove bad geom
    geometry_text = gdf["geometry"].astype(str)
    is_collection = geometry_text.apply(lambda text: text.startswith("GEOMETRYCOLLECTION"))
    flagged_labels = set(geometry_text[is_collection].index)
    keep_mask = ~gdf.index.isin(flagged_labels)

    return gdf[keep_mask]


def fix_buffer(gdf):
    """
    return (GeoDataFrame) with the 'buffer(0) trick' applied
    :gdf: (GeoDataFrame) object
    Can be useful when trying to mitigate 'self-intersection' issues

    The frame is modified in place: its active geometry column is overwritten
    with the buffered geometries and the same object is returned.
    """
    # Write back to whichever column is the active geometry, so the buffered
    # shapes always replace the ones buffer(0) was computed from.
    gdf[gdf.geometry.name] = gdf.buffer(0)

    return gdf


def export_import_test(pber, pber_updated, oh_blocks):
    """Round-trip the cleaned precincts through a shapefile and run maup.assign on them.

    Asserts that pber_updated has as many distinct UNIQUE_IDs as pber, writes
    pber_updated to a scratch shapefile, reads it back, and returns the result
    of maup.assign(blocks, precincts) with fix_buffer applied to both frames.
    """
    #Step 10: Test Results
    ids_after = pber_updated["UNIQUE_ID"].nunique()
    ids_before = pber["UNIQUE_ID"].nunique()
    assert ids_after == ids_before

    scratch_path = "./oh_2022_gen_prec_shp_wip/test_export_import.shp"
    pber_updated.to_file(scratch_path)
    reloaded = gp.read_file(scratch_path)

    buffered_blocks = fix_buffer(oh_blocks)
    buffered_precincts = fix_buffer(reloaded)

    return maup.assign(buffered_blocks, buffered_precincts)


In [3]:
#Step 11: Apply
def _format_pber(updated_gdf, district_cols=()):
    """Select the output columns of a cleaned precinct frame.

    Keeps the identifier columns, any district columns passed in, every column
    that starts with "G" and whose second-to-last character is not "_"
    (presumably this excludes the _1/_2 suffixed overlay columns), and geometry.
    """
    names = updated_gdf.columns
    vote_cols = names[(names.str.slice(start=-2, stop=-1)!="_")&(names.str.startswith("G"))]
    selected = (['UNIQUE_ID', 'COUNTYFP', 'COUNTYNM', 'PRECINCT', 'PRECCODE', 'VTDST22']
                + list(district_cols)
                + list(vote_cols)
                + ["geometry"])
    return updated_gdf[selected]


#NO SPLITS
pber_all_updated = fix_self_intersection(pber_all, oh_blocks)
pber_all_format = _format_pber(pber_all_updated)
export_import_test(pber_all, pber_all_format, oh_blocks)


#STATEWIDE
pber_st_updated = fix_self_intersection(pber_st, oh_blocks)
pber_st_format = _format_pber(pber_st_updated)
export_import_test(pber_st, pber_st_format, oh_blocks)


#CONGRESSIONAL
# CONG_DIST keeps its original name in this cell
pber_cong_updated = fix_self_intersection(pber_cong, oh_blocks)
pber_cong_format = _format_pber(pber_cong_updated, ['CONG_DIST'])
export_import_test(pber_cong, pber_cong_format, oh_blocks)


#SLDL
pber_sl_updated = fix_self_intersection(pber_sl, oh_blocks)
pber_sl_format = _format_pber(pber_sl_updated, ['SLDL_DIST'])
export_import_test(pber_sl, pber_sl_format, oh_blocks)


#SLDU
pber_su_updated = fix_self_intersection(pber_su, oh_blocks)
pber_su_format = _format_pber(pber_su_updated, ['SLDU_DIST'])
export_import_test(pber_su, pber_su_format, oh_blocks)


  self_intersect_gdf = self_intersect_gdf[self_intersect_gdf.area>0.00001]

  set_prec_to_change = set(block_tiny_id_overlay["UNIQUE_ID_1"][block_tiny_id_overlay.area>0.00001])

  geometries = geometries[geometries.area > area_cutoff]

  return assign_to_max(intersections(sources, targets, area_cutoff=0).area)

  self_intersect_gdf = self_intersect_gdf[self_intersect_gdf.area>0.00001]

  set_prec_to_change = set(block_tiny_id_overlay["UNIQUE_ID_1"][block_tiny_id_overlay.area>0.00001])

  geometries = geometries[geometries.area > area_cutoff]

  return assign_to_max(intersections(sources, targets, area_cutoff=0).area)

  self_intersect_gdf = self_intersect_gdf[self_intersect_gdf.area>0.00001]

  set_prec_to_change = set(block_tiny_id_overlay["UNIQUE_ID_1"][block_tiny_id_overlay.area>0.00001])
TopologyException: found non-noded intersection between LINESTRING (-82.69 41.4916, -82.69 41.4916) and LINESTRING (-82.6899 41.4916, -82.69 41.4916) at -82.689968319949315 41.491598693991584

  g

0         3948.0
1         3905.0
2         4339.0
3         4354.0
4         4403.0
           ...  
276423    1167.0
276424    1160.0
276425    1164.0
276426    1166.0
276427    1154.0
Length: 276428, dtype: float64

## Check vote totals

In [4]:
def check_unique_id_unique(merged_gdf):
    """Assert that UNIQUE_ID has no missing values and no repeats.

    Returns the string "unique_id is unique" when both assertions hold.
    """
    unique_ids = merged_gdf["UNIQUE_ID"]
    assert not unique_ids.isna().any()
    assert unique_ids.nunique()==len(merged_gdf)
    return "unique_id is unique"


#State, County, Precinct total vote checks adapted from pdv checks: https://github.com/nonpartisan-redistricting-datahub/pdv-resources/blob/main/pdv_functions.py
def statewide_totals_check(partner_df, partner_name, source_df, source_name, column_list):
    """Compares the totals of two election result dataframes at the statewide total level

    Args:
      partner_df: DataFrame of election results we are comparing against
      partner_name: String of what to call the partner in the print statement
      source_df: DataFrame of election results we are comparing to
      source_name: String of what to call the source in the print statement
      column_list: List of races that there are votes for

    Returns:
      List of the races in column_list whose statewide totals differ
    """
    print("***Statewide Totals Check***")
    diff_races=[]
    for race in column_list:
        # Sum each side once and reuse the totals for the test and the printout
        partner_total = partner_df[race].sum()
        source_total = source_df[race].sum()
        difference = partner_total - source_total
        if difference != 0:
            if race not in diff_races:
                diff_races.append(race)
            print(race+" has a difference of "+str(difference)+" votes")
            print("\t"+ partner_name + ": "+str(partner_total)+" votes")
            print("\t"+ source_name +": "+str(source_total)+" votes")

    if diff_races:
        # The header used to be printed with nothing after it; list the races
        print("Contests with differences: ")
        for race in diff_races:
            print("\t"+race)
    else:
        print("All contests match statewide!")

    return diff_races


def county_totals_check(partner_df, partner_name, source_df, source_name, column_list,county_col,full_print=False):
    """Compares the totals of two election result dataframes at the county level

    Args:
      partner_df: DataFrame of election results we are comparing against
      partner_name: String of what to call the partner in the print statement
      source_df: DataFrame of election results we are comparing to
      source_name: String of what to call the source in the print statement
      column_list: List of races that there are votes for
      county_col: String of the column name that contains county information
      full_print: Boolean specifying whether to also print the matching counties
        for every race that has at least one differing county

    Returns:
      List of the counties that differ in at least one race. A county that is
      present in only one dataframe is treated as having 0 votes in the other.
    """

    print("\n***Countywide Totals Check***")
    diff_counties=[]
    for race in column_list:
        # Aggregate only the race column: summing every column of the frame
        # fails on columns that cannot be summed (e.g. geometry).
        partner_totals = partner_df.groupby(county_col)[race].sum()
        source_totals = source_df.groupby(county_col)[race].sum()
        # Put both sides on the same county index so a county missing from one
        # side shows up as a difference instead of raising a KeyError below.
        all_counties = partner_totals.index.union(source_totals.index)
        partner_totals = partner_totals.reindex(all_counties, fill_value=0)
        source_totals = source_totals.reindex(all_counties, fill_value=0)

        diff = partner_totals - source_totals
        county_differences = diff[diff != 0]
        differing_counties = county_differences.index.values.tolist()
        for val in differing_counties:
            if val not in diff_counties:
                diff_counties.append(val)
        if differing_counties:
            print(race + " contains differences in these counties:")
            for val in differing_counties:
                print("\t"+val+" has a difference of "+str(county_differences[val])+" votes")
                print("\t\t"+ partner_name + ": "+str(partner_totals.loc[val])+" votes")
                print("\t\t"+ source_name +": "+str(source_totals.loc[val])+" votes")
            if (full_print):
                for val in diff[diff == 0].index.values.tolist():
                    print("\t"+val + ": "+ str(partner_totals.loc[val])+" votes")

    if diff_counties:
        # The header used to be printed with nothing after it; list the counties
        print("Counties with differences: ")
        for val in diff_counties:
            print("\t"+val)
    else:
        print("All contests in all counties match!")

    return diff_counties
        
    
def precinct_votes_check(merged_df,column_list,vest_on_left,name_col,print_level=0):
    """Checks a merged dataframe with two election results at the precinct level

    For every race in column_list the "<race>_x" (left) and "<race>_y" (right)
    values of each row are compared.

    Args:
      merged_df: DataFrame with one set of election results joined to another
      column_list: List of races that there are votes for
      vest_on_left: Boolean; when True the left value is printed with the (V)
        tag and the right with (S), otherwise the tags are swapped
      name_col: String of the column name to refer to precincts when a difference occurs
      print_level: Integer; a difference is printed only when it is larger than this

    Returns:
      Sorted list of the name_col values of rows with at least one difference
    """
    print("\n***Precinct Totals Check***")
    ordered = merged_df.sort_values(by=[name_col])
    left_tag, right_tag = ("(V)", "(S)") if vest_on_left else ("(S)", "(V)")

    rows_matching = 0
    precincts_differing = []
    all_diffs = []
    for _, record in ordered.iterrows():
        row_differs = False
        for race in column_list:
            left_votes = record[race + "_x"]
            right_votes = record[race + "_y"]

            # Missing values are only reported here; the comparison below still runs
            if left_votes is None or right_votes is None or np.isnan(right_votes) or np.isnan(left_votes):
                print("FIX NaN value at: ", record[name_col])

            gap = abs(left_votes - right_votes)
            if gap > 0:
                row_differs = True
                all_diffs.append(gap)
            if gap > print_level:
                print(race,
                      "{:.>72}".format(record[name_col]),
                      left_tag,
                      "{:.>5}".format(int(left_votes)),
                      " " + right_tag + "{:.>5}".format(int(right_votes)),
                      "(D):{:>5}".format(int(left_votes - right_votes)))
        if row_differs:
            precincts_differing.append(record[name_col])
        else:
            rows_matching += 1

    print("\nThere are ", len(ordered.index)," total rows")
    print(rows_matching," of these rows are the same")

    if all_diffs:
        print("\nAll precincts containing differences:")
        print("The average difference is: ", str(sum(all_diffs)/len(all_diffs)))
        print("\nThe max difference between any one shared column in a row is: ", max(all_diffs))
        large_count = sum(1 for gap in all_diffs if gap > 10)
        print("There are ", str(large_count), "precinct results with a difference greater than 10")

    return sorted(precincts_differing)


def run_all_checks(partner_df, partner_name, source_df, source_name, county_col,full_print=False, prec_check=True):
    """Run the UNIQUE_ID, statewide and (optionally) precinct checks on two result frames.

    The races compared are the source_df columns that start with "G".
    county_col and full_print are accepted but not used while the county
    check is disabled. Nothing is returned; the checks print their findings.
    """
    source_columns = source_df.columns
    column_list = list(source_columns[source_columns.str.startswith("G")])

    check_unique_id_unique(source_df)

    #Running inner join because of expected nan value for ZZZ precincts
    # source_df is the left frame, so its values carry the "_x" suffix after the merge
    merged_df = pd.merge(source_df, partner_df, on = ["UNIQUE_ID"], how = "inner", indicator=True)

    #All matches statewide and county levels
    statewide_totals_check(partner_df, partner_name, source_df, source_name, column_list)

    #County total check not working here but also not needed if others work...
    #county_totals_check(partner_df, partner_name, source_df, source_name, column_list,county_col,full_print=False)

    if prec_check ==True:
        precinct_votes_check(merged_df, column_list, False, "UNIQUE_ID", print_level=0)
    
    

# Labels and the county column name used when calling run_all_checks.
# partner_name labels the frames loaded from file, which are passed as partner_df.
partner_name = "original ER 22"
# NOTE(review): the run_all_checks calls in this notebook pass their own literal
# source labels ("merge gdf", "cong split", ...) rather than source_name.
source_name = "PBER 22"
# Column holding the county name; forwarded to run_all_checks as county_col.
county_col = "COUNTYNM"
#---


In [6]:
# Identifier columns shared by every exported precinct frame
_BASE_COLS = ['UNIQUE_ID', 'COUNTYFP', 'COUNTYNM', 'PRECINCT', 'PRECCODE', 'VTDST22']


def _vote_cols(gdf):
    """List the columns that start with "G" and whose second-to-last character is not "_"."""
    names = gdf.columns
    return list(names[(names.str.slice(start=-2, stop=-1)!="_")&(names.str.startswith("G"))])


pber_all_format = pber_all_updated[_BASE_COLS + _vote_cols(pber_all_updated) + ["geometry"]]

pber_st_format = pber_st_updated[_BASE_COLS + _vote_cols(pber_st_updated) + ["geometry"]]

# The congressional district column is copied to CON_DIST and selected under that name
pber_cong_updated["CON_DIST"] = pber_cong_updated["CONG_DIST"]
pber_cong_format = pber_cong_updated[_BASE_COLS + ['CON_DIST'] + _vote_cols(pber_cong_updated) + ["geometry"]]

pber_sl_format = pber_sl_updated[_BASE_COLS + ['SLDL_DIST'] + _vote_cols(pber_sl_updated) + ["geometry"]]

pber_su_format = pber_su_updated[_BASE_COLS + ['SLDU_DIST'] + _vote_cols(pber_su_updated) + ["geometry"]]

In [7]:
# Compare each cleaned, formatted frame (source) with the frame it was built
# from (partner). The first frame is the no-splits file, so it gets its own
# label instead of repeating the statewide one.
print("\nchecking no splits gdf:")
run_all_checks(pber_all, partner_name, pber_all_format, "merge gdf",county_col,full_print=False, prec_check=True)
print("\nchecking statewide gdf:")
run_all_checks(pber_st, partner_name, pber_st_format, "merge gdf", county_col,full_print=False, prec_check=True)
print("\nchecking CONG w splits gdf:")
run_all_checks(pber_cong, partner_name, pber_cong_format.reset_index(), "cong split",county_col,full_print=False, prec_check=True)
print("\nchecking SLDL w splits gdf:")
run_all_checks(pber_sl, partner_name, pber_sl_format, "sl split",county_col,full_print=False, prec_check=True)
print("\nchecking SLDU w splits gdf")
run_all_checks(pber_su, partner_name, pber_su_format, "su split",county_col,full_print=False, prec_check=True)


checking statewide gdf:
***Statewide Totals Check***
All contests match statewide!

***Precinct Totals Check***

There are  8941  total rows
8941  of these rows are the same

checking statewide gdf:
***Statewide Totals Check***
All contests match statewide!

***Precinct Totals Check***

There are  8941  total rows
8941  of these rows are the same

checking CONG w splits gdf:
***Statewide Totals Check***
All contests match statewide!

***Precinct Totals Check***

There are  8977  total rows
8977  of these rows are the same

checking SLDL w splits gdf:
***Statewide Totals Check***
All contests match statewide!

***Precinct Totals Check***

There are  9068  total rows
9068  of these rows are the same

checking SLDU w splits gdf
***Statewide Totals Check***
All contests match statewide!

***Precinct Totals Check***

There are  8970  total rows
8970  of these rows are the same


## Export cleaned gdfs

In [8]:
# Write each formatted frame to its final shapefile, in the original order
_exports = [
    (pber_all_format, "./oh_2022_gen_prec_shp/oh_2022_gen_prec_no_splits.shp"),
    (pber_st_format, "./oh_2022_gen_prec_shp/oh_2022_gen_prec_st.shp"),
    (pber_cong_format, "./oh_2022_gen_prec_shp/oh_2022_gen_prec_cong.shp"),
    (pber_su_format, "./oh_2022_gen_prec_shp/oh_2022_gen_prec_sldu.shp"),
    (pber_sl_format, "./oh_2022_gen_prec_shp/oh_2022_gen_prec_sldl.shp"),
]
for _frame, _destination in _exports:
    _frame.to_file(_destination)