Sources:
gen file: https://redistrictingdatahub.org/dataset/north-carolina-2020-general-election-precinct-and-election-results/
prim file: https://redistrictingdatahub.org/dataset/north-carolina-2020-primary-election-precinct-and-election-results/


The issue at hand:
</br> Primary file does not include all splits for districts that the general file does. 
This was not accounted for in the precinct(primary?) file because for some congressional districts no election occurred. 
</br> For instance: 
a precinct contains CD1 and CD2 - 
In the general file this is split for votes. 
In the primary file CD1 did not receive any votes, so no split was added to the geography and only votes appear for CD2.
</br> However, in the disaggregated file, it means that votes for CD2 are currently getting assigned to blocks within the entire precinct, including in CD1's geography.
</br>As a result, want to intervene and assign all blocks in CD1 0 to not get any votes and have all votes allocated only in the section of the precinct that contains CD2.

</br> To approach this issue, one option is to identify where this occurs, which blocks based on the general file, disagg general file normally,
</br> then, for the primary file assign those blocks 0 population, disaggregate, then join the P and G gdfs and only keep the pop count from the G gdf.

Another option/question is whether using a maup assign from the general file on the primary file could fix the issue..

We are specifically interested in the precincts that contain splits in the general file not in the prim file, in terms of df slicing


In [1]:
import pandas as pd
import numpy as np
import geopandas as gp
import os
import maup
from op_verification.reference_data import *
import boto3
import io
pd.set_option("display.max_columns", None)
import warnings
warnings.filterwarnings('ignore', 'GeoSeries.isna', UserWarning)

maup.progress.enabled = True

# Load in NC file

## block file set up

In [2]:
block_cols = ['FIPS', 'VTD', 'TRACT', 'BLOCK', 'LOGRECNO', 'P0010001', 'COUNTYFP20', 'COUSUBFP20', 'VTDST20', 'TRACTCE20', 'BLOCKCE20',
       'GEOID20', 'NAME20', 'MTFCC20', 'UR20', 'UACE20', 'UATYPE20', 'FUNCSTAT20', 'ALAND20', 'AWATER20', 'INTPTLAT20', 'INTPTLON20',
       'COUNTY_NME', 'MCD_NUMBER', 'MCD_NAME', 'MCD_TYP_NM', 'MCD_TYP_CD','VTD_NAME', 'BLKGRP', 'geometry']

census_block_gdf = gp.read_file('nc_20_disag/nc_pl2020_b/nc_pl2020_b.shp')[['GEOID20', 'COUNTYFP20', 'P0010001', 'P0020001', 'P0040001', 'P0050001','P0050003', 'geometry']]

def modify_P0040001_P0050003(block_gdf):
    block_gdf['P0040001-P0050003'] = block_gdf['P0040001'] - block_gdf['P0050003']
    print("Error raised iff the number of incarcerated blocks with non-zero population does NOT equal the number of modified vap blocks.")
    assert (len(block_gdf[block_gdf['P0050003']!=0]) - len(block_gdf[block_gdf['P0040001']!=block_gdf['P0040001-P0050003']])) == 0
    block_gdf.loc[block_gdf['P0040001-P0050003']<0, 'P0040001-P0050003'] = 0

    return block_gdf


block_gdf = modify_P0040001_P0050003(census_block_gdf)

neg_pop_blocks = list(block_gdf['GEOID20'][block_gdf['P0040001-P0050003']<0])
assert neg_pop_blocks == []

Error raised iff the number of incarcerated blocks with non-zero population does NOT equal the number of modified vap blocks.


## precinct file set up

In [3]:
prec_prim_cong_gdf = gp.read_file('./nc_20_disag/nc_prim_20_prec_mod/nc_prim_20_cong_prec.shp')
prec_gen_cong_gdf = gp.read_file('./nc_20_disag/nc_gen_20_prec/nc_gen_20_cong_prec.shp')
prec_prim_sldl_gdf = gp.read_file('./nc_20_disag/nc_prim_20_prec_mod/nc_prim_20_sldl_prec.shp')
prec_gen_sldl_gdf = gp.read_file('./nc_20_disag/nc_gen_20_prec/nc_gen_20_sldl_prec.shp')
prec_prim_sldu_gdf = gp.read_file('./nc_20_disag/nc_prim_20_prec_mod/nc_prim_20_st_sldu_prec.shp')
prec_gen_sldu_gdf = gp.read_file('./nc_20_disag/nc_gen_20_prec/nc_gen_20_st_sldu_prec.shp')

print(prec_prim_cong_gdf.shape)
print(prec_gen_cong_gdf.shape)
print(prec_prim_sldl_gdf.shape)
print(prec_gen_sldl_gdf.shape)
print(prec_prim_sldu_gdf.shape)
print(prec_gen_sldu_gdf.shape)

(2675, 59)
(2681, 37)
(2675, 106)
(2708, 251)
(2666, 141)
(2662, 163)


## maup assign and set up

In [4]:
#Functions for running maup
def check_valid_rows(block_gdf, precinct_gdf):
    prec_geom = precinct_gdf.geometry
    valid_rows = precinct_gdf[~(prec_geom.isna() | prec_geom.is_empty)]
    print('valid precinct rows: ', valid_rows.shape)

    block_geom = block_gdf.geometry
    block_valid_rows = block_gdf[~(block_geom.isna() | block_geom.is_empty)]
    print('valid block rows: ', block_valid_rows.shape)
    

def fix_buffer(gdf):
    """
    return (GeoDataFrame) with the 'bufer(0) trick' applied
    :gdf: (GeoDataFrame) object
    Can be useful when trying to mitigate 'self-intersection' issues
    """
    buffered = gdf.buffer(0)
    gdf.drop(columns=["geometry"])
    # gdf['geometry'] = gdf.apply(lambda x: x.geometry.buffer(0), axis=1)
    gdf["geometry"] = buffered
    return gdf


def maup_assignment_series(block_gdf, precinct_gdf):
    block_gdf = block_gdf.to_crs(precinct_gdf.crs)
    print('block_gdf shape: ', block_gdf.shape, '\n precinct_gdf shape: ', precinct_gdf.shape)

    block_gdf["maup_assignment"] = maup.assign(
        fix_buffer(block_gdf), fix_buffer(precinct_gdf)
    )
    print(block_gdf[block_gdf['maup_assignment'].isna()].shape[0], " null assignments in maup series")
    return block_gdf


def maup_pre_vote_setup(block_gdf, precinct_gdf, PRECID_block, DISTID_block, DISTID_prec):
    block_gdf = block_gdf.to_crs(precinct_gdf.crs)
    bgdf = maup_assignment_series(block_gdf, precinct_gdf)
    assert bgdf['maup_assignment'].isna().any() == False
    bgdf[PRECID_block] = bgdf['maup_assignment'].map(lambda idx: str(precinct_gdf.loc[idx]["UNIQUE_ID"]))
    bgdf[DISTID_block] = bgdf["maup_assignment"].map(lambda idx: str(precinct_gdf.loc[idx][DISTID_prec]))
    
    return bgdf

    
def maup_assign_labels(block_gdf, precinct_gdf, precinctid_input, distid_input):
    #Assign precinct IDs to block file using assign series
    block_gdf[precinctid_input] = block_gdf["maup_assignment"].map(
        lambda idx: str(precinct_gdf.loc[idx]["UNIQUE_ID"])
    )
    if "CONG_DIST" in precinct_gdf.columns:
        block_gdf[distid_input] = block_gdf["maup_assignment"].map(
            lambda idx: str(precinct_gdf.loc[idx]["CONG_DIST"])
        )
    if "SLDL_DIST" in precinct_gdf.columns:
        block_gdf[distid_input] = block_gdf["maup_assignment"].map(
            lambda idx: str(precinct_gdf.loc[idx]["SLDL_DIST"])
        )
    if "SLDU_DIST" in precinct_gdf.columns:
        block_gdf[distid_input] = block_gdf["maup_assignment"].map(
            lambda idx: str(precinct_gdf.loc[idx]["SLDU_DIST"])
        )
    print("Label assignment complete")

    return block_gdf


def election_cols(gdf):
    election_cols = list(gdf.columns[gdf.columns.str.startswith('P20')|gdf.columns.str.startswith('PS')|gdf.columns.str.startswith('PC')|
                     gdf.columns.str.startswith('G20')|gdf.columns.str.startswith('GS')|gdf.columns.str.startswith('GC')])
    return election_cols  


def assign_votes(variables, election_columns, precinct_gdf, block_gdf):
    precinct_gdf[variables] = block_gdf[variables].groupby(block_gdf["maup_assignment"]).sum()
    print(variables, ' added to precinct_gdf based on maup assignment')
    bl_to_prec_weights = block_gdf[variables] / block_gdf["maup_assignment"].map(precinct_gdf[variables])
    block_votes = block_gdf.copy(deep=True)
    block_votes[election_columns] = maup.prorate(
        block_gdf["maup_assignment"], precinct_gdf[election_columns], bl_to_prec_weights
    )
    return block_votes


def run_maup(block_gdf, precinct_gdf, precinctid_input, distid_input, pop_variable):
    #Assign identifiers
    block_gdf = maup_assignment_series(block_gdf, precinct_gdf)
    block_gdf = maup_assign_labels(block_gdf, precinct_gdf, precinctid_input, distid_input)

    #Assign votes
    election_columns = election_cols(precinct_gdf)
    block_votes = assign_votes(pop_variable, election_columns, precinct_gdf, block_gdf)
    
    return block_votes    


def column_total_check(election_columns, block_gdf, precinct_gdf):
    for val in election_columns:
        vote_dif = block_gdf[val].sum()-precinct_gdf[val].sum()
        if (abs(vote_dif) <=1e-1):
            #print(val+": EQUAL", ' - total: ', 'block:', str(block_gdf[val].sum()), 'prec:', str(precinct_gdf[val].sum()), 'diff:', block_gdf[val].sum()-precinct_gdf[val].sum())
            continue
        else:
            print(val+": DIFFERENCE OF " + str(vote_dif)+ " VOTES", ' - block total: ', str(block_gdf[val].sum()), ', precinct total: ', str(precinct_gdf[val].sum()))     

In [None]:
print(prec_prim_cong_gdf.shape)
print(prec_gen_cong_gdf.shape)
print(prec_prim_sldl_gdf.shape)
print(prec_gen_sldl_gdf.shape)
print(prec_prim_sldu_gdf.shape)
print(prec_gen_sldu_gdf.shape) 
block_gdf['VAP_MOD'] = block_gdf['P0040001-P0050003']

bcp = maup_pre_vote_setup(block_gdf, prec_prim_cong_gdf, "PREC_CONG", "CONG_DIST", "CONG_DIST")
bcp_votes = assign_votes("VAP_MOD", election_cols(prec_prim_cong_gdf), prec_prim_cong_gdf, bcp)


bcg = maup_pre_vote_setup(block_gdf, prec_gen_cong_gdf, "PREC_CONG", "CONG_DIST", "CONG_DIST")
bcg_votes = assign_votes("VAP_MOD", election_cols(prec_gen_cong_gdf), prec_gen_cong_gdf, bcg)


bslp = maup_pre_vote_setup(block_gdf, prec_prim_sldl_gdf, "PREC_SLDL", "SLDL_DIST", "SLDL_DIST")
bslp_votes = assign_votes("VAP_MOD", election_cols(prec_prim_sldl_gdf), prec_prim_sldl_gdf, bslp)


bslg = maup_pre_vote_setup(block_gdf, prec_gen_sldl_gdf, "PREC_SLDL", "SLDL_DIST", "SLDL_DIST")
bslg_votes = assign_votes("VAP_MOD", election_cols(prec_gen_sldl_gdf), prec_gen_sldl_gdf, bslg)


bsup = maup_pre_vote_setup(block_gdf, prec_prim_sldu_gdf, "PREC_SLDU", "SLDU_DIST", "SLDU_DIST")
bsup_votes = assign_votes("VAP_MOD", election_cols(prec_prim_sldu_gdf), prec_prim_sldu_gdf, bsup)


bsug = maup_pre_vote_setup(block_gdf, prec_gen_sldu_gdf, "PREC_SLDU", "SLDU_DIST", "SLDU_DIST")
bsug_votes = assign_votes("VAP_MOD", election_cols(prec_gen_sldu_gdf), prec_gen_sldu_gdf, bsug)

try:
    column_total_check(election_cols(prec_prim_cong_gdf), bcp, prec_prim_cong_gdf)
    column_total_check(election_cols(prec_gen_cong_gdf), bcg, prec_gen_cong_gdf)
    column_total_check(election_cols(prec_prim_sldl_gdf), bslp, prec_prim_sldl_gdf)
    column_total_check(election_cols(prec_gen_sldl_gdf), bslg, prec_gen_sldl_gdf)
    column_total_check(election_cols(prec_prim_sldu_gdf), bsup, prec_prim_sldu_gdf)
    column_total_check(election_cols(prec_gen_sldu_gdf), bsug, prec_gen_sldu_gdf)
except:
    print('check failed')

(2675, 59)
(2681, 37)
(2675, 106)
(2708, 251)
(2666, 141)
(2662, 163)
block_gdf shape:  (236638, 10) 
 precinct_gdf shape:  (2675, 59)


100%|██████████| 2675/2675 [00:12<00:00, 205.87it/s]
100%|██████████| 2675/2675 [00:53<00:00, 49.88it/s]


0  null assignments in maup series
VAP_MOD  added to precinct_gdf based on maup assignment
block_gdf shape:  (236638, 10) 
 precinct_gdf shape:  (2681, 37)


100%|██████████| 2681/2681 [00:12<00:00, 207.27it/s]
100%|██████████| 2681/2681 [00:49<00:00, 54.65it/s] 


0  null assignments in maup series
VAP_MOD  added to precinct_gdf based on maup assignment
block_gdf shape:  (236638, 10) 
 precinct_gdf shape:  (2675, 106)


100%|██████████| 2675/2675 [00:13<00:00, 201.26it/s]
100%|██████████| 2675/2675 [00:52<00:00, 50.70it/s]


0  null assignments in maup series
VAP_MOD  added to precinct_gdf based on maup assignment
block_gdf shape:  (236638, 10) 
 precinct_gdf shape:  (2708, 251)


100%|██████████| 2708/2708 [00:13<00:00, 200.73it/s]
100%|██████████| 2708/2708 [00:49<00:00, 54.62it/s] 


0  null assignments in maup series
VAP_MOD  added to precinct_gdf based on maup assignment
block_gdf shape:  (236638, 10) 
 precinct_gdf shape:  (2666, 141)


100%|██████████| 2666/2666 [00:13<00:00, 201.59it/s]
100%|██████████| 2666/2666 [00:52<00:00, 50.39it/s]


0  null assignments in maup series
VAP_MOD  added to precinct_gdf based on maup assignment
block_gdf shape:  (236638, 10) 
 precinct_gdf shape:  (2662, 163)


100%|██████████| 2662/2662 [00:13<00:00, 201.90it/s]
100%|██████████| 2662/2662 [00:50<00:00, 52.83it/s] 


## Merge

In [None]:
#bcp_votes, bcg_votes, bslp_votes, bslg_votes, bsup_votes, bsug_votes
b_merge = bcp_votes.merge(bcg_votes, how='outer', on=["GEOID20", 'COUNTYFP20', 'COUNTY_NME', "VAP_MOD", "geometry"]).merge(
    bslp_votes, how='outer', on=["GEOID20", 'COUNTYFP20', 'COUNTY_NME', "VAP_MOD", "geometry"]).merge(
    bslg_votes, how='outer', on=["GEOID20", 'COUNTYFP20', 'COUNTY_NME', "VAP_MOD", "geometry"]).merge(
    bsup_votes, how='outer', on=["GEOID20", 'COUNTYFP20', 'COUNTY_NME', "VAP_MOD", "geometry"]).merge(
    bsug_votes, how='outer', on=["GEOID20", 'COUNTYFP20', 'COUNTY_NME', "VAP_MOD", "geometry"])

In [None]:
b_merge.head()

# Checks

In [18]:
def check_block_to_prec_pop_prop_sum(pop_col_list, VAP_MOD_COL, PREC_COL):
    #TODO change pop cols checking
    pop_col_list = pop_col_list + ['GEOID20', 'P0010001_sldu']
    popcd = b_merge[pop_col_list]
    prec_popcd = popcd.groupby(PREC_COL).sum().reset_index()

    prec_to_pop_dict_gcd = pd.Series(prec_popcd[VAP_MOD_COL].values, index = prec_popcd[PREC_COL]).to_dict()
    
    #print(prec_to_pop_dict_gcd)
    
    popcd['prec_pop'] = popcd[PREC_COL].map(prec_to_pop_dict_gcd)
    popcd['prop'] = popcd[VAP_MOD_COL]/popcd['prec_pop']
    
    print(popcd.groupby(PREC_COL).sum()['prop'].sum())
    print(popcd[PREC_COL].nunique())
    

print(check_block_to_prec_pop_prop_sum(['PREC_CONG', 'VAP_MOD'], 'VAP_MOD', 'PREC_CONG'))
print(check_block_to_prec_pop_prop_sum(['PREC_ST_SLDU', 'VAP_MOD'], 'VAP_MOD', 'PREC_ST_SLDU'))
print(check_block_to_prec_pop_prop_sum(['PREC_SLDL', 'VAP_MOD'], 'VAP_MOD', 'PREC_SLDL'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  popcd['prec_pop'] = popcd[PREC_COL].map(prec_to_pop_dict_gcd)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  popcd['prop'] = popcd[VAP_MOD_COL]/popcd['prec_pop']


9171.0
9172
None
9149.0
9150
None
9152.0
9153
None


Prec #s
(9177, 61)
(9153, 529)
(9150, 93)

In [19]:
#Column total checks
column_total_check(election_cols(cong20), b_merge, cong20)
column_total_check(election_cols(st_sldu20), b_merge, st_sldu20)
column_total_check(election_cols(sldl20), b_merge, sldl20)

In [23]:
def precinct_sum_check(prec_gdf, block_gdf, blk_prec_id):
    prec_gpbyprec = prec_gdf.groupby(['UNIQUE_ID']).sum()
    blk_gpbyprec = block_gdf.groupby([blk_prec_id]).sum()
    prec_w_difs_list = []
    
    prec_gpbyprec.sort_values(by='UNIQUE_ID',inplace=True)
    prec_gpbyprec.reset_index(inplace = True)
    blk_gpbyprec.sort_values(by= blk_prec_id, inplace=True)
    blk_gpbyprec.reset_index(inplace = True)
    
    for i in range(0, prec_gdf['UNIQUE_ID'].nunique()):
        for col in election_cols(prec_gdf):
            prec_diff = round(blk_gpbyprec[col].iloc[i], 1) - round(prec_gpbyprec[col].iloc[i], 1)
            if (prec_diff!=0):
                prec_w_difs_list.append(prec_gpbyprec.index[i])
                print("prec id: ", blk_gpbyprec.index[i], prec_gpbyprec.index[i], col, "block: ", round(blk_gpbyprec[col].iloc[i],1), 
                      "prec: ", round(prec_gpbyprec[col].iloc[i],1), "diff: ", prec_diff)
    print("precs to check: ", set(prec_w_difs_list))
    print("Precinct check complete!")


#Precinct total checks
precinct_sum_check(cong20, b_merge, "PREC_CONG")
precinct_sum_check(sldl20, b_merge, "PREC_SLDL")
precinct_sum_check(st_sldu20, b_merge, "PREC_ST_SLDU")

precs to check:  set()
Precinct check complete!
precs to check:  set()
Precinct check complete!
precs to check:  set()
Precinct check complete!


In [None]:
#Check for column name length

## Export

## Readme