In [1]:
import pandas as pd # standard python data library
import geopandas as gp # the geo-version of pandas
import numpy as np 
import os
import fiona
from statistics import mean, median
from pandas import read_csv
gp.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw' #To load KML files
import string
import xml.etree.ElementTree as et

pd.options.display.max_columns = 999

# Florida

## VEST Documentation

Election results from the Florida Department of State (https://dos.myflorida.com/elections/data-statistics/elections-data/precinct-level-election-results/)

Precinct results from several sources.  
Alachua, Bay, Bradford, Brevard, Calhoun, Citrus, Clay, Dixie, Escambia, Hardee, Hendry, Hernando, Indian River, Lafayette and Sarasota come from the Department of State.  
Broward, Gadsden, Gilchrist, Manatee, Nassau, Santa Rosa, St. Johns, St. Lucie, and Union are from the Orlando Sentinel's precinct map for the 2016 presidential primary (http://interactive.orlandosentinel.com/elections/2016/presidential-primary/results/dem.html).  
Baker, Charlotte, Franklin, Glades, Holmes, Jackson, Jefferson, Levy, Liberty, Madison, Monroe, Okeechobee, Orange, Seminole, Suwannee, Taylor, Wakulla, Walton, and Washinton come from the U.S. Census Bureau's 2020 Redistricting Data Program.  
Collier, Columbia, DeSoto, Duval, Flagler, Gulf, Highlands, Hillsborough, Lake, Lee, Leon, Marion, Martin, Miami-Dade, Okaloosa, Osceola, Palm Beach, Pasco, Pinellas, Polk, Putnam, Sumter, and Volusia come from the counties.
Hamilton was drawn based on a geocoded voter registration file.  
Edits were made to Brevard, Charlotte, Escambia, Franklin, Indian River, Jefferson, Lafayette, Lake, and Lee, Miami-Dade, and St. Johns based on the voter registration file.  

Lake 108, Osceola 999, Palm Beach 8002, Seminole 900s, and Monroe's "Cumulative" precinct (the latter appears on the county's detailed results but not the DOS's precinct results file) don't have geography. Brevard 999, Broward Z073, Collier 450, Flagler 999, Hillborough 999, Leon 9000, Miami-Dade 100, and Pinellas 512 do have geography, but they are just the county election offices. Both groups represent some type of vote not assigned to a particular geography, like UOCAVA results, so these were distributed across the county by candidate proportional to the vote each precinct recorded.

G20PRERTRU - Donald J. Trump (Republican Party)  
G20PREDBID - Joseph R. Biden (Democratic Party)  
G20PRELJOR - Jo Jorgensen (Libertarian Party)  
G20PREODEL - Roque "Rocky" De La Fuente (Reform Party)  
G20PRESLAR - Gloria La Riva (Party for Socialism and Liberation)  
G20PREGHAW - Howie Hawkin (Green Party)  
G20PRECBLA - Don Blankenship (Constitution Party)  
G20PREOWRI - Write-in Votes  

## Load VEST File

In [2]:
vest_fl_20 = gp.read_file("./raw-from-source/VEST/fl_2020/fl_2020.shp")

In [3]:
data_columns = [col for col in vest_fl_20.columns if "G20" in col]

## Load Election Results

Fields and codes data https://files.floridados.gov/media/694099/precinct-level-results-data-definition-field-codes.pdfF

In [4]:
#This Data is from the FL department of state, and can only be downloaded county by county
all_files = os.listdir("./raw-from-source/Election_Results/2020-general-election-rev/")

In [5]:
#FRA replace ""AJ"" w/ "AJ"

In [6]:
#Check that all files have the same number of columns
for i in all_files:
    if i !=".DS_Store":
        ref = "./raw-from-source/Election_Results/2020-general-election-rev/"
        file_ref = ref+i
        file_prev = pd.read_csv(file_ref,sep="\t",engine='python',index_col=None, header=None)
        print(file_prev.shape)
    
#All the files have 19 columns, so they should be good to combine

(803, 19)
(15647, 19)
(1189, 19)
(2632, 19)
(14289, 19)
(10445, 19)
(1756, 19)
(17138, 19)
(1242, 19)
(624, 19)
(774, 19)
(3108, 19)
(410, 19)
(86366, 19)
(2598, 19)
(4061, 19)
(11934, 19)
(2573, 19)
(13994, 19)
(51375, 19)
(28219, 19)
(1037, 19)
(796, 19)
(6699, 19)
(1805, 19)
(2965, 19)
(12062, 19)
(67003, 19)
(42208, 19)
(1154, 19)
(5629, 19)
(840, 19)
(7464, 19)
(25396, 19)
(3966, 19)
(884, 19)
(644, 19)
(888, 19)
(1908, 19)
(708, 19)
(4200, 19)
(3045, 19)
(1888, 19)
(2368, 19)
(2895, 19)
(1032, 19)
(1244, 19)
(9600, 19)
(11301, 19)
(1531, 19)
(538, 19)
(900, 19)
(6365, 19)
(1118, 19)
(908, 19)
(4810, 19)
(10570, 19)
(6289, 19)
(702, 19)
(3431, 19)
(8500, 19)
(1020, 19)
(1560, 19)
(9567, 19)
(1245, 19)
(1092, 19)
(740, 19)
(1268, 19)
(2099, 19)


In [7]:
li = []
for i in all_files:
    if i not in [".DS_Store","DAD_PctResults20201103recount.txt","BRO_PctResults20201103recount.txt"]:
        ref = "./raw-from-source/Election_Results/2020-general-election-rev/"
        file_ref = ref+i
        file_prev = pd.read_csv(file_ref,sep="\t",engine='python',index_col=None, header=None)
        li.append(file_prev)
frame = pd.concat(li, axis=0, ignore_index=True)
print(frame.shape)

(553307, 19)


In [8]:
frame =frame[frame[11]=='President of the United States']

In [9]:
filtered_frame = frame[~frame[14].isin(['OverVotes', 'UnderVotes'])]

In [10]:
filtered_frame[filtered_frame[0]=="DAD"][[5,6,7]]

Unnamed: 0,5,6,7
293943,10,PRECINCT 001.0,722
293944,20,PRECINCT 002.0,1671
293945,30,PRECINCT 003.0,2685
293946,40,PRECINCT 004.0,2614
293947,50,PRECINCT 005.0,3111
...,...,...,...
300866,9896,PRECINCT 994.0,889
300867,9897,PRECINCT 995.0,4728
300868,9898,PRECINCT 996.0,3087
300869,9899,PRECINCT 997.0,484


In [11]:
filtered_frame[6] = filtered_frame[6].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_frame[6] = filtered_frame[6].astype(str)


In [12]:
filtered_frame["mod_col"]=filtered_frame.apply(lambda row:int(float(row[6].split("PRECINCT ")[1])) if row[0]=="DAD" else row[5], axis=1)
#filtered_frame["mod_num"]=filtered_frame.apply(lambda x:x[6].split("PRECINCT ")[0] if x[0]=="DAD" else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_frame["mod_col"]=filtered_frame.apply(lambda row:int(float(row[6].split("PRECINCT ")[1])) if row[0]=="DAD" else row[5], axis=1)


In [13]:
filtered_frame[filtered_frame[0]=="DAD"]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,mod_col
293943,DAD,Miami-Dade,10866,11/03/2020,2020 General Election,10,PRECINCT 001.0,722,0,0,0,President of the United States,,100000,Trump / Pence,REP,0,74773,395,1
293944,DAD,Miami-Dade,10866,11/03/2020,2020 General Election,20,PRECINCT 002.0,1671,0,0,0,President of the United States,,100000,Trump / Pence,REP,0,74773,713,2
293945,DAD,Miami-Dade,10866,11/03/2020,2020 General Election,30,PRECINCT 003.0,2685,0,0,0,President of the United States,,100000,Trump / Pence,REP,0,74773,1320,3
293946,DAD,Miami-Dade,10866,11/03/2020,2020 General Election,40,PRECINCT 004.0,2614,0,0,0,President of the United States,,100000,Trump / Pence,REP,0,74773,976,4
293947,DAD,Miami-Dade,10866,11/03/2020,2020 General Election,50,PRECINCT 005.0,3111,0,0,0,President of the United States,,100000,Trump / Pence,REP,0,74773,1314,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
300866,DAD,Miami-Dade,10866,11/03/2020,2020 General Election,9896,PRECINCT 994.0,889,0,0,0,President of the United States,,100000,WriteinVotes,,0,900,0,994
300867,DAD,Miami-Dade,10866,11/03/2020,2020 General Election,9897,PRECINCT 995.0,4728,0,0,0,President of the United States,,100000,WriteinVotes,,0,900,8,995
300868,DAD,Miami-Dade,10866,11/03/2020,2020 General Election,9898,PRECINCT 996.0,3087,0,0,0,President of the United States,,100000,WriteinVotes,,0,900,8,996
300869,DAD,Miami-Dade,10866,11/03/2020,2020 General Election,9899,PRECINCT 997.0,484,0,0,0,President of the United States,,100000,WriteinVotes,,0,900,1,997


In [14]:
#Make the precinct column at least 4 digits
filtered_frame["mod_col"]=filtered_frame["mod_col"].astype(str)
filtered_frame["modified_pre"]=filtered_frame["mod_col"].str.zfill(4)

#Make a column with the 3 letter county code and the precincts
filtered_frame["Pct_std"]=filtered_frame[0]+filtered_frame["modified_pre"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_frame["mod_col"]=filtered_frame["mod_col"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_frame["modified_pre"]=filtered_frame["mod_col"].str.zfill(4)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_frame["Pct_std"]=filtered_frame[0]+filtered_frame["modified_pr

In [15]:
filtered_frame["Pct_std"]

0         OKE0001
1         OKE0002
2         OKE0003
3         OKE0004
4         OKE0005
           ...   
551403    GAD0021
551404    GAD0022
551405    GAD0023
551406    GAD0024
551407    GAD0025
Name: Pct_std, Length: 48776, dtype: object

In [16]:
#Pivot the data so that each row has all the results from that precinct
pivoted_2020 = pd.pivot_table(filtered_frame, values=[18], index=["Pct_std"],columns=[11,14],aggfunc=sum)

In [17]:
pivoted_2020.reset_index(inplace=True,drop=False)

In [18]:
pivoted_2020

Unnamed: 0_level_0,Pct_std,18,18,18,18,18,18,18,18
11,Unnamed: 1_level_1,President of the United States,President of the United States,President of the United States,President of the United States,President of the United States,President of the United States,President of the United States,President of the United States
14,Unnamed: 1_level_2,Biden / Harris,Blankenship / Mohr,De La Fuente / Richardson,Hawkins / Walker,Jorgensen / Cohen,La Riva / Freeman,Trump / Pence,WriteinVotes
0,ALA0001,424,0,2,1,6,1,725,1
1,ALA0002,752,1,0,3,18,0,969,8
2,ALA0003,1431,3,1,5,32,0,2036,10
3,ALA0004,990,1,1,4,37,1,1749,7
4,ALA0005,1789,0,0,12,36,0,624,18
...,...,...,...,...,...,...,...,...,...
6009,WAS0008,318,0,0,2,2,0,888,3
6010,WAS0009,139,1,0,0,8,1,655,0
6011,WAS0011,126,0,0,0,1,0,185,0
6012,WAS0012,210,0,1,3,5,2,1522,1


In [19]:
pivoted_2020.columns = ["pct_std","G20PREDBID","G20PRECBLA","G20PREODEL","G20PREGHAW","G20PRELJOR","G20PRESLAR","G20PRERTRU","G20PREOWRI"]





In [20]:
def statewide_totals_check(partner_df,source_df,column_list):
    """Compares the totals of two election result dataframes at the statewide total level

    Args:
      partner_df: DataFrame of election results we are comparing against
      source_df: DataFrame of election results we are comparing to
      column_list: List of races that there are votes for
 
    Returns:
      Nothing, only prints out an analysis
    """
    print("***Statewide Totals Check***")
    for race in column_list:
        if (partner_df[race].sum()- source_df[race].sum() != 0):
            print(race+" has a difference of "+str(partner_df[race].sum()-source_df[race].sum())+" votes")
            print("\tVEST: "+str(partner_df[race].sum())+" votes")
            print("\tSOURCES: "+str(source_df[race].sum())+" votes")
        else:
            print(race + " is equal", "\tVEST / RDH: " + str(partner_df[race].sum()))

In [21]:
statewide_totals_check(vest_fl_20,pivoted_2020,data_columns)

***Statewide Totals Check***
G20PRERTRU has a difference of 15 votes
	VEST: 5668731 votes
	SOURCES: 5668716 votes
G20PREDBID has a difference of 9 votes
	VEST: 5297045 votes
	SOURCES: 5297036 votes
G20PRELJOR has a difference of 1 votes
	VEST: 70324 votes
	SOURCES: 70323 votes
G20PREODEL is equal 	VEST / RDH: 5966
G20PRESLAR is equal 	VEST / RDH: 5712
G20PREGHAW is equal 	VEST / RDH: 14721
G20PRECBLA is equal 	VEST / RDH: 3902
G20PREOWRI is equal 	VEST / RDH: 24468


In [22]:
vest_fl_20

Unnamed: 0,pct_std,county,precinct,G20PRERTRU,G20PREDBID,G20PRELJOR,G20PREODEL,G20PRESLAR,G20PREGHAW,G20PRECBLA,G20PREOWRI,geometry
0,ALA0001,ALA,01,725,424,6,2,1,1,0,1,"POLYGON Z ((-82.24245 29.85246 0.00000, -82.24..."
1,ALA0002,ALA,02,969,752,18,0,0,3,1,8,"POLYGON Z ((-82.41775 29.92248 0.00000, -82.41..."
2,ALA0003,ALA,03,2036,1431,32,1,0,5,3,10,"POLYGON Z ((-82.53335 29.84801 0.00000, -82.52..."
3,ALA0004,ALA,04,1749,990,37,1,1,4,1,7,"POLYGON Z ((-82.55700 29.65461 0.00000, -82.55..."
4,ALA0005,ALA,05,624,1789,36,0,0,12,0,18,"POLYGON Z ((-82.34441 29.66672 0.00000, -82.34..."
...,...,...,...,...,...,...,...,...,...,...,...,...
6005,OSC0203,OSC,203,391,730,6,0,2,0,0,0,"MULTIPOLYGON Z (((-81.38822 28.32017 0.00000, ..."
6006,OSC0138,OSC,138,2,10,0,0,0,0,0,0,"POLYGON Z ((-81.44815 28.24601 0.00000, -81.45..."
6007,OSC0353,OSC,353,15,27,0,0,0,0,0,0,"POLYGON Z ((-81.44066 28.24602 0.00000, -81.44..."
6008,OSC0139,OSC,139,0,0,0,0,0,0,0,0,"POLYGON Z ((-81.45743 28.32764 0.00000, -81.45..."


In [23]:
pivoted_2020["county"] = pivoted_2020["pct_std"].str[0:3]

In [24]:
def county_totals_check(partner_df,source_df,column_list,county_col,full_print=False):
    """Compares the totals of two election result dataframes at the county level

    Args:
      partner_df: DataFrame of election results we are comparing against
      source_df: DataFrame of election results we are comparing to
      column_list: List of races that there are votes for
      county_col: String of the column name that contains county information
      full_print: Boolean specifying whether to print out everything, including counties w/ similarities

    Returns:
      Nothing, only prints out an analysis
    """
    
    print("***Countywide Totals Check***")
    print("")
    diff_counties=[]
    for race in column_list:
        diff = partner_df.groupby([county_col]).sum()[race]-source_df.groupby([county_col]).sum()[race]
        for val in diff[diff != 0].index.values.tolist():
            if val not in diff_counties:
                diff_counties.append(val)
        if len(diff[diff != 0]!=0):   
            print(race + " contains differences in these counties:")
            for val in diff[diff != 0].index.values.tolist():
                county_differences = diff[diff != 0]
                print("\t"+val+" has a difference of "+str(county_differences[val])+" votes")
                print("\t\tVEST: "+str(partner_df.groupby([county_col]).sum().loc[val,race])+" votes")
                print("\t\tSOURCES: "+str(source_df.groupby([county_col]).sum().loc[val,race])+" votes")
            if (full_print):
                for val in diff[diff == 0].index.values.tolist():
                    county_similarities = diff[diff == 0]
                    print("\t"+val + ": "+ str(partner_df.groupby([county_col]).sum().loc[val,race])+" votes")
        else:
            print(race + " is equal across all counties")
            if (full_print):
                for val in diff[diff == 0].index.values.tolist():
                    county_similarities = diff[diff == 0]
                    print("\t"+val + ": "+ str(partner_df.groupby([county_col]).sum().loc[val,race])+" votes")

In [25]:
county_totals_check(vest_fl_20,pivoted_2020,data_columns,"county",full_print=False)

***Countywide Totals Check***

G20PRERTRU contains differences in these counties:
	MON has a difference of 10 votes
		VEST: 25693 votes
		SOURCES: 25683 votes
	SEM has a difference of 5 votes
		VEST: 125241 votes
		SOURCES: 125236 votes
G20PREDBID contains differences in these counties:
	MON has a difference of 5 votes
		VEST: 21881 votes
		SOURCES: 21876 votes
	SEM has a difference of 4 votes
		VEST: 132528 votes
		SOURCES: 132524 votes
G20PRELJOR contains differences in these counties:
	MON has a difference of 1 votes
		VEST: 348 votes
		SOURCES: 347 votes
G20PREODEL is equal across all counties
G20PRESLAR is equal across all counties
G20PREGHAW is equal across all counties
G20PRECBLA is equal across all counties
G20PREOWRI is equal across all counties


### Precinct-by-Precinct Check

In [26]:
vest_fl_20["pct_std"].value_counts(dropna=False)

PAL7149    1
DAD0554    1
HIL0967    1
DAD0324    1
DAD0387    1
          ..
DAD0939    1
MRT0010    1
DAD0846    1
DAD0450    1
PIN0302    1
Name: pct_std, Length: 6010, dtype: int64

In [27]:
pivoted_2020["pct_std"].value_counts(dropna=False)

PAL7149    1
LEO5105    1
DAD0820    1
DAD0750    1
HIL0967    1
          ..
LEE0026    1
DAD0794    1
PAL3060    1
GLA0008    1
PIN0302    1
Name: pct_std, Length: 6014, dtype: int64

In [28]:
for i in data_columns:
    pivoted_2020.loc[pivoted_2020["pct_std"]=="CHA54.0",i]+=int(pivoted_2020.loc[pivoted_2020["pct_std"]=="CHA54.1",i])

In [29]:
pivoted_2020.loc[pivoted_2020["pct_std"]=="CHA54.0"]

Unnamed: 0,pct_std,G20PREDBID,G20PRECBLA,G20PREODEL,G20PREGHAW,G20PRELJOR,G20PRESLAR,G20PRERTRU,G20PREOWRI,county
938,CHA54.0,441,0,0,0,0,0,635,2,CHA


In [30]:
pivoted_2020 = pivoted_2020[pivoted_2020["pct_std"]!="CHA54.1"]

Lake 108, Osceola 999, Palm Beach 8002, Seminole 900s, and Monroe's "Cumulative" precinct (the latter appears on the county's detailed results but not the DOS's precinct results file) don't have geography. Brevard 999, Broward Z073, Collier 450, Flagler 999, Hillborough 999, Leon 9000, Miami-Dade 100, and Pinellas 512 do have geography, but they are just the county election offices. Both groups represent some type of vote not assigned to a particular geography, like UOCAVA results, so these were distributed across the county by candidate proportional to the vote each precinct recorded.

G20PRERTRU .................................................................BROZ073 (V) ....0  (S)..150 (D): -150
G20PREDBID .................................................................BROZ073 (V) ....0  (S)..262 (D): -262
G20PRERTRU .................................................................CHA0054 (V) ..635  (S)....0 (D):  635
G20PREDBID .................................................................CHA0054 (V) ..441  (S)....0 (D):  441
G20PRERTRU .................................................................CLL0450 (V) ....0  (S)...49 (D):  -49
G20PREDBID .................................................................CLL0450 (V) ....0  (S)...58 (D):  -58
G20PREDBID .................................................................HIL0946 (V) .4616  (S).4609 (D):    7
G20PRERTRU .................................................................HIL0999 (V) ....0  (S)..398 (D): -398
G20PREDBID .................................................................HIL0999 (V) ....0  (S)..533 (D): -533
G20PRELJOR .................................................................HIL0999 (V) ....0  (S)...27 (D):  -27
G20PREOWRI .................................................................HIL0999 (V) ....0  (S)....6 (D):   -6
G20PREDBID .................................................................LEO1321 (V) .2535  (S).2529 (D):    6
G20PREDBID .................................................................LEO2305 (V) .2786  (S).2780 (D):    6
G20PRERTRU .................................................................LEO9000 (V) ....0  (S)...90 (D):  -90
G20PREDBID .................................................................LEO9000 (V) ....0  (S)..233 (D): -233
G20PRERTRU .................................................................PIN0512 (V) ....0  (S)..355 (D): -355
G20PREDBID .................................................................PIN0512 (V) ....0  (S)..486 (D): -486
G20PRELJOR .................................................................PIN0512 (V) ....0  (S)...14 (D):  -14
G20PREOWRI .................................................................PIN0512 (V) ....0  (S)....8 (D):   -8

In [31]:
to_allocate = ['FLA0999',
 'LAK0108',
 'OSC0999',
 'PAL8001',
 'PAL8002',
 'SEM0900',
 'SEM0901',
 'SEM0902',
 'SEM0903',
 'SEM0904',
 'SEM0905',
 'SEM0906',
 'SEM0907',
 'SEM0908',
 'SEM0909',
 'SEM0910',
'BRE0999','DAD0100','BROZ073','CHA0054','CLL0450','HIL0999','LEO9000','PIN0512']

In [32]:
allocating_votes = pivoted_2020[pivoted_2020["pct_std"].isin(to_allocate)]
receiving_votes = pivoted_2020[~pivoted_2020["pct_std"].isin(to_allocate)]

In [33]:
def allocate_absentee(df_receiving_votes,df_allocating,column_list,col_allocating,allocating_to_all_empty_precs=False):
    """Allocates votes proportionally to precincts, usually by share of precinct-reported vote

    Args:
      df_receiving_votes: DataFrame with precinct-level votes
      df_allocating: DataFrame with the votes to allocate
      column_list: List of races that votes are being allocated for
      col_allocating: String referring to what level the allocation occurs at (most often county)
      allocating_to_all_empty_precs: Boolean for special case where all votes in df_receiving_votes are 0

    Returns:
      The precinct-level votes dataframe (df_receiving_votes) with the allocated votes
    """
    
    #Fill any n/a values with 0
    df_receiving_votes = df_receiving_votes.fillna(0)
    #Grab the original columns, so we can filter back down to them later
    original_cols = list(df_receiving_votes.columns)
    
    #Add in the "Total Votes column"
    if (allocating_to_all_empty_precs):
        #In cases where every vote is 0, need to set the Total_Votes equal to 1 for proportional allocation
        df_receiving_votes.loc[:,"Total_Votes"]=1
    else:
        df_receiving_votes.loc[:,"Total_Votes"]=0
        for race in column_list:
            df_receiving_votes.loc[:,"Total_Votes"]+=df_receiving_votes.loc[:,race]
    
    #Create the needed dataframes
    precinct_specific_totals = pd.DataFrame(df_receiving_votes.groupby([col_allocating]).sum())
    precinct_specific_totals.reset_index(drop=False,inplace=True)
    to_dole_out_totals = pd.DataFrame(df_allocating.groupby([col_allocating]).sum())
    to_dole_out_totals.reset_index(drop=False,inplace=True)
    
    #Add in total sum check
    sum_dataframe = pd.DataFrame(columns=precinct_specific_totals.columns)
    for i in column_list:
        total_votes = precinct_specific_totals.loc[:,i].sum()+to_dole_out_totals.loc[:,i].sum()
        sum_dataframe.at[0,i]=total_votes.astype(int)
    
    #Check the allocating to empty precincts code
    if (allocating_to_all_empty_precs):
        for i in column_list:
            if(sum(precinct_specific_totals[i])!=0):
                print("Allocating to all empty precincts parameter incorrect")
                break
    
    #Print out any instances where the allocation, as written, won't work
    special_allocation_needed = []
    for index, row in precinct_specific_totals.iterrows():
        for race in column_list:
            if (row[race]==0):
                race_district = row[col_allocating]
                if race_district in to_dole_out_totals[col_allocating].unique():
                    to_allocate = int(to_dole_out_totals.loc[to_dole_out_totals[col_allocating]==race_district][race])
                    if (to_allocate != 0):
                        special_allocation_needed.append([race_district,race])
                        if(row["Total_Votes"]==0):
                            precinct_specific_totals.loc[index,"Total_Votes"]=1
                            col_val = row[col_allocating]
                            df_receiving_votes.loc[df_receiving_votes[col_allocating]==col_val,"Total_Votes"]=1

    #Create some new columns for each of these races to deal with the allocation
    for race in column_list:
        add_var = race+"_add"
        rem_var = race+"_rem"
        floor_var = race+"_floor"
        df_receiving_votes.loc[:,add_var]=0.0
        df_receiving_votes.loc[:,rem_var]=0.0
        df_receiving_votes.loc[:,floor_var]=0.0

    #Iterate over the rows
    #Note this function iterates over the dataframe two times so the rounded vote totals match the totals to allocate
    for index, row in df_receiving_votes.iterrows():
        if row[col_allocating] in to_dole_out_totals[col_allocating].unique():
            for race in column_list:
                add_var = race+"_add"
                rem_var = race+"_rem"
                floor_var = race+"_floor"
                #Grab the district
                county_id = row[col_allocating]
                if [county_id,race] in special_allocation_needed:
                    #Get the denominator for the allocation - the summed "total votes" for precincts in that grouping
                    denom = precinct_specific_totals.loc[precinct_specific_totals[col_allocating]==county_id]["Total_Votes"]
                    #Get one of the numerators, how many districtwide votes to allocate
                    numer = to_dole_out_totals.loc[to_dole_out_totals[col_allocating]==county_id][race]
                    #Get the "total votes" for this particular precinct
                    val = df_receiving_votes.at[index,"Total_Votes"]
                    #Get the vote share, the precincts % of total precinct votes in the district times votes to allocate
                else:
                    #Get the denominator for the allocation (the precinct vote totals)
                    denom = precinct_specific_totals.loc[precinct_specific_totals[col_allocating]==county_id][race]
                    #Get one of the numerators, how many districtwide votes to allocate
                    numer = to_dole_out_totals.loc[to_dole_out_totals[col_allocating]==county_id][race]
                    #Get the vote totals for this race in this precinct
                    val = df_receiving_votes.at[index,race]
                    #Get the vote share, the precincts % of total precinct votes in the district times votes to allocate
                if ((float(denom)==0)):
                    vote_share = 0
                else:
                    vote_share = (float(val)/float(denom))*float(numer)
                df_receiving_votes.at[index,add_var] = vote_share
                #Take the decimal remainder of the allocation
                df_receiving_votes.at[index,rem_var] = vote_share%1
                #Take the floor of the allocation
                df_receiving_votes.at[index,floor_var] = np.floor(vote_share)

    #After the first pass through, get the sums of the races by district to assist in the rounding            
    first_allocation = pd.DataFrame(df_receiving_votes.groupby([col_allocating]).sum())

    #Now we want to iterate district by district to work on rounding
    county_list = list(to_dole_out_totals[col_allocating].unique()) 

    #Iterate over the district
    for county in county_list:
        for race in column_list:
            add_var = race+"_add"
            rem_var = race+"_rem"
            floor_var = race+"_floor"
            #County how many votes still need to be allocated (because we took the floor of all the initial allocations)
            to_go = int(np.round((int(to_dole_out_totals.loc[to_dole_out_totals[col_allocating]==county][race])-first_allocation.loc[first_allocation.index==county,floor_var])))
            #Grab the n precincts with the highest remainders and round these up, where n is the # of votes that still need to be allocated
            for index in df_receiving_votes.loc[df_receiving_votes[col_allocating]==county][rem_var].nlargest(to_go).index:
                df_receiving_votes.at[index,add_var] = np.ceil(df_receiving_votes.at[index,add_var])

    #Iterate over every race again
    for race in column_list:
        add_var = race+"_add"
        #Round every allocation down to not add fractional votes
        df_receiving_votes.loc[:,add_var]=np.floor(df_receiving_votes.loc[:,add_var])
        df_receiving_votes.loc[:,race]+=df_receiving_votes.loc[:,add_var]
        df_receiving_votes.loc[:,race] = df_receiving_votes.loc[:,race].astype(int)
        #Check to make sure all the votes have been allocated
        if ((sum_dataframe.loc[:,race].sum()-df_receiving_votes.loc[:,race].sum()!=0)):
            print("Some issue in allocating votes for:", i)
            
    #Filter down to original columns
    df_receiving_votes = df_receiving_votes[original_cols]

    return df_receiving_votes

In [34]:
pivoted_2020 = allocate_absentee(receiving_votes,allocating_votes,data_columns,"county",allocating_to_all_empty_precs=False)

In [35]:
join_attempt_one = pd.merge(vest_fl_20,pivoted_2020,how="outer",on="pct_std",indicator=True)

In [36]:
join_attempt_one["_merge"].value_counts()

both          5918
left_only       92
right_only      72
Name: _merge, dtype: int64

In [37]:
election_vest_id_changes = pd.read_csv("./election_vest_id_changes.csv")

In [38]:
election_vest_id_changes_dict = dict(zip(election_vest_id_changes["election_ID"],election_vest_id_changes["vest_ID"]))

In [39]:
pivoted_2020["pct_std"] = pivoted_2020["pct_std"].map(election_vest_id_changes_dict).fillna(pivoted_2020["pct_std"])

In [40]:
join_attempt_two = pd.merge(vest_fl_20,pivoted_2020,how="outer",on="pct_std",indicator=True)
join_attempt_two["_merge"].value_counts()

both          5990
left_only       20
right_only       0
Name: _merge, dtype: int64

In [41]:
def precinct_votes_check(merged_df,column_list,vest_on_left,name_col,print_level=0):
    """Checks a merged dataframe with two election results at the precinct level

    Args:
      merged_df: DataFrame with one set of election results joined to another
      column_list: List of races that there are votes for
      vest_on_left: Boolean specifying whether VEST data is on the left side of merged_df
      name_col: String of the column name to refer to precincts when a difference occurs
      print_level: Integer that specifies how large the vote difference in a precinct must be to be printed

    Returns:
      Nothing, only prints out an analysis
    """
    merged_df = merged_df.sort_values(by=[name_col],inplace=False)
    matching_rows = 0
    different_rows = 0
    diff_list=[]
    diff_values = []
    max_diff = 0
    for index,row in merged_df.iterrows():
        same = True
        for i in column_list:
            left_data = i + "_x"
            right_data = i + "_y"
            if ((row[left_data] is None) or (row[right_data] is None) or (np.isnan(row[right_data])or(np.isnan(row[left_data])))):
                print("FIX NaN value at: ", row[name_col])
                return;
            diff = abs(row[left_data]-row[right_data])
            if (diff>0):
                same = False
                diff_values.append(abs(diff))
                if (diff>max_diff):
                    max_diff = diff
            if(diff>print_level):
                if (vest_on_left):
                    print(i, "{:.>72}".format(row[name_col]), "(V)","{:.>5}".format(int(row[left_data]))," (S){:.>5}".format(int(row[right_data])),"(D):{:>5}".format(int(row[left_data]-row[right_data])))                           
                else:
                    print(i, "{:.>72}".format(row[name_col]), "(S)","{:.>5}".format(int(row[left_data]))," (V){:.>5}".format(int(row[right_data])),"(D):{:>5}".format(int(row[left_data]-row[right_data])))
        if(same != True):
            different_rows +=1
            diff_list.append(row[name_col])
        else:
            matching_rows +=1
    print("")
    print("There are ", len(merged_df.index)," total rows")
    print(different_rows," of these rows have election result differences")
    print(matching_rows," of these rows are the same")
    print("")
    print("The max difference between any one shared column in a row is: ", max_diff)
    if(len(diff_values)!=0):
        print("The average difference is: ", str(sum(diff_values)/len(diff_values)))
    count_big_diff = len([i for i in diff_values if i > 10])
    print("There are ", str(count_big_diff), "precinct results with a difference greater than 10")
    print("")
    print("All precincts containing differences:")
    diff_list.sort()
    print(diff_list)

In [42]:
precinct_votes_check(join_attempt_two[join_attempt_two["_merge"]=="both"],data_columns,True,"pct_std",print_level=0)

G20PRERTRU .................................................................MON0005 (V) .1000  (S)..999 (D):    1
G20PREDBID .................................................................MON0005 (V) .1706  (S).1705 (D):    1
G20PRELJOR .................................................................MON0005 (V) ...35  (S)...34 (D):    1
G20PREDBID .................................................................MON0006 (V) ..927  (S)..926 (D):    1
G20PREDBID .................................................................MON0008 (V) ..931  (S)..930 (D):    1
G20PREDBID .................................................................MON0010 (V) ..959  (S)..958 (D):    1
G20PRERTRU .................................................................MON0011 (V) .1213  (S).1212 (D):    1
G20PREDBID .................................................................MON0011 (V) .1101  (S).1100 (D):    1
G20PRERTRU .................................................................MON0012 (V) 

# Shapefiles

2020 doc:
    
Precinct results from several sources.  

4. Collier, Columbia, DeSoto, Duval, Flagler, Gulf, Highlands, Hillsborough, Lake, Lee, Leon, Marion, Martin, Miami-Dade, Okaloosa, Osceola, Palm Beach, Pasco, Pinellas, Polk, Putnam, Sumter, and Volusia come from the counties.
5. Hamilton was drawn based on a geocoded voter registration file.  

### 1. Department of State

Quote from the documentation:

> Alachua, Bay, Bradford, Brevard, Calhoun, Citrus, Clay, Dixie, Escambia, Hardee, Hendry, Hernando, Indian River, Lafayette and Sarasota come from the Department of State.  

In [51]:
ALA_pct=gp.read_file("./raw-from-source/Records_Request/Precinct and Polling Place Files - 2012-present - last checked 7-23-2020/ALA/ALA20121106v6_PctMap/ALA20121106v6_PctMap.shp")
BAY_pct=gp.read_file("./raw-from-source/Records_Request/Precinct and Polling Place Files - 2012-present - last checked 7-23-2020/BAY/BAY20120501_PctMap/CensusPrecinct_region.shp")
BRA_pct=gp.read_file("./raw-from-source/Records_Request/Precinct and Polling Place Files - 2012-present - last checked 7-23-2020/BRA/BRA20140106_PctMapaca38d35-1a03-4780-80bd-aa38a9b297c1/BRA20121106v6_PctMap/BRA20121106v5_PctMap.shp")
BRE_pct=gp.read_file("./raw-from-source/Records_Request/Precinct and Polling Place Files - 2012-present - last checked 7-23-2020/BRE/BRE20200430_Pct1370d2e2-064d-47b4-b963-43ee7731daee/BRE_Pct.shp")
CAL_pct=gp.read_file("./raw-from-source/Records_Request/Precinct and Polling Place Files - 2012-present - last checked 7-23-2020/CAL/CAL20121106v6_PctMap/CAL20121106v6_PctMap.shp")
CIT_pct=gp.read_file("./raw-from-source/Records_Request/Precinct and Polling Place Files - 2012-present - last checked 7-23-2020/CIT/CIT20200228_PctMapFile/PRECINCT2020.shp")
CLA_pct=gp.read_file("./raw-from-source/Records_Request/Precinct and Polling Place Files - 2012-present - last checked 7-23-2020/CLA/CLA20160503_PctMapcc84b5a6-3ade-4dd9-acd8-4c8157bcd343/CLA20160503_PctMap.shp")
DIX_pct=gp.read_file("./raw-from-source/Records_Request/Precinct and Polling Place Files - 2012-present - last checked 7-23-2020/DIX/DIX20121106v6_PctMap/DIX20121106v6_PctMap.shp")
ESC_pct=gp.read_file("./raw-from-source/Records_Request/Precinct and Polling Place Files - 2012-present - last checked 7-23-2020/ESC/Precinct_Map_2018/PRECINCT_MAP_022018.shp")
HAR_pct=gp.read_file("./raw-from-source/Records_Request/Precinct and Polling Place Files - 2012-present - last checked 7-23-2020/HAR/HAR20121106v6_PctMap/HAR20121106v6_PctMap.shp")
HEN_pct=gp.read_file("./raw-from-source/Records_Request/Precinct and Polling Place Files - 2012-present - last checked 7-23-2020/HEN/HEN20121106v6_PctMaped1303a5-dc65-41db-ac5f-a2bc3e07721a/HEN20121106v5_PctMap.shp")
HER_pct=gp.read_file("./raw-from-source/Records_Request/Precinct and Polling Place Files - 2012-present - last checked 7-23-2020/HER/HER20121106v6_PctMap/HER20121106v6_PctMap.shp")
IND_pct=gp.read_file("./raw-from-source/Records_Request/Precinct and Polling Place Files - 2012-present - last checked 7-23-2020/IND/IND20150210_PctMap/IND20150210_PctMap_region.shp")
LAF_pct=gp.read_file("./raw-from-source/Records_Request/Precinct and Polling Place Files - 2012-present - last checked 7-23-2020/LAF/LAF20121106v6_PctMap/LAF20121106v6_PctMap.shp")
SAR_pct=gp.read_file("./raw-from-source/Records_Request/Precinct and Polling Place Files - 2012-present - last checked 7-23-2020/SAR/SAR20140109v6_PctMapf831e47b-44f3-4954-a8e8-052b3a084322/SarasotaCounty_Pcts_051613_region.shp")

In [53]:
CLA_pct=CLA_pct.dissolve(by="PRECINCT")
CLA_pct.reset_index(inplace=True)

In [56]:
ALA_pct['new_county']='ALA'
BAY_pct['new_county']='BAY'
BRA_pct['new_county']='BRA'
BRE_pct['new_county']='BRE'
CAL_pct['new_county']='CAL'
CIT_pct['new_county']='CIT'
CLA_pct['new_county']='CLA'
DIX_pct['new_county']='DIX'
ESC_pct['new_county']='ESC'
HAR_pct['new_county']='HAR'
HEN_pct['new_county']='HEN'
HER_pct['new_county']='HER'
IND_pct['new_county']='IND'
LAF_pct['new_county']='LAF'
SAR_pct['new_county']='SAR'

ALA_pct=ALA_pct.to_crs(vest_fl_20.crs)
BAY_pct=BAY_pct.to_crs(vest_fl_20.crs)
BRA_pct=BRA_pct.to_crs(vest_fl_20.crs)
BRE_pct=BRE_pct.to_crs(vest_fl_20.crs)
CAL_pct=CAL_pct.to_crs(vest_fl_20.crs)
CIT_pct=CIT_pct.to_crs(vest_fl_20.crs)
CLA_pct=CLA_pct.to_crs(vest_fl_20.crs)
DIX_pct=DIX_pct.to_crs(vest_fl_20.crs)
ESC_pct=ESC_pct.to_crs(vest_fl_20.crs)
HAR_pct=HAR_pct.to_crs(vest_fl_20.crs)
HEN_pct=HEN_pct.to_crs(vest_fl_20.crs)
HER_pct=HER_pct.to_crs(vest_fl_20.crs)
IND_pct=IND_pct.to_crs(vest_fl_20.crs)
LAF_pct=LAF_pct.to_crs(vest_fl_20.crs)
SAR_pct=SAR_pct.to_crs(vest_fl_20.crs)


#### Create a list with these counties precincts

In [57]:
fl_2020_shapefiles = [ALA_pct,BAY_pct,BRA_pct,BRE_pct,CAL_pct,CIT_pct,CLA_pct,DIX_pct,ESC_pct,HAR_pct,
HEN_pct,HER_pct,LAF_pct,SAR_pct]

#### Clean up these precinct files

In [58]:
cleaned_fl_list = []
for i in fl_2020_shapefiles:
    i.rename(columns={"PRECINCT":"Precinct","PCT":"Precinct","DISTRICT":"Precinct",
        "PrecMay201":"Precinct","PCT2014":"Precinct",'PRECINCTID':"Precinct"},inplace=True)
    i=i[["Precinct","geometry","new_county"]]
    i.rename(columns={"new_county":"County"},inplace=True)
    cleaned_fl_list.append(i)
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [59]:
for i in cleaned_fl_list:
    if (i["County"].unique()==["BRE"]):
        i.loc[:,"Precinct"] = (i.loc[:,"Precinct"].astype(float)-.1).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


### 2. Orlando Sentinel Precinct Map

> Broward, Gadsden, Gilchrist, Manatee, Nassau, Santa Rosa, St. Johns, St. Lucie, and Union are from the Orlando Sentinel's precinct map for the 2016 presidential primary (http://interactive.orlandosentinel.com/elections/2016/presidential-primary/results/dem.html).

### 3. Census Source

Quote from the documentation:

> Baker, Charlotte, Franklin, Glades, Holmes, Jackson, Jefferson, Levy, Liberty, Madison, Monroe, Okeechobee, Orange, Seminole, Suwannee, Taylor, Wakulla, Walton, and Washinton come from the U.S. Census Bureau's 2020 Redistricting Data Program.

In [43]:
#When downloading from the Census redistricing data program, these use a FIPS code to identify counties

fips_codes = ["12015","12037","12043","12047","12059","12063","12065","12075","12077","12079","12093",
        "12095","12117","12121","12123","12129","12131","12133"]

#Combine all the data from separate files into one
li = []
for i in fips_codes:
    ref = "./raw-from-source/shapefiles/census/partnership_shapefiles_19v2_"
    file_ref = ref+i+"/PVS_19_v2_vtd_"+i+".shp"
    file_prev = gp.read_file(file_ref)
    li.append(file_prev)
shapefiles_census = pd.concat(li, axis=0, ignore_index=True)
#print(shapefiles_census.shape)

In [44]:
#Make a dictionary to convert from FIPS identifier to the 3 character county name
county_code = {'015':"CHA", '037':"FRA", '043':"GLA", '047':"HAM", '059':"HOL", '063':"JAC", '065':"JEF", 
               '075':"LEV", '077':"LIB", '079':"MAD", '093':"OKE", '095':"ORA",
 '117':"SEM", '121':"SUW", '123':"TAY", '129':"WAK", '131':"WAL", '133':"WAS"}

#Create a column with this 3-character county name
shapefiles_census['county_name'] = shapefiles_census['COUNTYFP'].map(county_code)

In [45]:
#Create a unique identifier, looks like "VTDST" is the best column to use for this, see above

#Work on the "NAMELSAD" column to just store this as well
shapefiles_census["NAMELSAD"]= shapefiles_census["NAMELSAD"].str.split("-", n = 1, expand = True)

#Take off the leading zero
shapefiles_census["VTDST"] = shapefiles_census["VTDST"].str.lstrip('0')

#Make sure they are all at least four digits
shapefiles_census["VTDST"] = shapefiles_census["VTDST"].str.zfill(4)
shapefiles_census["NAMELSAD"] = shapefiles_census["NAMELSAD"].str.zfill(4)

#Create the unique identifier
shapefiles_census["Pct_std"]=shapefiles_census["county_name"]+shapefiles_census["VTDST"]
shapefiles_census["alt_Pct_std"]=shapefiles_census["county_name"]+shapefiles_census["NAMELSAD"]

#Confirm that the unique identifier really is unique
print(shapefiles_census["Pct_std"].value_counts())

#Filter the columns to only include 'Pct_std' and geometry
shapefiles_census=shapefiles_census[["Pct_std","geometry"]]

#Edit the CRS, so it matches that of the VEST file
shapefiles_census = shapefiles_census.to_crs(vest_fl_20.crs)

JAC0014    1
SEM0071    1
TAY0010    1
ORA240A    1
WAS0019    1
          ..
ORA0403    1
SEM0020    1
CHA0002    1
SEM0078    1
ORA0223    1
Name: Pct_std, Length: 673, dtype: int64


In [47]:
#Filter down the 2018 election results to the relevant counties where shapefiles are from the Census Redist. Data Program
#(These are the only ones with a chance of matching)
census_counties = ["CHA","FRA","GLA","HAM","HOL","JAC","JEF","LEV","LIB","MAD","OKE","ORA","SEM","SUW","TAY","WAK","WAL","WAS"]
election_census=pivoted_2020[pivoted_2020["county"].isin(census_counties)]

### 4. Counties Themselves (this is still from 2018 work)

> Collier, Columbia, DeSoto, Duval, Flagler, Gulf, Highlands, Hillsborough, Lake, Lee, Leon, Marion, Martin, Miami-Dade, Okaloosa, Osceola, Palm Beach, Pasco, Pinellas, Polk, Putnam, Sumter, and Volusia come from the counties.

What I was able to find:

- Collier - (downloaded)    
- Flagler - (downloaded) 
- Highlands - (not found)  
- Hillsborough - (downloaded)   
- Lake - (not found)  
- Lee - (downloaded) 
- Leon - (downloaded)  
- Miami-Dade  - (downloaded)   
- Okaloosa  - (downloaded)  
- Osceola  -  (couldn't find)  
- Palm Beach  -  (downloaded)  
- Pasco  -  (downloaded) 
- Sumter  - (couldn't find)  
- Volusia  -  (couldn't find)  
- Columbia -  (couldn't find)   
- Duval -  (couldn't find)  


#### Load all the files

In [None]:
#For Collier, I had to load the KML into Google Earth Pro and then export it to get the precinct labels to show up (see report)
shapefiles_collier = gp.read_file("./raw-from-source/shapefiles/counties/Collier/Precinct Boundaries_collier.kml")
flagler_pcts = gp.read_file("./raw-from-source/shapefiles/counties/Flagler/2018-02-05/VTDBLK_1_region.shp")
shapefiles_hillsborough = gp.read_file("./raw-from-source/shapefiles/counties/Hillsborough/2017ShapeFiles/PRECINCT12057_region.shp")
shapefiles_lee = gp.read_file("./raw-from-source/shapefiles/counties/Lee/Lee County Precincts.kml",driver='KML',split="<br>")
shapefiles_leon = gp.read_file("./raw-from-source/shapefiles/counties/Leon/Election_Precincts_-_Leon_County-shp/Election_Precincts_-_Leon_County.shp")
shapefiles_miami = gp.read_file("./raw-from-source/shapefiles/counties/Miami/Miami_Precinct-shp/Precinct.shp")
shapefiles_okaloosa = gp.read_file("./raw-from-source/shapefiles/counties/Okaloosa/Voting/precinct.shp")
shapefiles_palm = gp.read_file("./raw-from-source/shapefiles/counties/Palm/Palm Beach SOE Shapefiles 2021/Precincts 2021.shp")
shapefiles_pasco = gp.read_file("./raw-from-source/shapefiles/counties/Pasco/Pasco VotingPrecincts/VotingPrecincts_12112020.shp")

#### Collier County

In [None]:
#Take a look at the file
#print(shapefiles_collier.head())

#The "Name" column seems like the best one to use, but needs to be cleaned
#print(shapefiles_collier["Name"])
shapefiles_collier["Name"] = shapefiles_collier["Name"].str.replace('Precinct ','')

#Make sure the name column has at least four characters
shapefiles_collier["Name"]= shapefiles_collier["Name"].str.zfill(4)

#Create the unique identifier
shapefiles_collier["Pct_std"]="CLL"+shapefiles_collier["Name"]

#Filter the columns to only include 'Pct_std' and geometry
shapefiles_collier=shapefiles_collier[["Pct_std","geometry"]]

#Edit the CRS, so it matches that of the VEST file
shapefiles_collier = shapefiles_collier.to_crs(vest_proj)

#Filter down the election results
election_collier = pivoted_2018[pivoted_2018["County"]=="CLL"]

#Merge data
merged_data_collier = pd.merge(election_collier,shapefiles_collier,on=['Pct_std'],how='outer',indicator=True)
collier_elections_only = merged_data_collier[merged_data_collier["_merge"]=="left_only"]['Pct_std']
collier_shapefile_only = merged_data_collier[merged_data_collier["_merge"]=="right_only"]['Pct_std']
collier_both = merged_data_collier[merged_data_collier["_merge"]=="both"]['Pct_std']

#Print diffrences
print("There are " + str(collier_elections_only.count()) + " precincts that only appear in the election results")
print("There are " + str(collier_shapefile_only.count()) + " precincts that only appear in the shapefile")
print("There are " + str(collier_both.count()) + " precincts that were matched between the two files")


In [None]:
merged_data_collier.loc[merged_data_collier['_merge'] != 'both'][['County','Pct_std','_merge']]\
    .sample(3).sort_values(by=['_merge','County','Pct_std'])

#### Flagler County

In [None]:
#Take a look at the shapefile
#print(flagler_pcts.head())
#print(flagler_pcts.shape)

#"PRECINCT" is the right column to use, but it isn't unique (needs to be grouped by this)
#print(flagler_pcts["PRECINCT"].value_counts())

#Group by precinct # and reset index
shapefiles_flagler = flagler_pcts.dissolve(by="PRECINCT")
shapefiles_flagler = shapefiles_flagler.reset_index()

#Edit the precinct column to contain at least 4 character
shapefiles_flagler["PRECINCT"]= shapefiles_flagler["PRECINCT"].str.zfill(4)

#Create a unique identifer
shapefiles_flagler["Pct_std"]="FLA"+shapefiles_flagler["PRECINCT"]

#Filter the columns to only include 'Pct_std' and geometry
shapefiles_flagler=shapefiles_flagler[["Pct_std","geometry"]]

#Edit the CRS, so it matches that of the VEST file
shapefiles_flagler = shapefiles_flagler.to_crs(vest_proj)

#Check this looks okay
#print(shapefiles_flagler.head())

#Filter down the election results
election_flagler = pivoted_2018[pivoted_2018["County"]=="FLA"]

#Merge data
merged_data_flagler = pd.merge(election_flagler,shapefiles_flagler,on=['Pct_std'],how='outer',indicator=True)
flagler_elections_only = merged_data_flagler[merged_data_flagler["_merge"]=="left_only"]['Pct_std']
flagler_shapefile_only = merged_data_flagler[merged_data_flagler["_merge"]=="right_only"]['Pct_std']
flagler_both = merged_data_flagler[merged_data_flagler["_merge"]=="both"]['Pct_std']

#Print diffrences
print("There are " + str(flagler_elections_only.count()) + " precincts that only appear in the election results")
print("There are " + str(flagler_shapefile_only.count()) + " precincts that only appear in the shapefile")
print("There are " + str(flagler_both.count()) + " precincts that were matched between the two files")

In [None]:
merged_data_flagler.loc[merged_data_flagler['_merge'] != 'both'][['County','Pct_std','_merge']]\
    .sample(2).sort_values(by=['_merge','County','Pct_std'])

#### Hillsborough County

In [None]:
#Take a look
#print(shapefiles_hillsborough.head())

#Use the "PRECINCT" column
#print(shapefiles_hillsborough["PRECINCT"].shape)

#Edit the precinct column to contain at least 4 character
shapefiles_hillsborough["PRECINCT"]= shapefiles_hillsborough["PRECINCT"].str.zfill(4)

#Create the unique identifer column
shapefiles_hillsborough["Pct_std"]="HIL"+shapefiles_hillsborough["PRECINCT"]

#Filter the columns to only include 'Pct_std' and geometry
shapefiles_hillsborough=shapefiles_hillsborough[["Pct_std","geometry"]]

#Edit the CRS, so it matches that of the VEST file
shapefiles_hillsborough = shapefiles_hillsborough.to_crs(vest_proj)

#Filter down the election results
election_hillsborough = pivoted_2018[pivoted_2018["County"]=="HIL"]

#Merge data
merged_data_hillsborough = pd.merge(election_hillsborough,shapefiles_hillsborough,on=['Pct_std'],how='outer',indicator=True)
hillsborough_elections_only = merged_data_hillsborough[merged_data_hillsborough["_merge"]=="left_only"]['Pct_std']
hillsborough_shapefile_only = merged_data_hillsborough[merged_data_hillsborough["_merge"]=="right_only"]['Pct_std']
hillsborough_both = merged_data_hillsborough[merged_data_hillsborough["_merge"]=="both"]['Pct_std']

#Print diffrences
print("There are " + str(hillsborough_elections_only.count()) + " precincts that only appear in the election results")
print("There are " + str(hillsborough_shapefile_only.count()) + " precincts that only appear in the shapefile")
print("There are " + str(hillsborough_both.count()) + " precincts that were matched between the two files")

#### Lee County

In [None]:
#Take a look
#print(shapefiles_lee.head())

#From examining the map, only concerned with those that don't start with "()", as those designate a voting place, not an entire precinct
shapefiles_lee["First_char"] = shapefiles_lee["Name"].astype(str).str[0]=="("
shapefiles_lee = shapefiles_lee[shapefiles_lee["First_char"]==False]

#Now can use the "Name" column to create the unique identifier
shapefiles_lee["Name"]= shapefiles_lee["Name"].str.zfill(4)

#Create the unique identifer column
shapefiles_lee["Pct_std"]="LEE"+shapefiles_lee["Name"]

#Filter the columns to only include 'Pct_std' and geometry
shapefiles_lee=shapefiles_lee[["Pct_std","geometry"]]

#Edit the CRS, so it matches that of the VEST file
shapefiles_lee = shapefiles_lee.to_crs(vest_proj)

#Filter down the election results
election_lee = pivoted_2018[pivoted_2018["County"]=="LEE"]

#Merge data
merged_data_lee = pd.merge(election_lee,shapefiles_lee,on=['Pct_std'],how='outer',indicator=True)
lee_elections_only = merged_data_lee[merged_data_lee["_merge"]=="left_only"]['Pct_std']
lee_shapefile_only = merged_data_lee[merged_data_lee["_merge"]=="right_only"]['Pct_std']
lee_both = merged_data_lee[merged_data_lee["_merge"]=="both"]['Pct_std']

#Print diffrences
print("There are " + str(lee_elections_only.count()) + " precincts that only appear in the election results")
print("There are " + str(lee_shapefile_only.count()) + " precincts that only appear in the shapefile")
print("There are " + str(lee_both.count()) + " precincts that were matched between the two files")

#### Leon County

In [None]:
#Take a look at the file
#print(shapefiles_leon.head())

#Edit the precinct column to contain at least 4 character
shapefiles_leon["PRECINCT"]= shapefiles_leon["PRECINCT"].str.zfill(4)

#Create the unique identifer column
shapefiles_leon["Pct_std"]="LEO"+shapefiles_leon["PRECINCT"]

#Filter the columns to only include 'Pct_std' and geometry
shapefiles_leon=shapefiles_leon[["Pct_std","geometry"]]

#Edit the CRS, so it matches that of the VEST file
shapefiles_leon = shapefiles_leon.to_crs(vest_proj)

#Filter down the election results
election_leon = pivoted_2018[pivoted_2018["County"]=="LEO"]

#Merge data
merged_data_leon = pd.merge(election_leon,shapefiles_leon,on=['Pct_std'],how='outer',indicator=True)
leon_elections_only = merged_data_leon[merged_data_leon["_merge"]=="left_only"]['Pct_std']
leon_shapefile_only = merged_data_leon[merged_data_leon["_merge"]=="right_only"]['Pct_std']
leon_both = merged_data_leon[merged_data_leon["_merge"]=="both"]['Pct_std']

#Print diffrences
print("There are " + str(leon_elections_only.count()) + " precincts that only appear in the election results")
print("There are " + str(leon_shapefile_only.count()) + " precincts that only appear in the shapefile")
print("There are " + str(leon_both.count()) + " precincts that were matched between the two files")

In [None]:
merged_data_leon.loc[merged_data_leon['_merge'] != 'both'][['County','Pct_std','_merge']]\
    .sample(9).sort_values(by=['_merge','County','Pct_std'])

#### Miami-Dade County

In [None]:
#Take a look, looks like "ID" is an okay column to use
#print(shapefiles_miami.head())

#Convert the ID column to contain at least 4 character
shapefiles_miami["ID"] = shapefiles_miami["ID"].apply(str)
shapefiles_miami["ID"] = shapefiles_miami["ID"].str.zfill(4)

#Create the unique identifer column
shapefiles_miami["Pct_std"]="DAD"+shapefiles_miami["ID"]

#Filter the columns to only include 'Pct_std' and geometry
shapefiles_miami=shapefiles_miami[["Pct_std","geometry"]]

#Edit the CRS, so it matches that of the VEST file
shapefiles_miami = shapefiles_miami.to_crs(vest_proj)

#Filter down the election results
election_miami = pivoted_2018[pivoted_2018["County"]=="DAD"]

#Merge data
merged_data_miami = pd.merge(election_miami,shapefiles_miami,on=['Pct_std'],how='outer',indicator=True)
miami_elections_only = merged_data_miami[merged_data_miami["_merge"]=="left_only"]['Pct_std']
miami_shapefile_only = merged_data_miami[merged_data_miami["_merge"]=="right_only"]['Pct_std']
miami_both = merged_data_miami[merged_data_miami["_merge"]=="both"]['Pct_std']

#Print diffrences
print("There are " + str(miami_elections_only.count()) + " precincts that only appear in the election results")
print("There are " + str(miami_shapefile_only.count()) + " precincts that only appear in the shapefile")
print("There are " + str(miami_both.count()) + " precincts that were matched between the two files")

#### Okaloosa County

In [None]:
#Take a look
#print(shapefiles_okaloosa.head())

#Looks like "NO" is the right column, convert to string and edit to contain at least 4 characters
shapefiles_okaloosa["NO"] = shapefiles_okaloosa["NO"].apply(str)
shapefiles_okaloosa["NO"] = shapefiles_okaloosa["NO"].str.zfill(4)

#Create the unique identifer column
shapefiles_okaloosa["Pct_std"]="OKA"+shapefiles_okaloosa["NO"]

#Filter the columns to only include 'Pct_std' and geometry
shapefiles_okaloosa=shapefiles_okaloosa[["Pct_std","geometry"]]

#Edit the CRS, so it matches that of the VEST file
shapefiles_okaloosa = shapefiles_okaloosa.to_crs(vest_proj)

#Filter down the election results
election_okaloosa = pivoted_2018[pivoted_2018["County"]=="OKA"]

#Merge data
merged_data_okaloosa = pd.merge(election_okaloosa,shapefiles_okaloosa,on=['Pct_std'],how='outer',indicator=True)
okaloosa_elections_only = merged_data_okaloosa[merged_data_okaloosa["_merge"]=="left_only"]['Pct_std']
okaloosa_shapefile_only = merged_data_okaloosa[merged_data_okaloosa["_merge"]=="right_only"]['Pct_std']
okaloosa_both = merged_data_okaloosa[merged_data_okaloosa["_merge"]=="both"]['Pct_std']

#Print diffrences
print("There are " + str(okaloosa_elections_only.count()) + " precincts that only appear in the election results")
print("There are " + str(okaloosa_shapefile_only.count()) + " precincts that only appear in the shapefile")
print("There are " + str(okaloosa_both.count()) + " precincts that were matched between the two files")

#### Palm Beach County

In [None]:
#Take a look, looks like "PRECINCT" is the column to use
#print(shapefiles_palm.head())

#Edit the precinct column to contain at least 4 character
shapefiles_palm["PRECINCT"]= shapefiles_palm["PRECINCT"].str.zfill(4)

#Create the unique identifer column
shapefiles_palm["Pct_std"]="PAL"+shapefiles_palm["PRECINCT"]

#Filter the columns to only include 'Pct_std' and geometry
shapefiles_palm=shapefiles_palm[["Pct_std","geometry"]]

#Edit the CRS, so it matches that of the VEST file
shapefiles_palm = shapefiles_palm.to_crs(vest_proj)

#Filter down the election results
election_palm = pivoted_2018[pivoted_2018["County"]=="PAL"]

#Merge data
merged_data_palm = pd.merge(election_palm,shapefiles_palm,on=['Pct_std'],how='outer',indicator=True)
palm_elections_only = merged_data_palm[merged_data_palm["_merge"]=="left_only"]['Pct_std']
palm_shapefile_only = merged_data_palm[merged_data_palm["_merge"]=="right_only"]['Pct_std']
palm_both = merged_data_palm[merged_data_palm["_merge"]=="both"]['Pct_std']

#Print diffrences
print("There are " + str(palm_elections_only.count()) + " precincts that only appear in the election results")
print("There are " + str(palm_shapefile_only.count()) + " precincts that only appear in the shapefile")
print("There are " + str(palm_both.count()) + " precincts that were matched between the two files")

In [None]:
merged_data_palm.loc[merged_data_palm['_merge'] != 'both'][['County','Pct_std','_merge']]\
    .sample(9).sort_values(by=['_merge','County','Pct_std'])

#### Pasco County

In [None]:
#Take a look
#print(shapefiles_pasco.head())

#Seems like "Precinct" is the right column to use and that it is a unique value
#print(shapefiles_pasco["Precinct"].value_counts())

#Edit the precinct column to contain at least 4 character
shapefiles_pasco["Precinct"]= shapefiles_pasco["Precinct"].str.zfill(4)

#Create the unique identifer column
shapefiles_pasco["Pct_std"]="PAS"+shapefiles_pasco["Precinct"]

#Filter the columns to only include 'Pct_std' and geometry
shapefiles_pasco=shapefiles_pasco[["Pct_std","geometry"]]

#Take out that doesn't have any geometry (causing an issue)
shapefiles_pasco= shapefiles_pasco.drop([99])

#Edit the CRS, so it matches that of the VEST file
shapefiles_pasco = shapefiles_pasco.to_crs(vest_proj)

#Filter down the election results
election_pasco = pivoted_2018[pivoted_2018["County"]=="PAS"]

#Merge data
merged_data_pasco = pd.merge(election_pasco,shapefiles_pasco,on=['Pct_std'],how='outer',indicator=True)
pasco_elections_only = merged_data_pasco[merged_data_pasco["_merge"]=="left_only"]['Pct_std']
pasco_shapefile_only = merged_data_pasco[merged_data_pasco["_merge"]=="right_only"]['Pct_std']
pasco_both = merged_data_pasco[merged_data_pasco["_merge"]=="both"]['Pct_std']

#Print diffrences
print("There are " + str(pasco_elections_only.count()) + " precincts that only appear in the election results")
print("There are " + str(pasco_shapefile_only.count()) + " precincts that only appear in the shapefile")
print("There are " + str(pasco_both.count()) + " precincts that were matched between the two files")

In [None]:
merged_data_pasco.loc[merged_data_pasco['_merge'] != 'both'][['County','Pct_std','_merge']]\
    .head(10).sort_values(by=['_merge','County','Pct_std'])

### 5. Geocoded Registration File

> Hamilton was drawn based on a geocoded voter registration file.  

Note: We are not able to create this file without more information