# Florida 2022 Election Results Processing
Precinct-level data from [Florida Division of Elections](https://dos.myflorida.com/elections/data-statistics/elections-data/precinct-level-election-results/)

County-level data to run checks from [Florida Department of State Election Archive](https://results.elections.myflorida.com/Index.asp?ElectionDate=8/23/2022&DATAMODE=)

Precinct-level data for Walton County added from [Walton County Supervisor of Elections](https://enr.electionsfl.org/WAL/3248/Reports/)
<br>\----

Code based on [similar processing](https://github.com/nonpartisan-redistricting-datahub/pber_collection/blob/main/FL/2022/fl_2022_results.ipynb) of 2022 FL General Election results by Peter Horton

## Setup

Import libraries, load in original precinct-level data, set directories

In [1]:
import pandas as pd
import os
import numpy as np
import re

#Stop warnings about future changes to libraries from crowding output
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
#Set the working directory
project_folder = '/Users/grantschwab/Desktop/RDH/Projects/florida_primary2022/update'
os.chdir(project_folder)

wd = os.getcwd()

In [3]:
#This Data is from the FL department of state, and can only be downloaded county by county
all_files = os.listdir('./raw-from-source/2022-pri-outputofficial')

#Looping through files to import, excluding recounts
li = []
for i in all_files:
    if i not in [".DS_Store",
                 'CLA_PctResults20220823_Recount.txt',
                 'HIL_PctResults20220823_Recount.txt',
                 'NAS_PctResults20220823_Recount.txt',
                 'ORA_PctResults20220823_Recount.txt']:
        ref = './raw-from-source/2022-pri-outputofficial/'
        file_ref = ref+i
        file_prev = pd.read_csv(file_ref,sep="\t",engine='python',index_col=None, header=None, dtype = str)
        li.append(file_prev)
frame = pd.concat(li, axis=0, ignore_index=True)
print(frame.shape)

(363392, 19)


In [4]:
#See data
frame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,HOL,Holmes,,08/23/2022,2022 Primary Election,1,Ponce de Leon,1234,0,252,0,United States Senator,,120002,Ricardo De La Fuente,DEM,0,82512,4
1,HOL,Holmes,,08/23/2022,2022 Primary Election,2,Pine Log,1180,0,229,0,United States Senator,,120002,Ricardo De La Fuente,DEM,0,82512,2
2,HOL,Holmes,,08/23/2022,2022 Primary Election,3,New Hope,946,0,224,0,United States Senator,,120002,Ricardo De La Fuente,DEM,0,82512,6
3,HOL,Holmes,,08/23/2022,2022 Primary Election,4,Bethlehem,1582,0,297,0,United States Senator,,120002,Ricardo De La Fuente,DEM,0,82512,4
4,HOL,Holmes,,08/23/2022,2022 Primary Election,5,Gritney,623,0,117,0,United States Senator,,120002,Ricardo De La Fuente,DEM,0,82512,1


## Process original precinct-level results

#### Initial cleaning

In [5]:
#Rename the columns in the file

col_rename_dict = {0: 'County Code (Three-character abbreviation)',
 1: 'County Name',
 2: 'Election Number',
 3: 'Election Date',
 4: 'Election Name',
 5: 'Precinct',
 6: 'Precinct Polling Location',
 7: 'Total Registered Voters',
 8: 'Total Registered Republicans', #This seems to be missing or underpopulated
 9: 'Total Registered Democrats',
 10: 'Total Registered All Other Parties',
 11: 'Contest Name',
 12: 'District',
 13: 'Contest Code (Florida’s 6 digit contest codes)',
 14: 'Candidate/Retention/IssueName/WriteInsCast/OverVotes/UnderVotes',
 15: 'Candidate Party (abbreviation)',
 16: 'Candidate Florida Voter Registration System ID Number',
 17: 'DOE Assigned Candidate Number or Retention/Issue Number',
 18: 'Vote Total'}

In [6]:
#Rename the columns and clean the dataframe
frame.rename(columns = col_rename_dict, inplace = True)
frame['Vote Total'] = frame['Vote Total'].fillna(0)
frame['Vote Total'] = frame['Vote Total'].astype(int)

#Set the Precinct to the Precinct Polling Location, where the Unique Identifier is "N/A"
frame["Precinct"] = np.where(frame["Precinct"].isna(), frame["Precinct Polling Location"], frame["Precinct"])

In [7]:
display(frame.head())
frame['Contest Name'].unique()

Unnamed: 0,County Code (Three-character abbreviation),County Name,Election Number,Election Date,Election Name,Precinct,Precinct Polling Location,Total Registered Voters,Total Registered Republicans,Total Registered Democrats,Total Registered All Other Parties,Contest Name,District,Contest Code (Florida’s 6 digit contest codes),Candidate/Retention/IssueName/WriteInsCast/OverVotes/UnderVotes,Candidate Party (abbreviation),Candidate Florida Voter Registration System ID Number,DOE Assigned Candidate Number or Retention/Issue Number,Vote Total
0,HOL,Holmes,,08/23/2022,2022 Primary Election,1,Ponce de Leon,1234,0,252,0,United States Senator,,120002,Ricardo De La Fuente,DEM,0,82512,4
1,HOL,Holmes,,08/23/2022,2022 Primary Election,2,Pine Log,1180,0,229,0,United States Senator,,120002,Ricardo De La Fuente,DEM,0,82512,2
2,HOL,Holmes,,08/23/2022,2022 Primary Election,3,New Hope,946,0,224,0,United States Senator,,120002,Ricardo De La Fuente,DEM,0,82512,6
3,HOL,Holmes,,08/23/2022,2022 Primary Election,4,Bethlehem,1582,0,297,0,United States Senator,,120002,Ricardo De La Fuente,DEM,0,82512,4
4,HOL,Holmes,,08/23/2022,2022 Primary Election,5,Gritney,623,0,117,0,United States Senator,,120002,Ricardo De La Fuente,DEM,0,82512,1


array(['United States Senator', 'Governor', 'Attorney General',
       'Commissioner of Agriculture', 'State Senator',
       'State Representative', 'County Commissioner', 'School Board',
       'Representative in Congress', 'Circuit Judge',
       'Property Appraiser', 'County Court Judge',
       'Supervisor of Elections', 'Board of County Commissioners',
       'Port of Palm Beach', 'Indian Trail Improvement District',
       'Sarasota County Charter Review Board',
       'Sarasota County Public Hospital Board',
       'Ocean Highway and Port Authority',
       'Clerk of the Circuit Court and Comptroller',
       'North Lake County Hospital District', 'County Mayor',
       'Hendry County Hospital Authority', 'County Council',
       'Charlotte County Airport Authority'], dtype=object)

In [8]:
#Helpful for making informed assessment of which races to include in file
unique_precincts = list(frame['Precinct'].unique())
frame['Contest Name'] = frame['Contest Name'].astype(str)
all_contests = sorted(list(frame['Contest Name'].unique()))
len_all_contests = len(all_contests)
listofzeros = [0] * len_all_contests
num_precs = len(unique_precincts)
contests_dict = dict(zip(all_contests,listofzeros))
in_all_precincts = []
for i in unique_precincts:
    sub_df = frame[frame['Precinct']==i]
    contests = list(sub_df['Contest Name'].unique())
    for contest in contests:
        contests_dict[contest] = int(contests_dict.get(contest))+1
contests_keep_guess =[]
for k,v in contests_dict.items():
    if v>(num_precs*.9):
        print(k,'\t',v)
        contests_keep_guess.append(k)
print(contests_keep_guess)

Attorney General 	 2942
Circuit Judge 	 2875
Commissioner of Agriculture 	 2942
County Court Judge 	 2836
Governor 	 2942
Representative in Congress 	 2942
United States Senator 	 2943
['Attorney General', 'Circuit Judge', 'Commissioner of Agriculture', 'County Court Judge', 'Governor', 'Representative in Congress', 'United States Senator']


In [9]:
contests_keep = ['United States Senator', 'Representative in Congress', 'United States Representative', 'Governor','Attorney General',
                 'Commissioner of Agriculture','State Representative', 'State Senator']
print(contests_keep)

['United States Senator', 'Representative in Congress', 'United States Representative', 'Governor', 'Attorney General', 'Commissioner of Agriculture', 'State Representative', 'State Senator']


In [10]:
#Filter to only include results for contest types with statewide reach
frame = frame[frame['Contest Name'].isin(contests_keep)]

#Filter out the OverVotes and UnderVotes
filtered_frame = frame[~frame['Candidate/Retention/IssueName/WriteInsCast/OverVotes/UnderVotes'].isin(['OverVotes', 'UnderVotes'])]

#Clean up the precinct column
filtered_frame['Precinct Polling Location'] = filtered_frame['Precinct Polling Location'].astype(str)

#Make the precinct column at least 4 digits
filtered_frame["modified_pre"] = filtered_frame["Precinct"].astype(str).str.zfill(4)

#Make a column with the 3 letter county code and the precincts
filtered_frame["pct_std"]=filtered_frame['County Code (Three-character abbreviation)']+"-"+filtered_frame["modified_pre"]

filtered_frame.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_frame['Precinct Polling Location'] = filtered_frame['Precinct Polling Location'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_frame["modified_pre"] = filtered_frame["Precinct"].astype(str).str.zfill(4)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_frame["pc

Unnamed: 0,County Code (Three-character abbreviation),County Name,Election Number,Election Date,Election Name,Precinct,Precinct Polling Location,Total Registered Voters,Total Registered Republicans,Total Registered Democrats,...,Contest Name,District,Contest Code (Florida’s 6 digit contest codes),Candidate/Retention/IssueName/WriteInsCast/OverVotes/UnderVotes,Candidate Party (abbreviation),Candidate Florida Voter Registration System ID Number,DOE Assigned Candidate Number or Retention/Issue Number,Vote Total,modified_pre,pct_std
0,HOL,Holmes,,08/23/2022,2022 Primary Election,1,Ponce de Leon,1234,0,252,...,United States Senator,,120002,Ricardo De La Fuente,DEM,0,82512,4,1,HOL-0001
1,HOL,Holmes,,08/23/2022,2022 Primary Election,2,Pine Log,1180,0,229,...,United States Senator,,120002,Ricardo De La Fuente,DEM,0,82512,2,2,HOL-0002
2,HOL,Holmes,,08/23/2022,2022 Primary Election,3,New Hope,946,0,224,...,United States Senator,,120002,Ricardo De La Fuente,DEM,0,82512,6,3,HOL-0003
3,HOL,Holmes,,08/23/2022,2022 Primary Election,4,Bethlehem,1582,0,297,...,United States Senator,,120002,Ricardo De La Fuente,DEM,0,82512,4,4,HOL-0004
4,HOL,Holmes,,08/23/2022,2022 Primary Election,5,Gritney,623,0,117,...,United States Senator,,120002,Ricardo De La Fuente,DEM,0,82512,1,5,HOL-0005


In [11]:
#Cast the contest name and code columns to string to make the pivot col
filtered_frame["Contest Name"] = filtered_frame["Contest Name"].astype(str)
filtered_frame["Candidate/Retention/IssueName/WriteInsCast/OverVotes/UnderVotes"] = filtered_frame["Candidate/Retention/IssueName/WriteInsCast/OverVotes/UnderVotes"].astype(str)

#Fill in the blank candidate party with "WRI"
filtered_frame['Candidate Party (abbreviation)'] = filtered_frame['Candidate Party (abbreviation)'].map({' ':'WRI'}).fillna(filtered_frame['Candidate Party (abbreviation)'])

#Create the pivot column with the contest name and the candidate
filtered_frame["pivot_col"] = filtered_frame["Contest Name"] + "-:-" + filtered_frame["Candidate/Retention/IssueName/WriteInsCast/OverVotes/UnderVotes"] + "-:-" + "PARTY:" +filtered_frame['Candidate Party (abbreviation)']

#Where it exists, add in the district to the pivot column
filtered_frame["pivot_col"] = np.where(filtered_frame["District"]!=" ",filtered_frame["pivot_col"] + "-:-" + filtered_frame["District"], filtered_frame["pivot_col"])

#Removing all of the specific party registration data, as it looks like the state doesn't have that
for val in ['Total Registered Republicans','Total Registered Democrats', 'Total Registered All Other Parties']:
    print(filtered_frame[val].unique())

filtered_frame.drop(['Election Number', 'Election Date', 'Total Registered Republicans','Total Registered Democrats', 'Total Registered All Other Parties','Candidate Florida Voter Registration System ID Number','DOE Assigned Candidate Number or Retention/Issue Number'], axis = 1, inplace = True)

['0' '777' '808' ... '1703' '2896' '3401']
['252' '229' '224' ... '2069' '2154' '218']
['0']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_frame["Contest Name"] = filtered_frame["Contest Name"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_frame["Candidate/Retention/IssueName/WriteInsCast/OverVotes/UnderVotes"] = filtered_frame["Candidate/Retention/IssueName/WriteInsCast/OverVotes/UnderVotes"].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guid

In [12]:
#Checking how many write-in candidates there are
print(filtered_frame[filtered_frame['Candidate Party (abbreviation)'] == 'WRI']['Candidate/Retention/IssueName/WriteInsCast/OverVotes/UnderVotes'].unique())

#Checking how many parties overall
filtered_frame['Candidate Party (abbreviation)'].unique()

[]


array(['DEM', 'REP'], dtype=object)

#### Data pivot, more cleaning

In [13]:
#Pivot the data so that each row has all the results from that precinct
pivoted_2022 = pd.pivot_table(filtered_frame, values=['Vote Total'], index=["pct_std",'County Code (Three-character abbreviation)','County Name',"Precinct","Precinct Polling Location"],columns=['pivot_col'],aggfunc=sum)

#Clean up the indices
pivoted_2022.reset_index(inplace=True,drop=False)
pivoted_2022.columns = [' '.join(col).strip() for col in pivoted_2022.columns.values]
pivoted_2022.columns = pivoted_2022.columns.str.replace("Vote Total ","")
pivoted_2022 = pivoted_2022.fillna(0)

#Remove single quotes from all column names, part of cleaning candidate surnames like O'Brien
pivoted_2022.columns = [col.replace("'", "") for col in pivoted_2022.columns]

pivoted_2022.head()

Unnamed: 0,pct_std,County Code (Three-character abbreviation),County Name,Precinct,Precinct Polling Location,Attorney General-:-Aramis Ayala-:-PARTY:DEM,Attorney General-:-Daniel Uhlfelder-:-PARTY:DEM,Attorney General-:-Jim Lewis-:-PARTY:DEM,Commissioner of Agriculture-:-J. R. Gaillot-:-PARTY:DEM,Commissioner of Agriculture-:-James W. Shaw-:-PARTY:REP,...,State Senator-:-Richard Paul Dembinsky-:-PARTY:DEM-:- District 8,"State Senator-:-Shevrin ""Shev"" Jones-:-PARTY:DEM-:- District 34",State Senator-:-Steve Byers-:-PARTY:REP-:- District 26,State Senator-:-Tracie Davis-:-PARTY:DEM-:- District 5,State Senator-:-Travis Hutson-:-PARTY:REP-:- District 7,State Senator-:-William Wheelen-:-PARTY:REP-:- District 26,United States Senator-:-Brian Rush-:-PARTY:DEM,United States Senator-:-Ricardo De La Fuente-:-PARTY:DEM,United States Senator-:-Val Demings-:-PARTY:DEM,United States Senator-:-William Sanchez-:-PARTY:DEM
0,ALA-0001,ALA,Alachua,1,First Baptist Church of Waldo,54.0,49.0,51.0,41.0,87.0,...,0.0,0.0,0.0,0.0,0.0,0.0,20.0,11.0,120.0,10.0
1,ALA-0002,ALA,Alachua,2,LaCrosse Town Hall,186.0,60.0,90.0,98.0,99.0,...,0.0,0.0,0.0,0.0,0.0,0.0,52.0,12.0,263.0,22.0
2,ALA-0003,ALA,Alachua,3,Legacy Park,198.0,133.0,93.0,114.0,184.0,...,0.0,0.0,0.0,0.0,0.0,0.0,44.0,11.0,352.0,40.0
3,ALA-0004,ALA,Alachua,4,Alachua County Agr. and Equestrian Center,149.0,86.0,80.0,88.0,267.0,...,0.0,0.0,0.0,0.0,0.0,0.0,50.0,14.0,240.0,29.0
4,ALA-0005,ALA,Alachua,5,First Lutheran Church,213.0,210.0,51.0,157.0,19.0,...,0.0,0.0,0.0,0.0,0.0,0.0,18.0,12.0,439.0,36.0


#### Load helper functions

Useful for column renaming and dictionary creation

In [14]:
#Setting up function for assigning legislative abbreviations
def get_level_dist(column_name):
    zfill_level = 2
    if "Representative in Congress" in column_name or "United States Representative" in column_name:
        level = "CON"
    elif "State Senator" in column_name:
        level = "SU"
    elif "State Representative" in column_name:
        level = "SL"
        zfill_level = 3
    else:
        raise ValueError
    return_val = re.findall("District \S*",column_name)
    if (len(return_val)!=0):
        dist = return_val[0].split(" ")[1]
        dist = dist.zfill(zfill_level)
    else:
        raise ValueError
    return level,dist

In [15]:
#Set up contest level function
def get_race(contest):
    if "President" in contest:
        level = "PRE"
    elif ("Representative in Congress" in contest or "United States Representative" in contest or "State Senator" in contest or "State Representative" in contest):
        contest_info = get_level_dist(contest)
        level = contest_info[0]+contest_info[1]
    else:
        print(contest)
        raise ValueError
    return level

In [16]:
#Set up election type (general or primary) function
def get_election_type(contest):
    return "P"

In [17]:
#Set up contest three-letter descriptor function
def get_race(contest):
    mod_level = ""
    level = contest.split("-:-")[0]
    level_change_dict = {
        'Attorney General':'ATG',
        'Court of Appeals Judge':'CAJ',
        'Governor':'GOV',
        'State Controller':'CNT',
        'STATE QUESTION':'SQ',
        'Secretary of State':'SOS',
        'State Representative':'SL',
        'State Senator':'SU',
        'President':'PRE',
        'United States Senator':'USS',
        'Amendment':'A',
        'State Treasurer':'TRE',
        'Retention of':'SCJ',
        'Representative in Congress':'CON',
        'United States Representative':'CON',
        'Chief Financial Officer':'CFO',
        'Commissioner of Agriculture':'COA'}
    for val in level_change_dict.keys():
        if val in level:
            mod_level = level_change_dict[val]
            break
    if mod_level == "":
        print("NO CONTEST", contest)
    if mod_level == 'A':
        mod_level += level.split(":")[0].split(" ")[-1]

    return mod_level

In [18]:
#Set up party single-letter identifier function
def get_party(contest):
    if "Amendment" in contest:
        return ""
    elif "Retention of" in contest:
        return contest.split("-:-")[1][0].upper()
    elif "PARTY:DEM" in contest:
        return "D"
    elif "PARTY:REP" in contest:
        return "R"
    elif "PARTY:LPF" in contest:
        return "L"
    ## Reform -> F
    elif "PARTY:REF" in contest:
        return "O"
    elif "PARTY:PSL" in contest:
        return "S"
    elif "PARTY:GRE" in contest:
        return "G"
    elif "PARTY:CPF" in contest:
        return "C"
    elif "PARTY:WRI" in contest:
        return "O"
    elif "PARTY:NPA" or "PARTY:NOP" in contest:
        return "N"
    else:
        print(contest)
        return ValueError

In [19]:
#Set up function for first three letters of candidate's surname
def get_name(contest):
    if "No for Rejection" in contest:
        return "NO"
    elif "Yes for Approval" in contest:
        return "YES"

    if "Retention of" in contest:
        candidate = contest.split("-:-")[0]
        return candidate.upper().split(" ")[-1][0:3]
    else:
        candidate = contest.split("-:-")[1]
        if candidate == "None Of These Candidates":
            return "WRI"
        candidate = candidate.upper()
        if "Governor" in contest:
            likely_last = candidate.split(" ")[-1]
        else:
            likely_last = candidate.split(" ")[-1]
        if likely_last in ["JR","III","II","SR"]:
            return candidate.split(" ")[-2][0:3]
        else:
            return likely_last[0:3]

In [20]:
#Set up function for getting district string
def get_district(contest):
    district_string = contest.split("-:-")[3]
    district_string = district_string.strip()
    likely = district_string.split(" ")[1]
    if "Representative in Congress" in contest or "United States Representative" in contest or 'State Senator' in contest:
        level = 2
    elif 'State Representative' in contest:
        level = 3
    return likely.zfill(level)

#### Apply functions

Plus cleaning, dictionary setup

In [21]:
#Columns to leave out of helper function operations
keep_names = ['pct_std', 'County Code (Three-character abbreviation)', 'County Name',
       'Precinct', 'Precinct Polling Location']

In [22]:
#Applying functions to all columns except for ones in keep_names list
contest_name_change_dict = {}

duplicate_value_list = []
for contest in pivoted_2022.columns:
    if contest not in keep_names:
        # Add in a condition about the 20
#         print("E-type", get_election_type(contest))
#         print("Race", get_race(contest))
#         print("Party", get_party(contest))
#         print("Name", get_name(contest))
        if "Representative in Congress" in contest or "United States Representative" in contest or 'State Representative' in contest or 'State Senator' in contest:
            value = get_election_type(contest) + get_race(contest) + get_district(contest) + get_party(contest) + get_name(contest)
        else:
            value = get_election_type(contest) + "22" + get_race(contest) + get_party(contest) + get_name(contest)
        if value in contest_name_change_dict.values():
            duplicate_value_list.append(value)
        print(len(value))
        contest_name_change_dict[contest] = value


10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
9
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
9
9
9
9
9
9
9
9
9
9
9
9
9


In [23]:
#Checking for duplicate columns
if len(duplicate_value_list) > 0:
    print("DUPLICATE VALUES")
    for val in [(k,v) for k, v in contest_name_change_dict.items() if v in duplicate_value_list]:
        print(val)

DUPLICATE VALUES
('Representative in Congress-:-Alan Grayson-:-PARTY:DEM-:- District 10', 'PCON10DGRA')
('Representative in Congress-:-Terence R. Gray-:-PARTY:DEM-:- District 10', 'PCON10DGRA')


Manually writing column names for instances of:
- Same first three letters of last name for candidates in same contest
- Multiple word last names

In [24]:
#Clean up edge cases
contest_name_change_dict['Representative in Congress-:-Alan Grayson-:-PARTY:DEM-:- District 10'] = 'PCON10DAGR'
contest_name_change_dict['Representative in Congress-:-Terence R. Gray-:-PARTY:DEM-:- District 10'] = 'PCON10DTGR'
contest_name_change_dict['Representative in Congress-:-Darlene Cerezo Swaffar-:-PARTY:REP-:- District 23'] = 'PCON23RCER'
contest_name_change_dict['State Representative-:-Karen Gonzalez Pittman-:-PARTY:REP-:- District 65'] = 'PSL065RGON'
contest_name_change_dict['State Representative-:-LaVon Bracy Davis-:-PARTY:DEM-:- District 40'] = 'PSL040DBRA'
contest_name_change_dict['United States Senator-:-Ricardo De La Fuente-:-PARTY:DEM'] = 'P22USSDDEL'


In [25]:
#Rename the dataframe using this dictionary
pivoted_2022.rename(columns = contest_name_change_dict, inplace = True)

#Define a reversed version of the dictionary
contest_name_change_dict_rev = {v:k for k,v in contest_name_change_dict.items()}

In [26]:
#Add a COUNTYFP column
FIPS_dict = {'Alachua': '001',
 'Baker': '003',
 'Bay': '005',
 'Bradford': '007',
 'Brevard': '009',
 'Broward': '011',
 'Calhoun': '013',
 'Charlotte': '015',
 'Citrus': '017',
 'Clay': '019',
 'Collier': '021',
 'Columbia': '023',
 'Desoto': '027',
 'Dixie': '029',
 'Duval': '031',
 'Escambia': '033',
 'Flagler': '035',
 'Franklin': '037',
 'Gadsden': '039',
 'Gilchrist': '041',
 'Glades': '043',
 'Gulf': '045',
 'Hamilton': '047',
 'Hardee': '049',
 'Hendry': '051',
 'Hernando': '053',
 'Highlands': '055',
 'Hillsborough': '057',
 'Holmes': '059',
 'Indian River': '061',
 'Jackson': '063',
 'Jefferson': '065',
 'Lafayette': '067',
 'Lake': '069',
 'Lee': '071',
 'Leon': '073',
 'Levy': '075',
 'Liberty': '077',
 'Madison': '079',
 'Manatee': '081',
 'Marion': '083',
 'Martin': '085',
 'Miami-Dade': '086',
 'Monroe': '087',
 'Nassau': '089',
 'Okaloosa': '091',
 'Okeechobee': '093',
 'Orange': '095',
 'Osceola': '097',
 'Palm Beach': '099',
 'Pasco': '101',
 'Pinellas': '103',
 'Polk': '105',
 'Putnam': '107',
 'St. Johns': '109',
 'St. Lucie': '111',
 'Santa Rosa': '113',
 'Sarasota': '115',
 'Seminole': '117',
 'Sumter': '119',
 'Suwannee': '121',
 'Taylor': '123',
 'Union': '125',
 'Volusia': '127',
 'Wakulla': '129',
 'Walton': '131',
 'Washington': '133'}

pivoted_2022["COUNTYFP"] = pivoted_2022["County Name"].map(FIPS_dict).fillna(pivoted_2022["County Name"])
print(pivoted_2022["COUNTYFP"].unique())

['001' '003' '005' '007' '009' '011' '013' '015' '017' '019' '021' '023'
 '086' '027' '029' '031' '033' '035' '037' '039' '041' '043' '045' '047'
 '049' '051' '053' '055' '057' '059' '061' '063' '065' '067' '069' '071'
 '073' '075' '077' '079' '081' '087' '083' '085' '089' '091' '093' '095'
 '097' '099' '101' '103' '105' '107' '113' '115' '117' '109' '111' '119'
 '121' '123' '125' '127' '129' '131' '133']


In [27]:
#Additional cleaning
pivoted_2022.rename(columns = {"pct_std":"UNIQUE_ID"}, inplace = True)
pivoted_2022[pivoted_2022["UNIQUE_ID"].str.contains("nan")]
for col in list(contest_name_change_dict.values()):
    pivoted_2022[col] = pivoted_2022[col].astype(int)

## Checks against FL Dept. of State county-level data

#### Read in files, clean



In [28]:
#Read in alternative data source, in this case county-level data from FL Dept. of State
#Has data aggregated by county
county_totals = pd.read_csv("./raw-from-source/08232022Election.txt", encoding = "latin-1", sep="\t")

#Fitlering to relevant contests
county_totals = county_totals[county_totals["OfficeDesc"].isin(contests_keep)]

county_totals.Juris1num.unique()

array([  1.,   3.,   4.,   5.,   6.,   7.,   9.,  10.,  11.,  12.,  13.,
        14.,  15.,  16.,  18.,  19.,  21.,  22.,  23.,  24.,  25.,  26.,
        27.,  28.,  nan,   2.,  20.,  17.,  29.,  30.,  34.,  35.,  36.,
        37.,  38.,  39.,  42.,  43.,  45.,  50.,  51.,  52.,  53.,  55.,
        56.,  58.,  59.,  65.,  68.,  69.,  77.,  87.,  91., 106., 113.,
       118., 119., 120.,   8.,  40.,  41.,  44.,  47.,  62.,  92.,  93.,
        97.,  98.,  99., 101., 105., 107., 108., 109.])

In [29]:
#Make a string district column
county_totals["District"] = "District " + county_totals.Juris1num.astype(str)
county_totals["District"] = county_totals["District"].str.replace('District nan','').str.replace('\.0',"")

#Clean candidate last names
county_totals['CanNameLast'] = county_totals['CanNameLast'].str.replace(" ","")

#Combine the office, candidate first name and candidate last name, clean where all of that info isn't available
county_totals["cand_col"] = county_totals["OfficeDesc"] + "-:-" + county_totals["CanNameFirst"] + " " + county_totals["CanNameLast"] + "-:-" + "PARTY:" +county_totals["PartyCode"]

# Where it exists, add in the district to the pivot column
county_totals["cand_col"] = np.where(county_totals["District"]!="",county_totals["cand_col"] + "-:-" + county_totals["District"], county_totals["cand_col"])

#### Pivot, more cleaning

In [30]:
#Pivoting the data, reseting indices, and filling nulls
county_pivot = pd.pivot_table(county_totals, index = "CountyName", columns = "cand_col", values = "CanVotes", aggfunc = sum)
county_pivot.reset_index(inplace = True, drop = False)
county_pivot = county_pivot.fillna(0)

#Remove single quotes from all column names, part of cleaning candidate surnames like O'Brien
county_pivot.columns = [col.replace("'", "") for col in county_pivot.columns]

In [31]:
county_pivot.head()

Unnamed: 0,CountyName,Attorney General-:-Aramis Ayala-:-PARTY:DEM,Attorney General-:-Daniel Uhlfelder-:-PARTY:DEM,Attorney General-:-Jim Lewis-:-PARTY:DEM,Commissioner of Agriculture-:-J. R. Gaillot-:-PARTY:DEM,Commissioner of Agriculture-:-James Shaw-:-PARTY:REP,Commissioner of Agriculture-:-Naomi Blemur-:-PARTY:DEM,Commissioner of Agriculture-:-Ryan Morales-:-PARTY:DEM,Commissioner of Agriculture-:-Wilton Simpson-:-PARTY:REP,Governor-:-Cadance Daniel-:-PARTY:DEM,...,United States Representative-:-Tuan Le-:-PARTY:REP-:-District 10,United States Representative-:-Vern Buchanan-:-PARTY:REP-:-District 16,United States Representative-:-W. Trout-:-PARTY:DEM-:-District 23,United States Representative-:-Wendy Schmeling-:-PARTY:REP-:-District 18,United States Representative-:-William VanHorn-:-PARTY:DEM-:-District 15,United States Representative-:-Willie Montague-:-PARTY:REP-:-District 10,United States Senator-:-Brian Rush-:-PARTY:DEM,United States Senator-:-Ricardo DeLaFuente-:-PARTY:DEM,United States Senator-:-Val Demings-:-PARTY:DEM,United States Senator-:-William Sanchez-:-PARTY:DEM
0,Alachua,14545.0,8391.0,4755.0,8261.0,4815.0,11514.0,7253.0,10634.0,624.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2336.0,857.0,24002.0,2057.0
1,Baker,240.0,186.0,500.0,210.0,1765.0,367.0,341.0,2452.0,83.0,...,0.0,0.0,0.0,0.0,0.0,0.0,225.0,57.0,516.0,176.0
2,Bay,2518.0,1658.0,2218.0,1369.0,6404.0,2919.0,1998.0,13796.0,261.0,...,0.0,0.0,0.0,0.0,0.0,0.0,997.0,339.0,4565.0,640.0
3,Bradford,412.0,221.0,710.0,333.0,1375.0,581.0,411.0,2218.0,72.0,...,0.0,0.0,0.0,0.0,0.0,0.0,276.0,58.0,910.0,132.0
4,Brevard,20121.0,12502.0,11100.0,9795.0,17921.0,18810.0,14276.0,41905.0,1163.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1987.0,951.0,41956.0,1886.0


#### Apply functions

In [32]:
#Create dictionary for alt data source
contest_name_change_dict_counties = {}

#Checking for duplicate column names
duplicate_value_list = []
for contest in county_pivot.columns:
    if contest != "CountyName":
        #print(contest)
        #print("E-type", get_election_type(contest))
        #print("Race", get_race(contest))
        #print("Party", get_party(contest))
        #print("Name", get_name(contest))
        if "United States Representative" in contest or 'State Representative' in contest or 'State Senator' in contest:
            value = get_election_type(contest) + get_race(contest) + get_district(contest) + get_party(contest) + get_name(contest)
        else:
            value = get_election_type(contest) + "22" + get_race(contest) + get_party(contest) + get_name(contest)
        if value in contest_name_change_dict_counties.values():
            duplicate_value_list.append(value)
        print(len(value))
        contest_name_change_dict_counties[contest] = value

if len(duplicate_value_list) > 0:
    print("DUPLICATE VALUES")
    for val in [(k,v) for k, v in contest_name_change_dict_counties.items() if v in duplicate_value_list]:
        print(val)

10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10

In [33]:
#Clean up edge cases
contest_name_change_dict_counties['United States Representative-:-Alan Grayson-:-PARTY:DEM-:-District 10'] = 'PCON10DAGR'
contest_name_change_dict_counties['United States Representative-:-Terence Gray-:-PARTY:DEM-:-District 10'] = 'PCON10DTGR'

In [34]:
#Apply new column names using dictionary just created
county_pivot.rename(columns = contest_name_change_dict_counties, inplace = True)
contest_name_change_dict_counties_rev = {v:k for k,v in contest_name_change_dict_counties.items()}

#### Comparison to original results

In [35]:
#Checking out a list of contests that appear in county-level source but not precinct-level
{i:contest_name_change_dict_counties_rev[i] for i in list(county_pivot.columns) if i not in list(pivoted_2022.columns) and i !="CountyName"}

{}

In [36]:
#Checking out a list of contests that appear in precinct-level source but not county-level
ignore_list = ['UNIQUE_ID',
 'County Code (Three-character abbreviation)',
 'County Name','COUNTYFP','pct_std', 'County Code (Three-character abbreviation)', 'County Name',
       'Precinct', 'Precinct Polling Location', 'County', 'CNTY_CODE', 'CNTY_NAME', 'PREC_ID', 'POLL_LOC', 'Sum']

{i:contest_name_change_dict_rev[i] for i in list(pivoted_2022.columns) if i not in list(county_pivot.columns) and i not in ignore_list}

{}

In [37]:
#Define a list of columns that appear in both dataframes
shared_cols = [i for i in list(pivoted_2022.columns) if i in list(county_pivot.columns)]
print(shared_cols)

['P22ATGDAYA', 'P22ATGDUHL', 'P22ATGDLEW', 'P22COADGAI', 'P22COARSHA', 'P22COADBLE', 'P22COADMOR', 'P22COARSIM', 'P22GOVDDAN', 'P22GOVDCRI', 'P22GOVDFRI', 'P22GOVDWIL', 'PCON04RBEA', 'PCON09RMOR', 'PCON07DKRU', 'PCON10DAGR', 'PCON15DCOH', 'PCON07DPAS', 'PCON23DELL', 'PCON13RMAK', 'PCON27DMON', 'PCON20DOMP', 'PCON13RLUN', 'PCON27DTAD', 'PCON04DHIL', 'PCON07RSAB', 'PCON07RSAN', 'PCON23DSOR', 'PCON07RDUK', 'PCON21RMAS', 'PCON12RPER', 'PCON19RDON', 'PCON10RWIM', 'PCON25RSPA', 'PCON28RGIM', 'PCON28RGAR', 'PCON22RLAW', 'PCON15DRAM', 'PCON06RDAV', 'PCON12RLEI', 'PCON13RQUI', 'PCON14DBRA', 'PCON23RMCL', 'PCON10DBRO', 'PCON07RMIL', 'PCON20DHOL', 'PCON22RFRA', 'PCON08DDOD', 'PCON11RWEB', 'PCON03DHAW', 'PCON23RCER', 'PCON26RAQU', 'PCON25DSCH', 'PCON22RADE', 'PCON15RGRI', 'PCON15DGEL', 'PCON18RTAR', 'PCON04RAGU', 'PCON07RBEN', 'PCON27RPOL', 'PCON24DWIL', 'PCON15DBRO', 'PCON11RSOR', 'PCON01RMER', 'PCON12RBIL', 'PCON23DHOL', 'PCON07DFER', 'PCON23RWEI', 'PCON10DACH', 'PCON12RMAR', 'PCON15RTOL', 'PCON

In [38]:
#Same for unshared columns
unshared_cols = [i for i in list(pivoted_2022.columns) if i not in list(county_pivot.columns)]
print(unshared_cols)

['UNIQUE_ID', 'County Code (Three-character abbreviation)', 'County Name', 'Precinct', 'Precinct Polling Location', 'COUNTYFP']


In [39]:
#Defining statewide_totals_check & county_totals_check function

def statewide_totals_check(partner_df,source_df,column_list):
    """Compares the totals of two election result dataframes at the statewide total level

    Args:
      partner_df: DataFrame of election results we are comparing against
      source_df: DataFrame of election results we are comparing to
      column_list: List of races that there are votes for

    Returns:
      Nothing, only prints out an analysis
    """
    print("***Statewide Totals Check***")
    for race in column_list:
        if (partner_df[race].sum()- source_df[race].sum() != 0):
            print(race+" has a difference of "+str(partner_df[race].sum()-source_df[race].sum())+" votes")
            print("\PRECINCT-LEVEL: "+str(partner_df[race].sum())+" votes")
            print("\COUNTY-LEVEL: "+str(source_df[race].sum())+" votes")
        else:
            print(race + " is equal", "\tPRECINCT / COUNTY: " + str(partner_df[race].sum()))

def county_totals_check(partner_df,source_df,column_list,county_col,full_print=False):
    """Compares the totals of two election result dataframes at the county level

    Args:
      partner_df: DataFrame of election results we are comparing against
      source_df: DataFrame of election results we are comparing to
      column_list: List of races that there are votes for
      county_col: String of the column name that contains county information
      full_print: Boolean specifying whether to print out everything, including counties w/ similarities

    Returns:
      Nothing, only prints out an analysis
    """

    print("***Countywide Totals Check***")
    print("")
    diff_counties=[]
    for race in column_list:
        diff = partner_df.groupby([county_col]).sum()[race]-source_df.groupby([county_col]).sum()[race]
        for val in diff[diff != 0].index.values.tolist():
            if val not in diff_counties:
                diff_counties.append(val)
        if len(diff[diff != 0]!=0):
            print(race + " contains differences in these counties:")
            for val in diff[diff != 0].index.values.tolist():
                county_differences = diff[diff != 0]
                print("\t"+val+" has a difference of "+str(county_differences[val])+" votes")
                print("\t\PRECINCT-LEVEL: "+str(partner_df.groupby([county_col]).sum().loc[val,race])+" votes")
                print("\t\COUNTY-LEVEL: "+str(source_df.groupby([county_col]).sum().loc[val,race])+" votes")
            if (full_print):
                for val in diff[diff == 0].index.values.tolist():
                    county_similarities = diff[diff == 0]
                    print("\t"+val + ": "+ str(partner_df.groupby([county_col]).sum().loc[val,race])+" votes")
        else:
            print(race + " is equal across all counties")
            if (full_print):
                for val in diff[diff == 0].index.values.tolist():
                    county_similarities = diff[diff == 0]
                    print("\t"+val + ": "+ str(partner_df.groupby([county_col]).sum().loc[val,race])+" votes")
    if (len(diff_counties)>0):
        print()
        print(diff_counties)



In [40]:
#Calling statewide function
statewide_totals_check(pivoted_2022, county_pivot, shared_cols)

***Statewide Totals Check***
P22ATGDAYA has a difference of -15.0 votes
\PRECINCT-LEVEL: 637841 votes
\COUNTY-LEVEL: 637856.0 votes
P22ATGDUHL has a difference of -14.0 votes
\PRECINCT-LEVEL: 399606 votes
\COUNTY-LEVEL: 399620.0 votes
P22ATGDLEW has a difference of -8.0 votes
\PRECINCT-LEVEL: 381567 votes
\COUNTY-LEVEL: 381575.0 votes
P22COADGAI has a difference of -9.0 votes
\PRECINCT-LEVEL: 268854 votes
\COUNTY-LEVEL: 268863.0 votes
P22COARSHA has a difference of -14.0 votes
\PRECINCT-LEVEL: 571943 votes
\COUNTY-LEVEL: 571957.0 votes
P22COADBLE has a difference of -16.0 votes
\PRECINCT-LEVEL: 699259 votes
\COUNTY-LEVEL: 699275.0 votes
P22COADMOR has a difference of -12.0 votes
\PRECINCT-LEVEL: 420597 votes
\COUNTY-LEVEL: 420609.0 votes
P22COARSIM has a difference of -26.0 votes
\PRECINCT-LEVEL: 1048749 votes
\COUNTY-LEVEL: 1048775.0 votes
P22GOVDDAN has a difference of -1.0 votes
\PRECINCT-LEVEL: 38197 votes
\COUNTY-LEVEL: 38198.0 votes
P22GOVDCRI has a difference of -25.0 votes
\PRE

Major differences between precinct- and county-level vote totals for Democratic US Senate primary. County-by-count check reveals source of these differences.
<br><br>
Differences for other contests are very small.


In [41]:
#Minor renaming for the county-level check
county_pivot.rename(columns = {"CountyName":"County Name"}, inplace = True)

In [42]:
#Calling county function
county_totals_check(pivoted_2022, county_pivot, shared_cols, "County Name")

***Countywide Totals Check***

P22ATGDAYA contains differences in these counties:
	Seminole has a difference of -15.0 votes
	\PRECINCT-LEVEL: 16321 votes
	\COUNTY-LEVEL: 16336.0 votes
P22ATGDUHL contains differences in these counties:
	Seminole has a difference of -14.0 votes
	\PRECINCT-LEVEL: 10378 votes
	\COUNTY-LEVEL: 10392.0 votes
P22ATGDLEW contains differences in these counties:
	Seminole has a difference of -8.0 votes
	\PRECINCT-LEVEL: 5829 votes
	\COUNTY-LEVEL: 5837.0 votes
P22COADGAI contains differences in these counties:
	Seminole has a difference of -9.0 votes
	\PRECINCT-LEVEL: 7239 votes
	\COUNTY-LEVEL: 7248.0 votes
P22COARSHA contains differences in these counties:
	Seminole has a difference of -14.0 votes
	\PRECINCT-LEVEL: 12378 votes
	\COUNTY-LEVEL: 12392.0 votes
P22COADBLE contains differences in these counties:
	Seminole has a difference of -16.0 votes
	\PRECINCT-LEVEL: 14372 votes
	\COUNTY-LEVEL: 14388.0 votes
P22COADMOR contains differences in these counties:
	Semin

PCON07REDW contains differences in these counties:
	Seminole has a difference of -4.0 votes
	\PRECINCT-LEVEL: 2788 votes
	\COUNTY-LEVEL: 2792.0 votes
PCON10DTGR is equal across all counties
PCON10DTAC is equal across all counties
PCON10RLOW is equal across all counties
PCON03DWEL is equal across all counties
PCON10RLE is equal across all counties
PCON16RBUC is equal across all counties
PCON23DTRO is equal across all counties
PCON18RSCH is equal across all counties
PCON15DVAN is equal across all counties
PCON10RMON is equal across all counties
PSL120DGEN is equal across all counties
PSL113RPER is equal across all counties
PSL113DDAM is equal across all counties
PSL002RAND is equal across all counties
PSL113DALT is equal across all counties
PSL036RPER is equal across all counties
PSL013DNIX is equal across all counties
PSL053RKOC is equal across all counties
PSL047DNIE is equal across all counties
PSL119RALV is equal across all counties
PSL109DGAN is equal across all counties
PSL058DFEN 

PSL041DROS is equal across all counties
PSL008DLET is equal across all counties
PSL093DALB is equal across all counties
PSL037RPLA contains differences in these counties:
	Seminole has a difference of -2.0 votes
	\PRECINCT-LEVEL: 4727 votes
	\COUNTY-LEVEL: 4729.0 votes
PSL035DMUN is equal across all counties
PSL025RYAR is equal across all counties
PSL093DVAL is equal across all counties
PSL077RESP is equal across all counties
PSL023RCLO is equal across all counties
PSL101DDEL is equal across all counties
PSL035DKEE is equal across all counties
PSL025RVAI is equal across all counties
PSL041DMCC is equal across all counties
PSL022RAPP is equal across all counties
PSL005RCOL is equal across all counties
PSL045RFRA is equal across all counties
PSL113RLOP is equal across all counties
PSL107DFRA is equal across all counties
PSL029RBAR is equal across all counties
PSL062DNEW is equal across all counties
PSL051ROLS is equal across all counties
PSU08DWIL is equal across all counties
PSU35DSHA i

Precinct-level results are missing Democratic US Senate primary results in Walton County.
<br><br>
Seminole County has numerous small differences. The reason for this is unclear.
<br><br>
Palm Beach, Clay, Nassau and Volusia counties have differences of less than three votes in one contest each.
<br><br>
For [Nassau County](https://enr.electionsfl.org/NAS/3204/Precincts/), the discrepancy is due to a recount that included PCON04DHOL added 1 vote to the candidate's total. That may explain the discrepancy for the same candidate in Clay County. 
<br><br>
Recounts in Palm Beach and Volusia counties also appear responsible for the PCON22RFRA and PSL029RFET discrepancies, respectively.
<br><br>
We also note that in Palm Beach County, the state includes a precinct number 9899. The county appears to label that same precinct as 9902.

## Adding missing Walton County data

In [43]:
#Read in file from Walton County
walton = pd.read_csv("./raw-from-source/walton.csv")

In [44]:
#Merge split precincts
walton = walton.groupby(['Precinct Name', 'Candidate Issue'])['Total Votes'].sum().reset_index()

In [45]:
#Set up precinct ID to map to pivoted dataframe
walton['prec_num'] = walton['Precinct Name'].astype(str).str.zfill(4)
walton['UNIQUE_ID'] = "WAL-"+ walton.prec_num
walton.head()

Unnamed: 0,Precinct Name,Candidate Issue,Total Votes,prec_num,UNIQUE_ID
0,110,Aramis Ayala,10,110,WAL-0110
1,110,Brian Rush,20,110,WAL-0110
2,110,Cadance Daniel,6,110,WAL-0110
3,110,Charles Galloway,134,110,WAL-0110
4,110,Charlie Crist,25,110,WAL-0110


In [46]:
#Set up cand_col IDs in Walton dataframe to map pivoted dataframe columns
wal_cands = {'Brian Rush': 'P22USSDRUS',
    'Ricardo De La Fuente': 'P22USSDDEL',
    'Val Demings': 'P22USSDDEM',
    'William Sanchez': 'P22USSDSAN'}

walton['cand_col'] = walton['Candidate Issue'].map(wal_cands)

walton.cand_col.unique()

array([nan, 'P22USSDRUS', 'P22USSDDEL', 'P22USSDDEM', 'P22USSDSAN'],
      dtype=object)

In [47]:
#Preview data using Val Demings as candidate
walton[walton['cand_col']=='P22USSDDEM'].head()

Unnamed: 0,Precinct Name,Candidate Issue,Total Votes,prec_num,UNIQUE_ID,cand_col
33,110,Val Demings,12,110,WAL-0110,P22USSDDEM
72,120,Val Demings,8,120,WAL-0120,P22USSDDEM
111,130,Val Demings,21,130,WAL-0130,P22USSDDEM
150,210,Val Demings,5,210,WAL-0210,P22USSDDEM
189,220,Val Demings,9,220,WAL-0220,P22USSDDEM


In [48]:
#Create function to map the data
def map_walton_to_pivoted(walton_df, pivot):
    
    #Set the index of the pivoted DataFrame to "UNIQUE_ID"
    pivot.set_index('UNIQUE_ID', inplace=True)
    
    #Iterate through the rows of the Walton DataFrame
    for index, row in walton_df.iterrows():
        cand = row['cand_col']
        prec_id = row['UNIQUE_ID']
        total_votes = row['Total Votes']

        #Check if the candidate column exists in the pivoted DataFrame
        if cand in pivot.columns:
            
           #Populate the cell with the Total Votes value
           pivot.at[prec_id, cand] = total_votes

    pivot = pivot.reset_index()
    return pivot

In [49]:
#Apply the function
pivoted_2022 = map_walton_to_pivoted(walton, pivoted_2022)

#Look at data
display(pivoted_2022[pivoted_2022['County Name'] =='Walton'])

Unnamed: 0,UNIQUE_ID,County Code (Three-character abbreviation),County Name,Precinct,Precinct Polling Location,P22ATGDAYA,P22ATGDUHL,P22ATGDLEW,P22COADGAI,P22COARSHA,...,PSU34DJON,PSU26RBYE,PSU05DDAV,PSU07RHUT,PSU26RWHE,P22USSDRUS,P22USSDDEL,P22USSDDEM,P22USSDSAN,COUNTYFP
5952,WAL-0110,WAL,Walton,110,110,10,12,20,10,132,...,0,0,0,0,0,20,5,12,6,131
5953,WAL-0120,WAL,Walton,120,120,6,6,8,6,76,...,0,0,0,0,0,6,1,8,4,131
5954,WAL-0130,WAL,Walton,130,130,8,18,12,12,183,...,0,0,0,0,0,7,2,21,8,131
5955,WAL-0210,WAL,Walton,210,210,0,0,6,2,53,...,0,0,0,0,0,1,0,5,2,131
5956,WAL-0220,WAL,Walton,220,220,6,2,8,3,91,...,0,0,0,0,0,7,0,9,1,131
5957,WAL-0230,WAL,Walton,230,230,3,10,16,8,135,...,0,0,0,0,0,10,0,13,5,131
5958,WAL-0240,WAL,Walton,240,240,4,2,6,4,56,...,0,0,0,0,0,1,1,8,2,131
5959,WAL-0310,WAL,Walton,310,310,70,73,71,43,462,...,0,0,0,0,0,44,15,128,31,131
5960,WAL-0320,WAL,Walton,320,320,37,39,36,21,170,...,0,0,0,0,0,14,8,81,12,131
5961,WAL-0330,WAL,Walton,330,330,9,14,21,8,154,...,0,0,0,0,0,9,2,23,12,131


## Checking totals again

In [50]:
#Calling statewide function
statewide_totals_check(pivoted_2022, county_pivot, shared_cols)

***Statewide Totals Check***
P22ATGDAYA has a difference of -15.0 votes
\PRECINCT-LEVEL: 637841 votes
\COUNTY-LEVEL: 637856.0 votes
P22ATGDUHL has a difference of -14.0 votes
\PRECINCT-LEVEL: 399606 votes
\COUNTY-LEVEL: 399620.0 votes
P22ATGDLEW has a difference of -8.0 votes
\PRECINCT-LEVEL: 381567 votes
\COUNTY-LEVEL: 381575.0 votes
P22COADGAI has a difference of -9.0 votes
\PRECINCT-LEVEL: 268854 votes
\COUNTY-LEVEL: 268863.0 votes
P22COARSHA has a difference of -14.0 votes
\PRECINCT-LEVEL: 571943 votes
\COUNTY-LEVEL: 571957.0 votes
P22COADBLE has a difference of -16.0 votes
\PRECINCT-LEVEL: 699259 votes
\COUNTY-LEVEL: 699275.0 votes
P22COADMOR has a difference of -12.0 votes
\PRECINCT-LEVEL: 420597 votes
\COUNTY-LEVEL: 420609.0 votes
P22COARSIM has a difference of -26.0 votes
\PRECINCT-LEVEL: 1048749 votes
\COUNTY-LEVEL: 1048775.0 votes
P22GOVDDAN has a difference of -1.0 votes
\PRECINCT-LEVEL: 38197 votes
\COUNTY-LEVEL: 38198.0 votes
P22GOVDCRI has a difference of -25.0 votes
\PRE

No more discrepancies in Walton County between precinct- and county-level vote totals for Democratic US Senate primary

In [51]:
#Calling county function
county_totals_check(pivoted_2022, county_pivot, shared_cols, "County Name")

***Countywide Totals Check***

P22ATGDAYA contains differences in these counties:
	Seminole has a difference of -15.0 votes
	\PRECINCT-LEVEL: 16321 votes
	\COUNTY-LEVEL: 16336.0 votes
P22ATGDUHL contains differences in these counties:
	Seminole has a difference of -14.0 votes
	\PRECINCT-LEVEL: 10378 votes
	\COUNTY-LEVEL: 10392.0 votes
P22ATGDLEW contains differences in these counties:
	Seminole has a difference of -8.0 votes
	\PRECINCT-LEVEL: 5829 votes
	\COUNTY-LEVEL: 5837.0 votes
P22COADGAI contains differences in these counties:
	Seminole has a difference of -9.0 votes
	\PRECINCT-LEVEL: 7239 votes
	\COUNTY-LEVEL: 7248.0 votes
P22COARSHA contains differences in these counties:
	Seminole has a difference of -14.0 votes
	\PRECINCT-LEVEL: 12378 votes
	\COUNTY-LEVEL: 12392.0 votes
P22COADBLE contains differences in these counties:
	Seminole has a difference of -16.0 votes
	\PRECINCT-LEVEL: 14372 votes
	\COUNTY-LEVEL: 14388.0 votes
P22COADMOR contains differences in these counties:
	Semin

PCON12RPRE is equal across all counties
PCON23RCHE is equal across all counties
PCON07REDW contains differences in these counties:
	Seminole has a difference of -4.0 votes
	\PRECINCT-LEVEL: 2788 votes
	\COUNTY-LEVEL: 2792.0 votes
PCON10DTGR is equal across all counties
PCON10DTAC is equal across all counties
PCON10RLOW is equal across all counties
PCON03DWEL is equal across all counties
PCON10RLE is equal across all counties
PCON16RBUC is equal across all counties
PCON23DTRO is equal across all counties
PCON18RSCH is equal across all counties
PCON15DVAN is equal across all counties
PCON10RMON is equal across all counties
PSL120DGEN is equal across all counties
PSL113RPER is equal across all counties
PSL113DDAM is equal across all counties
PSL002RAND is equal across all counties
PSL113DALT is equal across all counties
PSL036RPER is equal across all counties
PSL013DNIX is equal across all counties
PSL053RKOC is equal across all counties
PSL047DNIE is equal across all counties
PSL119RALV 

PSL035DKEE is equal across all counties
PSL025RVAI is equal across all counties
PSL041DMCC is equal across all counties
PSL022RAPP is equal across all counties
PSL005RCOL is equal across all counties
PSL045RFRA is equal across all counties
PSL113RLOP is equal across all counties
PSL107DFRA is equal across all counties
PSL029RBAR is equal across all counties
PSL062DNEW is equal across all counties
PSL051ROLS is equal across all counties
PSU08DWIL is equal across all counties
PSU35DSHA is equal across all counties
PSU10RCHA contains differences in these counties:
	Seminole has a difference of -7.0 votes
	\PRECINCT-LEVEL: 5799 votes
	\COUNTY-LEVEL: 5806.0 votes
PSU01RBRO is equal across all counties
PSU34DIGH is equal across all counties
PSU15DTHO is equal across all counties
PSU07RJAM is equal across all counties
PSU10RBRO contains differences in these counties:
	Seminole has a difference of -32.0 votes
	\PRECINCT-LEVEL: 33303 votes
	\COUNTY-LEVEL: 33335.0 votes
PSU02RTRU is equal across

## Additional checks, minor cleaning

In [52]:
pivoted_2022.rename(columns = {'Precinct':'PREC_ID',
                               'County Name':'CNTY_NAME',
                              'County Code (Three-character abbreviation)':'CNTY_CODE',
                              'Precinct Polling Location':'POLL_LOC'}, inplace = True)

In [53]:
pivoted_2022 = pivoted_2022[['UNIQUE_ID', 'COUNTYFP', 'CNTY_CODE', 'CNTY_NAME', 'PREC_ID',
       'POLL_LOC']+list(contest_name_change_dict.values())]

In [54]:
pivoted_2022.loc[pivoted_2022["POLL_LOC"]=="nan","POLL_LOC"] = "No Location Provided"

In [55]:
#Looking for nonphysical precincts
#These would require vote allocation
nonphysical_prec = pivoted_2022[pivoted_2022.POLL_LOC.str.contains("Early|VBM|Election Day")]
nonphysical_prec

Unnamed: 0,UNIQUE_ID,COUNTYFP,CNTY_CODE,CNTY_NAME,PREC_ID,POLL_LOC,P22ATGDAYA,P22ATGDUHL,P22ATGDLEW,P22COADGAI,...,PSU08DDEM,PSU34DJON,PSU26RBYE,PSU05DDAV,PSU07RHUT,PSU26RWHE,P22USSDRUS,P22USSDDEL,P22USSDDEM,P22USSDSAN
5613,SEM-1001,117,SEM,Seminole,1001,EVCA Early Voting Casselberry,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5614,SEM-1002,117,SEM,Seminole,1002,EVLM Early Voting Lake Mary,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5615,SEM-1003,117,SEM,Seminole,1003,EVOV Early Voting Oviedo Aquat,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5616,SEM-1004,117,SEM,Seminole,1004,EVOVN Early Voting Oviedo N,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5617,SEM-1005,117,SEM,Seminole,1005,EVSA Early Voting Sanford,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5618,SEM-1006,117,SEM,Seminole,1006,EVSOE Early Voting Elections O,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5619,SEM-1007,117,SEM,Seminole,1007,EVWK Early Voting Wekiva,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5620,SEM-2001,117,SEM,Seminole,2001,Early Voting,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5621,SEM-2002,117,SEM,Seminole,2002,Election Day,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5622,SEM-2003,117,SEM,Seminole,2003,VBM,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
#Check vote totals in nonphysical precincts
names = ['UNIQUE_ID','COUNTYFP','CNTY_CODE','CNTY_NAME','PREC_ID','POLL_LOC']

#Visually inspect
for col in nonphysical_prec.columns:
  if col not in names:
    print(nonphysical_prec[col].unique())

[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]
[0]


In [57]:
#Vote totals are zero. Removing these non-vote, non-physical precincts
pivoted_2022 = pivoted_2022[~pivoted_2022['POLL_LOC'].str.contains('Early|VBM|Election Day')]

## Export

CSV file and column dictionary for README file

In [58]:
#Re-order columns to place larger, statewide contests in front

#Set the order
base  = []
uss = []
other_statewide = []
cong = []
slu = []
sll = []
for i in pivoted_2022:
    if i.startswith('P22USS'):
        uss.append(i)
    elif i.startswith('P22'):
        other_statewide.append(i)
    elif i.startswith('PCON'):
        cong.append(i)
    elif i.startswith('PSU'):
        slu.append(i)
    elif i.startswith('PSL'):
        sll.append(i)
    else:
        base.append(i)
new_order = base + uss + other_statewide + cong + slu + sll
dict_order = uss + other_statewide + cong + slu + sll

print(base)

#Apply order to dataframe
pivoted_ordered_2022 = pivoted_2022.reindex(columns=new_order)
pivoted_ordered_2022.head()

['UNIQUE_ID', 'COUNTYFP', 'CNTY_CODE', 'CNTY_NAME', 'PREC_ID', 'POLL_LOC']


Unnamed: 0,UNIQUE_ID,COUNTYFP,CNTY_CODE,CNTY_NAME,PREC_ID,POLL_LOC,P22USSDRUS,P22USSDDEL,P22USSDDEM,P22USSDSAN,...,PSL025RVAI,PSL041DMCC,PSL022RAPP,PSL005RCOL,PSL045RFRA,PSL113RLOP,PSL107DFRA,PSL029RBAR,PSL062DNEW,PSL051ROLS
0,ALA-0001,1,ALA,Alachua,1,First Baptist Church of Waldo,20,11,120,10,...,0,0,0,0,0,0,0,0,0,0
1,ALA-0002,1,ALA,Alachua,2,LaCrosse Town Hall,52,12,263,22,...,0,0,0,0,0,0,0,0,0,0
2,ALA-0003,1,ALA,Alachua,3,Legacy Park,44,11,352,40,...,0,0,0,0,0,0,0,0,0,0
3,ALA-0004,1,ALA,Alachua,4,Alachua County Agr. and Equestrian Center,50,14,240,29,...,0,0,271,0,0,0,0,0,0,0
4,ALA-0005,1,ALA,Alachua,5,First Lutheran Church,18,12,439,36,...,0,0,20,0,0,0,0,0,0,0


In [59]:
#Create a sorted dictionary for use in README file
sorted_contest_name_change_dict = {col: contest_name_change_dict_rev[col] for col in dict_order}
print(sorted_contest_name_change_dict)

#Export the dictionary for the README
holder = pd.DataFrame(sorted_contest_name_change_dict.values(),sorted_contest_name_change_dict.keys())
holder.to_csv("./field_names.csv", index = True)

{'P22USSDRUS': 'United States Senator-:-Brian Rush-:-PARTY:DEM', 'P22USSDDEL': 'United States Senator-:-Ricardo De La Fuente-:-PARTY:DEM', 'P22USSDDEM': 'United States Senator-:-Val Demings-:-PARTY:DEM', 'P22USSDSAN': 'United States Senator-:-William Sanchez-:-PARTY:DEM', 'P22ATGDAYA': 'Attorney General-:-Aramis Ayala-:-PARTY:DEM', 'P22ATGDUHL': 'Attorney General-:-Daniel Uhlfelder-:-PARTY:DEM', 'P22ATGDLEW': 'Attorney General-:-Jim Lewis-:-PARTY:DEM', 'P22COADGAI': 'Commissioner of Agriculture-:-J. R. Gaillot-:-PARTY:DEM', 'P22COARSHA': 'Commissioner of Agriculture-:-James W. Shaw-:-PARTY:REP', 'P22COADBLE': 'Commissioner of Agriculture-:-Naomi Esther Blemur-:-PARTY:DEM', 'P22COADMOR': 'Commissioner of Agriculture-:-Ryan Morales-:-PARTY:DEM', 'P22COARSIM': 'Commissioner of Agriculture-:-Wilton Simpson-:-PARTY:REP', 'P22GOVDDAN': 'Governor-:-Cadance Daniel-:-PARTY:DEM', 'P22GOVDCRI': 'Governor-:-Charlie Crist-:-PARTY:DEM', 'P22GOVDFRI': 'Governor-:-Nicole "Nikki" Fried-:-PARTY:DEM', 'P

In [60]:
#Export final CSV
output = os.path.join(wd,'output')
fl_2022_prim_prec = os.path.join(output,'fl_2022_prim_prec')
if not os.path.exists(output):
    os.mkdir(output)
if not os.path.exists(fl_2022_prim_prec):
    os.mkdir(fl_2022_prim_prec)

pivoted_ordered_2022.to_csv("./output/fl_2022_prim_prec/fl_2022_prim_prec.csv", index = False)