### Georgia 2022 Primary Runoff Election Returns

#### Sections
- <a href="#ETL">Cleaning Precinct-Level Election Results</a><br>
- <a href="#check">Vote Totals Checks</a><br>
- <a href="#readme">Creating README</a><br>
- <a href="#exp">Exporting Cleaned Precinct-Level Dataset</a><br>


#### Sources
Precinct-level data from [Georgia Secretary of State Certified Results by County - XML format](https://results.enr.clarityelections.com/GA/114891/web.285569/#/reporting).

Statewide county-level data to run checks from [Georgia Secretary of State Statewide Summary Report - CSV format](https://results.enr.clarityelections.com//GA//114891/300218/reports/summary.zip)
    
County-level data to run checks from [Georgia Secretary of State County Summary Report - XML format](https://results.enr.clarityelections.com//GA//114891/300218/reports/detailxml.zip)

In [1]:
import geopandas as gp
import pandas as pd
import os
import xml.etree.ElementTree as et
import numpy as np
import re
import GA22_primaryrunoff_helper as hlp
pd.set_option('display.max_rows', None)

<p><a name="ETL"></a></p>

### Cleaning Precinct Level Election Results

In [2]:
# Read in precinct level election results
df_primary_runoff = hlp.ph_clarityelec_xml("./raw-from-source/counties/", 'primary runoff')

In [3]:
#concatenate df, make sure all counties are read in
ga_22_primary_runoff = pd.concat([df_primary_runoff])
print(len(ga_22_primary_runoff["county"].unique()) == 159) #159 counties in GA

True


In [4]:
#Check df dtypes, set vote column type to int
# ga_22_primary.info()
ga_22_primary_runoff.num_votes = ga_22_primary_runoff.num_votes.astype(int)
ga_22_primary_runoff.num_votes.dtype

dtype('int32')

#### Subset dataframe by contests with statewide reach

In [5]:
keywords_list = ['US Senate', 'US House', 'Governor', 'Attorney General', 'Secretary of State', 'State School Superintendent', 'Agriculture', 'Labor', 'Insurance', 'PSC', 'Public Service', 'Supreme Court', 'Court of Appeals', 'State Senate', 'State House']

In [6]:
# Every distinct contest label present in the precinct-level returns
all_contests = ga_22_primary_runoff['contest'].unique().tolist()
# Keep only the labels the helper selects for the keyword list, sorted alphabetically in place
keep_contests = hlp.contests_to_keep(all_contests, keywords_list)
keep_contests.sort()
# Report how many labels exist and how many survive the filter
for count_label, contest_names in (('Unique contests in dataset: ', all_contests),
                                   ('Contests to keep: ', keep_contests)):
    print(count_label + str(len(contest_names)))

Unique contests in dataset: 89
Contests to keep: 35


In [7]:
# #check remaining contests visually
# other_contests = set(all_contests) - set(keep_contests)
# other_contests

In [8]:
#subset dataframe by contests of interest only
ga_22_primr_sw = ga_22_primary_runoff[ga_22_primary_runoff['contest'].isin(keep_contests)].copy()

#### Standardize contest names

In [9]:
#One row per retained contest label, held in a single column named 'orig'
contests = pd.DataFrame({'orig': keep_contests})

Visually inspect contest name variations, create functions to standardize contest type

In [10]:
# set(keep_contests)

In [11]:
#Functions to split contest string into office type, district, and party
# Office names recognised inside a contest label. Order matters: stnd_race returns the
# LAST entry that matches, so 'Lieutenant Governor' (listed after 'Governor') wins for
# lieutenant-governor contests, and 'PSC' wins over every other entry.
contest_types = [
    'Attorney General',
    'Commissioner of Agriculture',
    'Commissioner of Insurance',
    'Commissioner of Labor',
    'Court of Appeals',
    'Governor',
    'Lieutenant Governor',
    'Liutenant Governor',
    'Public Service Commissioner',
    'Secretary of State',
    'State House',
    'State School Superintendent',
    'State Senate',
    'US House',
    'Supreme Court',
    'US Senate',
    'PSC',
]


def stnd_race(contest_string):
    """Return the office name found in ``contest_string``.

    The match is a case-sensitive substring test against ``contest_types``.
    When several entries match, the one appearing last in ``contest_types``
    is returned. An empty string is returned when nothing matches.
    """
    matching_offices = [office for office in contest_types if office in contest_string]
    return matching_offices[-1] if matching_offices else ''

def stnd_dist(race_string):
    """Return the district digits found in a contest label, or '' if it is not districted.

    Only the text before the first '/' is examined, compared case-insensitively.
    If that text names a districted office (US House, State Senate, State House,
    Public Service Commissioner / PSC), every digit character in it is
    concatenated and returned as a string; otherwise '' is returned.
    """
    districted_offices = ('us house', 'state senate', 'public service commissioner', 'psc', 'state house')
    label = race_string.lower().split('/')[0]
    if not any(office in label for office in districted_offices):
        return ''
    return ''.join(char for char in label if char.isdigit())

def get_party(race_string):
    """Classify a contest label as 'Dem', 'Rep', 'Nonpartisan', or None.

    Matching is a case-insensitive substring test, checked in this order:
    'dem', then 'rep', then the judicial offices 'supreme court' /
    'court of appeals'. None is returned when nothing matches.
    """
    lowered = race_string.lower()
    # Party markers take precedence over the judicial check, and 'dem' over 'rep'
    for marker, party in (("dem", "Dem"), ("rep", "Rep")):
        if marker in lowered:
            return party
    judicial_offices = ("supreme court", "court of appeals")
    if any(office in lowered for office in judicial_offices):
        return "Nonpartisan"
    return None

In [12]:
# Trim the original label, then derive party, office and district from the trimmed text
contests['orig'] = contests['orig'].map(str.strip)
for derived_col, parser in (('party', get_party), ('race', stnd_race), ('district', stnd_dist)):
    contests[derived_col] = contests['orig'].map(parser)

In [13]:
# Manually set the office ('race') for the rows whose index labels are 4 and 5.
# NOTE(review): presumably these are the labels stnd_race could not match (it returns ''
# for no match, not null) — confirm by inspecting contests.loc[[4, 5], 'orig'].
# NOTE(review): the labels 4 and 5 depend on the sorted order of keep_contests, so they
# will point at different contests if the input data or keyword list changes.
contests.at[4, 'race'] = 'Commissioner of Insurance'
contests.at[5, 'race'] = 'Commissioner of Labor'

In [14]:
# A contest is treated as statewide when no district digits were parsed from its label
is_statewide = contests['district'] == ''

# Statewide template: "<office> - <party>"
non_districted = contests[is_statewide].copy()
non_districted['new_race'] = non_districted['race'] + ' - ' + non_districted['party']

# Districted template: "<office> - District <n> - <party>"
districted = contests[~is_statewide].copy()
districted['new_race'] = districted['race'] + ' - District '+ districted['district'] + ' - ' + districted['party']

# Stack both groups back into a single lookup table (statewide rows first)
contests_stnd = pd.concat([non_districted, districted])

In [15]:
#check new df has same number of unique contests as the original df
contests_stnd.shape[0] == contests.shape[0]

True

In [16]:
#Create dictionary of standardized contest names
contests_stnd_dict = dict(zip(contests_stnd['orig'], contests_stnd['new_race']))

In [17]:
#Apply dict to standardize contests in precinct election returns df
# The dictionary keys come from contest labels that were whitespace-stripped when the
# contests table was built, so strip the lookup values too. Without this, a label with
# leading/trailing whitespace would find no key and be mapped to NaN.
ga_22_primr_sw['stnd_contest'] = ga_22_primr_sw['contest'].str.strip().map(contests_stnd_dict)

print('Number of original "unique" contests: ', ga_22_primr_sw['contest'].nunique())
print('Number of standardized contests: ', ga_22_primr_sw['stnd_contest'].nunique())

Number of original "unique" contests:  35
Number of standardized contests:  21


#### Add FIPS Column, Create Unique ID Column

In [18]:
#Add FIPS col to precinct df
ga_22_primr_sw = hlp.create_fips_col("./raw-from-source/FIPS/US_FIPS_Codes.csv", 'Georgia', ga_22_primr_sw, 'county')
#Create UNIQUE_ID col
ga_22_primr_sw['UNIQUE_ID'] = ga_22_primr_sw['COUNTYFP'] + '-' +ga_22_primr_sw['precinct']
#Check for 2707 precincts
ga_22_primr_sw['UNIQUE_ID'].nunique()

2707

#### Clean columns, create pivot column

In [19]:
#Remove incumbent status from candidate name
ga_22_primr_sw['choice'] = ga_22_primr_sw['choice'].apply(lambda x: x.replace('(I)', ''))

In [20]:
#create pivot col for precinct df
ga_22_primr_sw = hlp.create_pivot_col(ga_22_primr_sw, 'choice', 'stnd_contest', 'pivot')

In [21]:
#Check
ga_22_primr_sw.head(2)

Unnamed: 0,county,contest,choice,voting_method,precinct,num_votes,election,stnd_contest,COUNTYFP,UNIQUE_ID,pivot
0,Appling,Lieutenant Governor - Dem,Charlie Bailey,Absentee by Mail Votes,1B,1,primary runoff,Lieutenant Governor - Dem,1,001-1B,Charlie Bailey -:- Lieutenant Governor - Dem
1,Appling,Lieutenant Governor - Dem,Charlie Bailey,Absentee by Mail Votes,1C,0,primary runoff,Lieutenant Governor - Dem,1,001-1C,Charlie Bailey -:- Lieutenant Governor - Dem


#### Pivot Data

In [22]:
# One row per precinct, one column per "candidate -:- contest" value; votes are summed
# across the remaining dimensions (e.g. voting method) and missing combinations become 0.
# aggfunc is given as the string "sum": passing the builtin `sum` is deprecated in recent
# pandas releases, while the string keeps the same aggregation.
ga_22_primr_sw_pvt = pd.pivot_table(
    ga_22_primr_sw,
    index=["UNIQUE_ID", "county", "COUNTYFP", "precinct"],
    columns=["pivot"],
    values=['num_votes'],
    aggfunc="sum",
).fillna(0)

In [23]:
#Clean up indexing
ga_22_primr_sw_pvt.columns = ga_22_primr_sw_pvt.columns.droplevel(0)
ga_22_primr_sw_pvt.reset_index(inplace = True)

In [24]:
#check
ga_22_primr_sw_pvt.head(1)

pivot,UNIQUE_ID,county,COUNTYFP,precinct,Bee Nguyen -:- Secretary of State - Dem,Betsy Kramer -:- State House - District 50 - Rep,Bob Duncan -:- State House - District 179 - Rep,Brent Cox -:- State House - District 28 - Rep,Carter Barrett -:- State House - District 24 - Rep,Charlie Bailey -:- Lieutenant Governor - Dem,...,Rick Townsend -:- State House - District 179 - Rep,Roger Bruce -:- State House - District 61 - Dem,Saira Draper -:- State House - District 90 - Dem,Sheri Smallwood Gilligan -:- State House - District 24 - Rep,Tabitha Johnson-Green -:- US House - District 10 - Dem,Terry Cummings -:- State House - District 39 - Dem,Vernon Jones -:- US House - District 10 - Rep,Wade Herring -:- US House - District 1 - Dem,Whitney Pimentel -:- State House - District 30 - Rep,William Will Boddie Jr -:- Commissioner of Labor - Dem
0,001-1B,Appling,1,1B,16.0,0.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,10.0


In [25]:
#Create dictionaries for renaming pivot col to VEST
exclude_columns = ['UNIQUE_ID', 'county', 'COUNTYFP', 'precinct']
contest_updates_dict, contest_updates_reversed, clean_dups = hlp.create_column_rename_dicts(ga_22_primr_sw_pvt, exclude_columns)

In [26]:
#Print any renamed column whose length is outside 7-10 characters, with its original name
for vest_name in contest_updates_dict.values():
    if not (7 <= len(vest_name) <= 10):
        print(vest_name)
        print(contest_updates_reversed[vest_name])

In [27]:
#apply rename dictionary to pivoted df
ga_22_primr_sw_pvt.rename(columns = contest_updates_dict, inplace = True)
ga_22_primr_sw_pvt.reset_index(inplace = True, drop = True)

In [28]:
#Cast every renamed vote column to integer in a single assignment
vote_columns = list(contest_updates_dict.values())
ga_22_primr_sw_pvt[vote_columns] = ga_22_primr_sw_pvt[vote_columns].astype(int)

In [29]:
#check
ga_22_primr_sw_pvt.head(2)

pivot,UNIQUE_ID,county,COUNTYFP,precinct,R22SOSDNGU,RSL050RKRA,RSL179RDUN,RSL028RCOX,RSL024RBAR,R22LTGDBAI,...,RSL179RTOW,RSL061DBRU,RSL090DDRA,RSL024RGIL,RCON10DJOH,RSL039DCUM,RCON10RJON,RCON01DHER,RSL030RPIM,R22LABDBOD
0,001-1B,Appling,1,1B,16,0,0,0,0,8,...,0,0,0,0,0,0,0,11,0,10
1,001-1C,Appling,1,1C,1,0,0,0,0,4,...,0,0,0,0,0,0,0,3,0,2


In [30]:
#Create list of candidates in precinct level df
precincts_cols = ga_22_primr_sw_pvt.columns[4:].to_list()
len(precincts_cols) #Number of candidates in precint-level results, useful to compare in vote totals checking

42

<p><a name="check"></a></p>

## Vote Totals Check
### Statewide

In [31]:
#Read in statewide csv summary file, with county level data
combined_sos_totals = pd.read_csv("./raw-from-source/summary/summary.csv")

In [32]:
#Remove 'Vote for 1'
combined_sos_totals['contest name'] = combined_sos_totals['contest name'].apply(lambda x: str(x).strip().replace('(Vote For 1)', ''))

In [33]:
#Create list of contests statewide
sos_all_contests = list(combined_sos_totals['contest name'].unique())
sos_keep_contests = hlp.contests_to_keep(sos_all_contests, keywords_list)
print('All contests:'+ str(len(sos_all_contests)))
print('Subset of contests to keep:' + str(len(sos_keep_contests)))

All contests:23
Subset of contests to keep:21


In [34]:
# #check remaining contests
# sos_other_contests = set(sos_all_contests) - set(sos_keep_contests)
# sos_other_contests

In [35]:
#Subset statewide results by contests of interest only
filtered_sos_totals = combined_sos_totals[combined_sos_totals['contest name'].isin(sos_keep_contests)].copy()

In [36]:
#Create pivot column
sos_totals = hlp.create_pivot_col(filtered_sos_totals, 'choice name', 'contest name', 'pivot_col')

In [37]:
sos_totals['pivot_col'].nunique()

42

In [38]:
#apply function to get a VEST column
sos_totals['VEST'] = sos_totals['pivot_col'].apply(lambda x: hlp.get_VEST(str(x).strip()))

In [39]:
# Check
sos_totals.head(1)

Unnamed: 0,line number,contest name,choice name,party name,total votes,percent of votes,registered voters,ballots cast,num Precinct total,num Precinct rptg,over votes,under votes,pivot_col,VEST
0,1,Lieutenant Governor - Dem,Charlie Bailey,DEM,162771,63.05,6465202,262207,159,159,0,11,Charlie Bailey -:- Lieutenant Governor - Dem,R22LTGDBAI


In [40]:
print('Number of contests of interest:')
print('Statewide Summary CSV: ',sos_totals['VEST'].nunique())
print('RDH precint file: ', len(precincts_cols))

Number of contests of interest:
Statewide Summary CSV:  42
RDH precint file:  42


In [41]:
#Final check to make sure columns to compare are the same across precinct-level df and statewide df
set(precincts_cols) == set(sos_totals['VEST'].unique())

True

Check rdh data against state summary

In [42]:
# Compare each candidate's summed precinct votes with the statewide summary total.
statewide_check_list = []  # VEST codes whose precinct sum differs from the official total
doesnt_check = []          # VEST codes with no row in the statewide summary
for item in contest_updates_dict.values():
    official_ls = list(sos_totals.loc[sos_totals["VEST"] == item, "total votes"])
    if len(official_ls) < 1:
        # No official total to compare against. Skip the comparison so that a value left
        # over from a previous iteration (or an undefined name) is never used.
        doesnt_check.append(item)
        continue
    official = official_ls[0]
    rdh_total = ga_22_primr_sw_pvt[item].sum()
    if official != rdh_total:
        statewide_check_list.append(item)
        print(contest_updates_reversed[item])
        print(f"{item}\n\tOfficial: {official}\n\tRDH: {rdh_total}")

# Surface candidates that could not be checked instead of collecting them silently
if doesnt_check:
    print("No statewide total found for:", doesnt_check)

### Checking against official County level results

Read in County level summary XML file from Georgia SOS office.

In [43]:
# Flatten every county-summary XML file into one long table with a row per
# (contest, choice, voting method, county).
county_checks_dir = "./raw-from-source/summary/county_checks/"
# Sorted so the row order does not depend on the filesystem's listing order
loaded_counties = sorted(os.listdir(county_checks_dir))
z = []
for locale in loaded_counties:
    if not locale.endswith('.xml'):
        continue
    xroot = et.parse(county_checks_dir + locale).getroot()

    # Reset per file so a file without a <Region> element cannot inherit the previous
    # file's value; when several are present the last one wins.
    state = None
    for region_node in xroot.findall(".//Region"):
        state = region_node.text

    # NOTE(review): these labels ("special" for "detail 2.xml", otherwise "primary") look
    # carried over from another notebook; the resulting `type` column is not read again
    # in this notebook — confirm before relying on it.
    elec_type = "special" if locale == "detail 2.xml" else "primary"

    # Node lists use their own names so the `contests` DataFrame built earlier in the
    # notebook is not overwritten by this cell.
    for contest_node in xroot.findall(".//Contest"):
        contest = contest_node.attrib.get('text')
        for choice_node in contest_node.findall("./Choice"):
            choice = choice_node.attrib.get('text')
            for vote_type_node in choice_node.findall("./VoteType"):
                voting_method = vote_type_node.attrib.get('name')
                for county_node in vote_type_node.findall("./County"):
                    county_name = county_node.attrib.get('name')
                    num_votes = county_node.attrib.get('votes')  # still a string here
                    z.append([state, contest, choice, voting_method, county_name, num_votes, elec_type])
dfcols = ['state','contest','choice','voting_method','county','num_votes',"type"]
df_county = pd.DataFrame(z,columns=dfcols)

In [44]:
# identify contests pertaining to statewide legislative bodies, or offices only
soscnty_all_contests = list(df_county['contest'].unique())
soscnty_keep_contests = hlp.contests_to_keep(soscnty_all_contests, keywords_list)
print('All contests: '+ str(len(soscnty_all_contests)))
print('Subset of contests to keep: ' + str(len(soscnty_keep_contests)))

All contests: 23
Subset of contests to keep: 21


In [45]:
# filter df by contests of interest
filtered_county_totals = df_county[df_county['contest'].isin(soscnty_keep_contests)].copy()
# set votes column as integer
filtered_county_totals["num_votes"] = filtered_county_totals["num_votes"].astype(int)

In [46]:
# add 'pivot' col
sos_county_pvt = hlp.create_pivot_col(filtered_county_totals, 'choice', 'contest', 'pivot')
# use pivot col to add column with VEST names
sos_county_pvt['VEST'] = sos_county_pvt['pivot'].apply(lambda x: hlp.get_VEST(str(x).strip()))

In [47]:
# Check if there are 42 unique candidates in the VEST column
sos_county_pvt['VEST'].nunique() == 42

True

In [48]:
#Check if VEST contest names are the same as precinct df
set(precincts_cols) == set(sos_county_pvt['VEST'])

True

In [49]:
#pivot SOS county df: one row per county, one column per VEST code, votes summed
# aggfunc is the string "sum" because passing the builtin `sum` is deprecated in recent
# pandas releases; the aggregation itself is unchanged.
sos_county_totals_pvt = pd.pivot_table(
    sos_county_pvt,
    index=['county'],
    columns=['VEST'],
    values=['num_votes'],
    aggfunc="sum",
)
# Counties with no rows for a candidate get 0 rather than NaN
sos_county_totals_pvt = sos_county_totals_pvt.fillna(0)
# Drop the outer 'num_votes' column level and turn the county index back into a column
sos_county_totals_pvt.columns = sos_county_totals_pvt.columns.droplevel(0)
sos_county_totals_pvt = sos_county_totals_pvt.reset_index()

#### Check precinct DF against SOS county level df

In [50]:
#check by contest
rdh = ga_22_primr_sw_pvt
sos = sos_county_totals_pvt
partner_name = 'SOS'
source_name = 'RDH'
county_col = 'county'
hlp.county_totals_check(sos,partner_name, rdh, source_name, precincts_cols, county_col,full_print=False, method='county')

***Countywide Totals Check***

Counties that match:

['Appling', 'Atkinson', 'Bacon', 'Baker', 'Baldwin', 'Banks', 'Barrow', 'Bartow', 'Ben Hill', 'Berrien', 'Bibb', 'Bleckley', 'Brantley', 'Brooks', 'Bryan', 'Bulloch', 'Burke', 'Butts', 'Calhoun', 'Camden', 'Candler', 'Carroll', 'Catoosa', 'Charlton', 'Chatham', 'Chattahoochee', 'Chattooga', 'Cherokee', 'Clarke', 'Clay', 'Clayton', 'Clinch', 'Cobb', 'Coffee', 'Colquitt', 'Columbia', 'Cook', 'Coweta', 'Crawford', 'Crisp', 'Dade', 'Dawson', 'DeKalb', 'Decatur', 'Dodge', 'Dooly', 'Dougherty', 'Douglas', 'Early', 'Echols', 'Effingham', 'Elbert', 'Emanuel', 'Evans', 'Fannin', 'Fayette', 'Floyd', 'Forsyth', 'Franklin', 'Fulton', 'Gilmer', 'Glascock', 'Glynn', 'Gordon', 'Grady', 'Greene', 'Gwinnett', 'Habersham', 'Hall', 'Hancock', 'Haralson', 'Harris', 'Hart', 'Heard', 'Henry', 'Houston', 'Irwin', 'Jackson', 'Jasper', 'Jeff Davis', 'Jefferson', 'Jenkins', 'Johnson', 'Jones', 'Lamar', 'Lanier', 'Laurens', 'Lee', 'Liberty', 'Lincoln', 'Long',

No Vote Total discrepancies to address

<p><a name="readme"></a></p>

### Create README

In [51]:
# Ordered list of standardized contest names, starting with the non-districted offices.
# Offices appear in the order given here; names within an office are sorted alphabetically.
non_districted_offices = ['US Senate', 'Governor', 'Lieutenant Governor', 'Attorney General', 'Secretary of State', 'Commissioner of Agriculture', 'Commissioner of Insurance', 'Commissioner of Labor',  'Public Service Commissioner', 'State School Superintendent', 'Supreme Court', 'Court of Appeals']
contests_order = []
for office in non_districted_offices:
    office_rows = contests_stnd.loc[contests_stnd['race'] == office]
    contests_order.extend(sorted(office_rows['new_race'].unique().tolist()))
#check
#len(contests_order) == contests_stnd['new_race'].nunique()

In [52]:
# Append the districted contests, office by office, ordered by district number
for office in ['US House', 'State Senate', 'State House']:
    office_rows = contests_stnd.loc[contests_stnd['race'] == office].copy()
    # Cast to int so that district 10 sorts after district 2 rather than before it
    office_rows['district'] = office_rows['district'].astype(int)
    contests_order.extend(office_rows.sort_values('district')['new_race'].unique().tolist())

In [53]:
# Create df with total votes per candidate
rm_df = ga_22_primr_sw.groupby(['stnd_contest', 'choice', 'pivot'])['num_votes'].sum().reset_index()
#apply rename dictionary to get VEST names
rm_df['VEST'] = rm_df['pivot'].map(contest_updates_dict).fillna(0)

In [54]:
rm_df.head(2)

Unnamed: 0,stnd_contest,choice,pivot,num_votes,VEST
0,Commissioner of Insurance - Dem,Janice Laws Robinson,Janice Laws Robinson -:- Commissioner of Insur...,158734,R22INSDROB
1,Commissioner of Insurance - Dem,Raphael Baker,Raphael Baker -:- Commissioner of Insurance - Dem,90317,R22INSDBAK


In [55]:
# create list to order README and final dataset by contest, and vote share
rm_order =[]
for i in contests_order:
    temp = rm_df.loc[rm_df['stnd_contest'] == i].sort_values('num_votes', ascending = False)
    rm_order += temp['VEST'].to_list()
#check to make sure all contests included
set(rm_order) == set(precincts_cols)

True

In [56]:
def _surname_first(full_name):
    """Move the last whitespace-separated token of a name to the front: 'A B C' -> 'C, A B'."""
    name_parts = str(full_name).split()
    return name_parts[-1] + ', ' + ' '.join(name_parts[:-1])

#Order column: position of each VEST code within the ordering list
rm_df['Order'] = rm_df['VEST'].map(rm_order.index)
#Sort the readme DF into that order
rm_df = rm_df.sort_values('Order')
#Human-readable field description: "<Last>, <rest of name> - <standardized contest>"
rm_df['candidate'] = rm_df['choice'].apply(_surname_first)
rm_df['description'] = rm_df['candidate'] + ' - ' + rm_df['stnd_contest']
#VEST code -> description lookup used for the README field list
fields_dict = dict(zip(rm_df['VEST'], rm_df['description']))

In [57]:
# Create README inputs
# Descriptions for the identifier columns; the candidate columns are already in fields_dict
fields_dict['UNIQUE_ID']='Unique ID for each precinct'
fields_dict['COUNTYFP']='County FIP identifier'
fields_dict['county']='County Name'
fields_dict['precinct']='Precinct Name'

# This notebook processes the primary RUNOFF, so the README title says so
title = "Georgia 2022 Primary Runoff Election Precinct-Level Results"
retrieval_date = "09/13/23"
github_link = "https://github.com/nonpartisan-redistricting-datahub/pber_collection/tree/main/GA/2022"
# Folder the README is written to
file_folder = "./"
source = "The RDH retrieved 2022 precinct election precinct-level results from the Georgia Secretary of State [website] (https://results.enr.clarityelections.com/GA/114891/web.285569/#/reporting). The RDH navigated to each county's election results page and clicked 'Detail XML', to get the results at the precinct level"

In [58]:
# Create the output folder if needed; exist_ok avoids a separate exists() check and
# also handles nested paths.
os.makedirs(file_folder, exist_ok=True)

# os.path.join keeps the path valid whether or not file_folder ends with a separator
with open(os.path.join(file_folder, "README.txt"), 'w') as tf:
    tf.write(hlp.full_readme_text(title, retrieval_date, source, fields_dict, github_link))

<p><a name="exp"></a></p>

### Export Cleaned Precinct Level Dataset

In [59]:
# Put the identifier columns first in the export order. Identifiers already present in
# rm_order are filtered out, so re-running this cell does not prepend them a second time.
id_cols = ['UNIQUE_ID', 'COUNTYFP', 'county', 'precinct']
rm_order = id_cols + [col for col in rm_order if col not in id_cols]

In [64]:
#checks: the export order must contain exactly the dataframe's columns, each once.
# Both conditions are combined into one expression because a notebook cell only displays
# its final expression; as two separate lines the length check's result was discarded.
len(rm_order) == len(ga_22_primr_sw_pvt.columns) and set(rm_order) == set(ga_22_primr_sw_pvt.columns)

True

In [61]:
#reorder df
ga_22_primr_sw_pvt = ga_22_primr_sw_pvt[rm_order]

In [62]:
ga_22_primr_sw_pvt.head(2)

pivot,UNIQUE_ID,COUNTYFP,county,precinct,R22LTGDBAI,R22LTGDHAL,R22SOSDNGU,R22SOSDDAW,R22INSDROB,R22INSDBAK,...,RSL061DBRU,RSL061DKEM,RSL086DBAR,RSL086DADA,RSL090DDRA,RSL090DSCH,RSL117RDAN,RSL117RKAH,RSL179RTOW,RSL179RDUN
0,001-1B,1,Appling,1B,8,12,16,3,10,9,...,0,0,0,0,0,0,0,0,0,0
1,001-1C,1,Appling,1C,4,0,1,3,1,3,...,0,0,0,0,0,0,0,0,0,0


In [63]:
# Export folder, named once so the directory creation and the CSV path cannot drift apart
export_dir = "./ga_22_primary_runoff_prec/"
# exist_ok makes this safe to re-run without a separate exists() check
os.makedirs(export_dir, exist_ok=True)

# Write the cleaned precinct-level table without the dataframe index
ga_22_primr_sw_pvt.to_csv(os.path.join(export_dir, "ga_22_primary_runoff_prec.csv"), index = False)