### Georgia 2022 Primary Election Returns

#### Sections
- <a href="#ETL">Cleaning Precinct-Level Election Results</a><br>
- <a href="#check">Vote Totals Checks</a><br>
- <a href="#discrepancies">Examine/Address Discrepancies</a><br>
- <a href="#readme">Creating README</a><br>
- <a href="#exp">Exporting Cleaned Precinct-Level Dataset</a><br>


#### Sources
Precinct-level data from [Georgia Secretary of State Certified Results by County - XML format](https://results.enr.clarityelections.com/GA/113667/web.285569/#/access-to-races).

Statewide county-level data to run checks from [Georgia Secretary of State Statewide Summary Report - CSV format](https://results.enr.clarityelections.com//GA//113667/294374/reports/summary.zip)
    
County-level data to run checks from [Georgia Secretary of State County Summary Report - XML format](https://results.enr.clarityelections.com//GA//113667/294374/reports/detailxml.zip)

In [1]:
import geopandas as gp
import pandas as pd
import os
import xml.etree.ElementTree as et
import numpy as np
import re
import GA22_primary_helper as hlp
pd.set_option("display.max_rows", None)

<p><a name="ETL"></a></p>

### Cleaning Precinct-Level Election Results

Read in precinct-level election results

In [2]:
# Load the raw precinct results through the project helper.
# NOTE(review): presumably this parses every Clarity XML export in the folder and
# tags rows with the given election label ('primary') -- confirm against GA22_primary_helper.
df_primary = hlp.ph_clarityelec_xml("./raw-from-source/counties/", 'primary')

In [3]:
# Stack the loaded results into a single frame, then confirm every county was read in
ga_22_primary = pd.concat([df_primary])
counties_loaded = ga_22_primary["county"].unique()
print(len(counties_loaded) == 159)  # Georgia has 159 counties

True


In [4]:
# Vote counts must be integers; cast the column and display the resulting dtype
ga_22_primary["num_votes"] = ga_22_primary["num_votes"].astype(int)
ga_22_primary["num_votes"].dtype

dtype('int32')

#### Subset dataframe by contests with statewide reach

In [5]:
# Substrings that identify contests of interest (federal, statewide, judicial and
# state-legislative offices); passed to hlp.contests_to_keep in later cells.
keywords_list = [
    'US Senate',
    'US House',
    'Governor',
    'Attorney General',
    'Secretary of State',
    'State School Superintendent',
    'Agriculture',
    'Labor',
    'Insurance',
    'PSC',
    'Public Service',
    'Supreme Court',
    'Court of Appeals',
    'State Senate',
    'State House',
]

Create list of all contests, and subset by keywords

In [6]:
# Every distinct contest label in the raw data, and the subset matching keywords_list
all_contests = ga_22_primary['contest'].unique().tolist()
keep_contests = hlp.contests_to_keep(all_contests, keywords_list)
keep_contests.sort()  # sorted order fixes the row labels of the `contests` frame built later
print(f'Unique contests in dataset: {len(all_contests)}')
print(f'Contests to keep: {len(keep_contests)}')

Unique contests in dataset: 1367
Contests to keep: 578


Visually inspect remaining contests, as a check

In [7]:
# #check remaining contests visually
# other_contests = set(all_contests) - set(keep_contests)
# other_contests

In [8]:
# Keep only the rows whose contest is one of the contests of interest
statewide_mask = ga_22_primary['contest'].isin(keep_contests)
ga_22_primary_sw = ga_22_primary.loc[statewide_mask].copy()

#### Standardize contest names

In [9]:
# One row per retained contest label; 'orig' holds the label as it appears in the source
contests = pd.DataFrame(keep_contests, columns=['orig'])

Visually inspect contest name variations, create functions to standardize contest type

In [10]:
#set(keep_contests)

In [11]:
#Functions to split contest string into office type, district, and party
# Office names searched for in a contest label. Order matters: stnd_race keeps the LAST
# entry that matches, so 'Lieutenant Governor' (listed after 'Governor') wins over it.
contest_types = [
    'Attorney General', 'Commissioner of Agriculture', 'Commissioner of Insurance',
    'Commissioner of Labor', 'Court of Appeals', 'Governor', 'Lieutenant Governor',
    'Liutenant Governor', 'Public Service Commissioner', 'Secretary of State',
    'State House', 'State School Superintendent', 'State Senate', 'US House',
    'Supreme Court', 'US Senate', 'PSC',
]


def stnd_race(contest_string):
    """Return the office name from ``contest_types`` found in ``contest_string``.

    Matching is a case-sensitive substring test. When several entries match, the one
    that appears last in ``contest_types`` is returned; when none match, the result
    is an empty string.
    """
    matches = [office for office in contest_types if office in contest_string]
    return matches[-1] if matches else ''

def stnd_dist(race_string):
    """Return the district number embedded in a contest label, as a string of digits.

    The label is lower-cased and anything after the first '/' is discarded. If what
    remains names a districted office (US House, State Senate, State House, Public
    Service Commissioner / PSC), every digit character in it is concatenated and
    returned; otherwise an empty string is returned.
    """
    label = race_string.lower().split('/')[0]
    districted_offices = ('us house', 'state senate', 'public service commissioner', 'psc', 'state house')
    if not any(office in label for office in districted_offices):
        return ''
    return ''.join(ch for ch in label if ch.isdigit())

def get_party(race_string):
    """Return the party label ('Dem', 'Rep' or 'Nonpartisan') for a contest label.

    'dem' is tested before 'rep', so a label containing both resolves to 'Dem'.
    Judicial contests (Supreme Court / Court of Appeals) with neither marker are
    'Nonpartisan'. Returns None when nothing matches.

    Note: a later cell in this notebook defines another ``get_party`` that returns
    single-letter codes; whichever definition ran last is the one in effect.
    """
    label = race_string.lower()
    if "dem" in label:
        return "Dem"
    if "rep" in label:
        return "Rep"
    if "supreme court" in label or "court of appeals" in label:
        return "Nonpartisan"
    return None

Create new columns splitting contest string into office type, district, and party

In [12]:
# Trim the raw label, then derive party / office / district from it
contests['orig'] = contests['orig'].map(str.strip)
for new_col, parser in (('party', get_party), ('race', stnd_race), ('district', stnd_dist)):
    contests[new_col] = contests['orig'].map(parser)

Check for nulls

In [13]:
# #checks
# contests[contests['race']== 'PSC']
# contests[contests['race'] == '']

Manually correct nulls, anomalies

In [14]:
# Expand the PSC abbreviation and repair the misspelled "Liutenant"
contests['race'] = (
    contests['race']
    .replace('PSC', 'Public Service Commissioner')
    .replace('Liutenant Governor', 'Lieutenant Governor')
)

# Manually assign the office for contests whose labels the keyword match got wrong.
# The keys are row labels of `contests`, i.e. positions in the sorted keep_contests list.
# NOTE(review): these labels will shift if the source data or keywords_list changes -- re-verify.
manual_race_fixes = {
    20: 'Commissioner of Agriculture',
    137: 'Commissioner of Agriculture',
    21: 'Commissioner of Insurance',
    138: 'Commissioner of Insurance',
    22: 'Commissioner of Labor',
    139: 'Commissioner of Labor',
}
for row_label, office in manual_race_fixes.items():
    contests.at[row_label, 'race'] = office

In [15]:
# Split contests by whether a district number was extracted
has_district = contests['district'] != ''

# Statewide contests: "<office> - <party>"
non_districted = contests[~has_district].copy()
non_districted['new_race'] = non_districted['race'] + ' - ' + non_districted['party']

# Districted contests: "<office> - District <n> - <party>"
districted = contests[has_district].copy()
districted['new_race'] = (
    districted['race'] + ' - District ' + districted['district'] + ' - ' + districted['party']
)

# Recombine, statewide rows first
contests_stnd = pd.concat([non_districted, districted], axis=0)

In [16]:
#check new df has same number of unique contests as the original df
contests_stnd.shape[0] == contests.shape[0]

True

In [17]:
# Lookup from the (stripped) original contest label to its standardized name
contests_stnd_dict = contests_stnd.set_index('orig')['new_race'].to_dict()

In [18]:
# Apply the standardized names to the precinct-level returns.
# The dictionary keys were whitespace-stripped when `contests` was built, so strip the
# lookup column the same way; otherwise a padded label would silently map to NaN.
ga_22_primary_sw['stnd_contest'] = ga_22_primary_sw['contest'].str.strip().map(contests_stnd_dict)

# Fail loudly if any contest has no standardized name rather than carrying NaN forward
unmapped_contests = ga_22_primary_sw.loc[ga_22_primary_sw['stnd_contest'].isna(), 'contest'].unique()
assert len(unmapped_contests) == 0, f"Contests without a standardized name: {list(unmapped_contests)}"

print('Number of original "unique" contests: ', ga_22_primary_sw['contest'].nunique())
print('Number of standardized contests: ', ga_22_primary_sw['stnd_contest'].nunique())

Number of original "unique" contests:  578
Number of standardized contests:  402


#### Add FIPS Column, Create Unique ID Column

In [19]:
def create_fips_col(csv_path, state_name_string, df, county_col_string):
    """Add a zero-padded 3-character ``COUNTYFP`` column to ``df`` and return ``df``.

    Parameters
    ----------
    csv_path : path to a CSV with ``State``, ``County Name`` and ``FIPS County`` columns.
    state_name_string : value of ``State`` whose counties should be used.
    df : DataFrame to annotate; it is modified in place.
    county_col_string : name of the column in ``df`` holding county names.

    County names are matched case-insensitively with spaces removed on BOTH sides
    (previously only the FIPS file was de-spaced, so multi-word counties such as
    "Ben Hill" could never match). Counties with no match keep their original
    county name in ``COUNTYFP`` so they are easy to spot.
    """
    fips_file = pd.read_csv(csv_path)
    # .copy() so the filtered rows are an independent frame, not a view of the full file
    fips_file = fips_file[fips_file["State"] == state_name_string].copy()

    fips_codes = fips_file["FIPS County"].astype(str).str.zfill(3)
    county_keys = fips_file["County Name"].astype(str).str.replace(" ", "", regex=False).str.lower()
    fips_dict = dict(zip(county_keys, fips_codes))

    lookup_keys = df[county_col_string].astype(str).str.replace(" ", "", regex=False).str.lower()
    county_fips = lookup_keys.map(fips_dict).fillna(df[county_col_string])
    df['COUNTYFP'] = county_fips.astype(str).str.zfill(3)
    return df

In [20]:
# Add the 3-character COUNTYFP column to the precinct frame, looked up by county name
# from the Georgia rows of the FIPS reference CSV (the frame is modified in place and returned)
ga_22_primary_sw = create_fips_col("./FIPS/US_FIPS_Codes.csv", 'Georgia', ga_22_primary_sw, 'county')

In [21]:
# Create UNIQUE_ID as "<COUNTYFP>-<precinct>".
# Both parts come from the filtered frame itself; the original read `precinct` from the
# unfiltered ga_22_primary and only worked through implicit index alignment.
ga_22_primary_sw['UNIQUE_ID'] = ga_22_primary_sw['COUNTYFP'] + '-' + ga_22_primary_sw['precinct']

In [22]:
#Check for 2707 precincts
ga_22_primary_sw['UNIQUE_ID'].nunique()

2707

#### Clean columns, create pivot column

In [23]:
# Add incumbency column: 1 when the candidate name carries the literal "(I)" marker, else 0.
# The pattern is a regex, so the parentheses are escaped to match them literally.
incumbency_mask = ga_22_primary_sw['choice'].str.contains(r'\(I\)')
ga_22_primary_sw['Incumbent'] = 0
ga_22_primary_sw.loc[incumbency_mask, 'Incumbent'] = 1
# (unused) lookup from VEST name to incumbency flag:
#ga_22_prim_incumbent_dict = dict(zip(ga_22_primary_sw['VEST'], ga_22_primary_sw['Incumbent']))

In [24]:
# Drop the "(I)" incumbency marker from candidate names now that it has been recorded
def _without_incumbent_marker(candidate_name):
    return candidate_name.replace('(I)', '')

ga_22_primary_sw['choice'] = ga_22_primary_sw['choice'].map(_without_incumbent_marker)

Create pivot col, of choice + contest

In [25]:
#Function cleans candidate and contest strings, and combines into a pivot column
def create_pivot_col(df, name_string, contest_string, pivot_string):
    """Clean the candidate and contest columns of ``df`` and add a combined pivot column.

    ``df`` is modified in place and returned. After cleaning, the pivot column holds
    ``"<candidate> -:- <contest>"``.

    Cleaning applied to both columns: cast to str, trim, remove the substrings
    ``.``, ``'``, ``"``, ``,`` and ``(I)``, trim again. Candidate names additionally
    have internal runs of whitespace collapsed (before the removals) and two
    election-specific spellings normalised.
    """
    removable = ('.', "'", '"', ',', '(I)')

    def _drop_substrings(text):
        for token in removable:
            text = text.replace(token, '')
        return text

    def _clean_name(raw):
        # Collapse repeated whitespace between name parts first, then strip punctuation
        name = _drop_substrings(' '.join(str(raw).strip().split()))
        # Anomalies specific to this election
        name = name.replace('Deloach', 'DeLoach')
        name = name.replace('Tabitha Johnson- Green', 'Tabitha Johnson-Green')
        return name.strip()

    def _clean_contest(raw):
        return _drop_substrings(str(raw).strip()).strip()

    df[name_string] = df[name_string].map(_clean_name)
    df[contest_string] = df[contest_string].map(_clean_contest)
    df[pivot_string] = df[name_string] + ' -:- ' + df[contest_string]
    return df

In [26]:
# Clean candidate ('choice') and contest ('stnd_contest') text in the precinct frame and
# add the 'pivot' column ("<candidate> -:- <contest>") used as the pivot key below
ga_22_primary_sw = create_pivot_col(ga_22_primary_sw, 'choice', 'stnd_contest', 'pivot')

In [27]:
#Check
ga_22_primary_sw.head(2)

Unnamed: 0,county,contest,choice,voting_method,precinct,num_votes,election,stnd_contest,COUNTYFP,UNIQUE_ID,Incumbent,pivot
0,Appling,US Senate - Rep,Gary W Black,Election Day Votes,1B,41,primary,US Senate - Rep,1,001-1B,0,Gary W Black -:- US Senate - Rep
1,Appling,US Senate - Rep,Gary W Black,Election Day Votes,1C,46,primary,US Senate - Rep,1,001-1C,0,Gary W Black -:- US Senate - Rep


#### Pivot Data

In [28]:
# One row per precinct, one column per "<candidate> -:- <contest>", summing votes across
# voting methods. aggfunc is given as the string "sum": passing the builtin `sum` is
# deprecated in recent pandas and the string keeps the same aggregation.
# Precincts where a candidate was not on the ballot get 0 instead of NaN.
ga_22_primary_sw_pvt = pd.pivot_table(
    ga_22_primary_sw,
    index=["UNIQUE_ID", "county", "COUNTYFP", "precinct"],
    columns=["pivot"],
    values=['num_votes'],
    aggfunc="sum",
).fillna(0)

In [29]:
#Clean up indexing: drop the outer 'num_votes' column level left by pivot_table and
#move the precinct identifiers from the index back into ordinary columns.
#Note: not re-runnable on its own -- a second run would drop another column level.
ga_22_primary_sw_pvt.columns = ga_22_primary_sw_pvt.columns.droplevel(0)
ga_22_primary_sw_pvt.reset_index(inplace = True)

In [30]:
#check
ga_22_primary_sw_pvt.head(1)

pivot,UNIQUE_ID,county,COUNTYFP,precinct,Adam Petty -:- State Senate - District 38 - Dem,Afoma Eguh Okafor -:- State House - District 71 - Dem,Al Williams -:- State House - District 168 - Dem,Al Wynn -:- State House - District 153 - Dem,Alan Powell -:- State House - District 33 - Rep,Alan Sims -:- US House - District 10 - Rep,...,William C Harris -:- State House - District 126 - Rep,William Harris -:- State House - District 74 - Dem,William Park Freeman -:- State House - District 88 - Rep,William Will Boddie Jr -:- Commissioner of Labor - Dem,Willie Mae Oyogoa -:- State House - District 44 - Dem,Winfred Dukes -:- Commissioner of Agriculture - Dem,Yasmin Neal -:- State House - District 79 - Dem,Yg Nyghtstorm -:- US House - District 7 - Rep,Zach Procter -:- State House - District 101 - Rep,Zeph Baker -:- State House - District 140 - Dem
0,001-1B,Appling,1,1B,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,10.0,0.0,5.0,0.0,0.0,0.0,0.0


Adapt functions
Georgia Primary specific modifications:
- General election for supreme court and court of appeals races
- 180 seats in state house, requiring 3 character integers
- at-large public service commissioner race included

In [31]:
#functions to rename columns
def get_election_type_year(race_string):
    """Return the election-type prefix of a field name.

    Judicial contests (Supreme Court / Court of Appeals) are general elections ('G');
    everything else is a primary ('P'). Districted offices (US House, State House,
    State Senate, Public Service Commissioner / PSC) return the bare letter, leaving
    room for the district number; all other offices get the two-digit year appended
    ('P22' / 'G22').
    """
    label = race_string.lower()
    is_judicial = 'court of appeals' in label or 'supreme court' in label
    prefix = "G" if is_judicial else "P"

    districted_offices = ('us house', 'state house', 'state senate', 'public service commissioner', 'psc')
    if any(office in label for office in districted_offices):
        return prefix
    return prefix + "22"
    
def get_race(race_string):
    """Return the office code plus zero-padded district for a contest label.

    The label is lower-cased and anything after the first '/' is discarded. The office
    code is the first entry of the ordered table below whose keyword occurs in the
    label ('' if none). For districted offices every digit in the label is joined and
    zero-padded: 2 characters for US House / State Senate / PSC, 3 for State House
    (Georgia has 180 House seats). Other offices get no district suffix.

    Fixes relative to the earlier version:
    * "u.s. house" was recognised as CON but never received a district number;
    * "u.s. senate" (with both dots) was not recognised at all;
    * two unreachable branches (a second superintendent test and the
      "public service commissioner" half of the PSC fallback) are removed.
    """
    race_string = race_string.lower()
    if '/' in race_string:
        race_string = race_string.split('/')[0]

    # Order matters: e.g. "lieutenant governor" must be tested before "governor".
    office_codes = (
        (("u.s. house", "us house"), "CON"),
        (("state house",), "SL"),
        (("state senate",), "SU"),
        (("us senate", "u.s senate", "u.s. senate"), "USS"),
        (("public service",), "PSC"),
        (("attorney general",), "ATG"),
        (("auditor general",), "AUD"),
        (("treasurer",), "TRE"),
        (("superintendent",), "SUP"),
        (("secretary of state",), "SOS"),
        (("lieutenant governor", "liutenant governor"), "LTG"),
        (("governor",), "GOV"),
        (("commissioner of labor",), "LAB"),
        (("commissioner of agriculture",), "AGR"),
        (("commissioner of insurance",), "INS"),
        (("psc",), "PSC"),
        (("supreme court",), "SSC"),
        (("court of appeals",), "COA"),
    )
    race = next(
        (code for keywords, code in office_codes if any(word in race_string for word in keywords)),
        '',
    )

    digits = ''.join(filter(str.isdigit, race_string))
    two_digit_offices = ['us house', 'u.s. house', 'state senate', 'public service commissioner', 'psc']
    if any(word in race_string for word in two_digit_offices):
        district = digits.zfill(2)
    elif 'state house' in race_string:
        district = digits.zfill(3)
    else:
        district = ''
    return race + district

def get_party(race_string): #order is D first, then R, to account for instances of 'house of REPresentatives'
    """Return the one-letter party code ('D', 'R' or 'N') for a contest or pivot label.

    When the label is a pivot string ("<candidate> -:- <contest>") only the contest part
    is inspected, so a candidate whose name happens to contain "dem" or "rep" cannot
    decide the party. Labels without the separator are inspected in full.
    Returns None when no party marker and no judicial office is found.

    This definition deliberately replaces the earlier ``get_party`` in this notebook,
    which returns the long labels 'Dem' / 'Rep' / 'Nonpartisan'.
    """
    contest_part = race_string.split('-:-', 1)[-1].lower()
    if "dem" in contest_part:
        return "D"
    if "rep" in contest_part:
        return "R"
    if "supreme court" in contest_part or "court of appeals" in contest_part:
        return "N"
    return None
    
def get_name(name_string):
    """Return the first three letters (upper-cased) of the candidate's likely surname.

    The candidate name is everything before the first "-:-". Quotes and commas are
    removed, and the last space-separated token is taken as the surname unless it is
    a generational suffix (II, III, IV, Jr, Sr and dotted / upper-case variants), in
    which case the token before it is used.
    """
    suffixes = {'II', 'III', 'IV', 'Jr', 'Jr.', 'JR', 'JR.', 'Sr', 'Sr.'}

    candidate = name_string.split("-:-")[0]
    for unwanted in ("'", '"', ','):
        candidate = candidate.replace(unwanted, '')
    tokens = candidate.strip().split(" ")

    surname = tokens[-2] if tokens[-1] in suffixes else tokens[-1]
    return surname[:3].upper()

def get_VEST(race_string):
    """Build the short field name for a "<candidate> -:- <contest>" pivot label.

    The name is the concatenation of election type (+year), office code (+district),
    party letter and the first three letters of the surname. Names longer than 10
    characters are printed so they can be reviewed, but are still returned.
    """
    components = (
        get_election_type_year(race_string),
        get_race(race_string),
        get_party(race_string),
        get_name(race_string),
    )
    vest_name = ''.join(components)
    if len(vest_name) > 10:
        print(vest_name)
    return vest_name

Create function that creates dictionary to rename columns in pivoted df

In [32]:
def create_column_rename_dicts(df, exclude_columns):
    """Map every vote column of ``df`` to its short field name and report collisions.

    Returns three dictionaries:
    * ``contest_updates_dict``: column label -> short name (every vote column);
    * ``contest_updates_reversed``: short name -> the FIRST column label that produced it;
    * ``clean_dups``: colliding column label -> the column label that claimed the
      short name first.

    Each collision is also printed (short name, first label, colliding label).
    Columns listed in ``exclude_columns`` are ignored.
    """
    contest_updates_dict = {}
    contest_updates_reversed = {}
    clean_dups = {}

    for column in df.columns:
        if column in exclude_columns:
            continue

        vest_name = get_VEST(column)
        contest_updates_dict[column] = vest_name

        if vest_name in contest_updates_reversed:
            # Short name already taken by an earlier column
            earlier_column = contest_updates_reversed[vest_name]
            print("Duplicate", vest_name)
            print(earlier_column)
            print(column)
            clean_dups[column] = earlier_column
        else:
            contest_updates_reversed[vest_name] = column

    return contest_updates_dict, contest_updates_reversed, clean_dups

In [33]:
#Create dictionaries for renaming pivot columns to short field names.
#exclude_columns lists the precinct identifier columns, which are not vote columns.
#Any short-name collisions are printed by the function and collected in clean_dups.
exclude_columns = ['UNIQUE_ID', 'county', 'COUNTYFP', 'precinct']
contest_updates_dict, contest_updates_reversed, clean_dups = create_column_rename_dicts(ga_22_primary_sw_pvt, exclude_columns)

Duplicate P22LTGDBRO
Tony Brown -:- Lieutenant Governor - Dem
Tyrone Brooks Jr -:- Lieutenant Governor - Dem


In [34]:
# Two Lieutenant Governor - Dem candidates share the surname prefix "BRO"; give each a
# distinct field name built from the surname initial plus the first two letters of the
# first name.
tony_brown_col = 'Tony Brown -:- Lieutenant Governor - Dem'
tyrone_brooks_col = 'Tyrone Brooks Jr -:- Lieutenant Governor - Dem'

contest_updates_dict[tony_brown_col] = 'P22LTGDBTO'
contest_updates_dict[tyrone_brooks_col] = 'P22LTGDBTY'

# Keep the reverse lookup consistent: values must equal the pivot column labels exactly
# (the earlier value 'Tyrone Brooks, Jr ...' contained a comma that no column label has),
# and the ambiguous name no longer refers to any column.
contest_updates_reversed['P22LTGDBTO'] = tony_brown_col
contest_updates_reversed['P22LTGDBTY'] = tyrone_brooks_col
contest_updates_reversed.pop('P22LTGDBRO', None)

In [35]:
#Check that every short field name is between 7 and 10 characters long;
#print the name and the column it came from for anything outside that range.
for item in contest_updates_dict.values():
    if len(item) > 10 or len(item) < 7:
        print(item)
        print(contest_updates_reversed[item])

In [36]:
#apply rename dictionary to pivoted df (columns become the short field names),
#then renumber the rows from 0 without keeping the old index as a column
ga_22_primary_sw_pvt.rename(columns = contest_updates_dict, inplace = True)
ga_22_primary_sw_pvt.reset_index(inplace = True, drop = True)

In [37]:
# The pivot left vote columns as floats (NaN-filled with 0); cast them all back to int
vote_column_dtypes = {vest_name: int for vest_name in contest_updates_dict.values()}
ga_22_primary_sw_pvt = ga_22_primary_sw_pvt.astype(vote_column_dtypes)

In [38]:
#check
ga_22_primary_sw_pvt.head(2)

pivot,UNIQUE_ID,county,COUNTYFP,precinct,PSU38DPET,PSL071DOKA,PSL168DWIL,PSL153DWYN,PSL033RPOW,PCON10RSIM,...,PSL126RHAR,PSL074DHAR,PSL088RFRE,P22LABDBOD,PSL044DOYO,P22AGRDDUK,PSL079DNEA,PCON07RNYG,PSL101RPRO,PSL140DBAK
0,001-1B,Appling,1,1B,0,0,0,0,0,0,...,0,0,0,10,0,5,0,0,0,0
1,001-1C,Appling,1,1C,0,0,0,0,0,0,...,0,0,0,3,0,6,0,0,0,0


In [39]:
#Create list of candidate (vote) columns in the precinct-level df.
#The slice skips the first 4 columns, which are the identifiers UNIQUE_ID, county, COUNTYFP, precinct.
precincts_cols = ga_22_primary_sw_pvt.columns[4:].to_list()
len(precincts_cols) #Number of candidates in precinct-level results, useful to compare in vote totals checking

655

<p><a name="check"></a></p>

## Vote Totals Check
### Statewide

In [40]:
#Read in the Secretary of State statewide summary CSV (source linked at the top of the notebook)
combined_sos_totals = pd.read_csv("./raw-from-source/summary/summary.csv")

Clean contest name

In [41]:
# Trim each contest name and drop the "(Vote For 1)" ballot instruction
def _drop_vote_for_one(contest_name):
    return str(contest_name).strip().replace('(Vote For 1)', '')

combined_sos_totals['contest name'] = combined_sos_totals['contest name'].map(_drop_vote_for_one)

Filter for contests of interest

In [42]:
# Distinct contests in the statewide summary, and the subset matching keywords_list
sos_all_contests = combined_sos_totals['contest name'].unique().tolist()
sos_keep_contests = hlp.contests_to_keep(sos_all_contests, keywords_list)
print(f'All contests:{len(sos_all_contests)}')
print(f'Subset of contests to keep:{len(sos_keep_contests)}')

All contests:515
Subset of contests to keep:406


In [43]:
# #check remaining contests
# sos_other_contests = set(sos_all_contests) - set(sos_keep_contests)
# sos_other_contests

In [44]:
# Keep only the statewide-summary rows for contests of interest
sos_keep_mask = combined_sos_totals['contest name'].isin(sos_keep_contests)
filtered_sos_totals = combined_sos_totals.loc[sos_keep_mask].copy()

Clean columns to match precinct-wise df

In [45]:
#Create pivot column: cleans 'choice name' and 'contest name' the same way as the precinct
#data and adds 'pivot_col'. create_pivot_col works in place, so sos_totals is the same
#object as filtered_sos_totals.
sos_totals = create_pivot_col(filtered_sos_totals, 'choice name', 'contest name', 'pivot_col')

In [46]:
sos_totals['pivot_col'].nunique()

655

Use function to rename VEST columns

In [47]:
#apply get_VEST to each pivot label to derive the short field name, so statewide rows
#can be compared with the renamed precinct columns
sos_totals['VEST'] = sos_totals['pivot_col'].apply(lambda x: get_VEST(str(x).strip()))

In [48]:
# Check
sos_totals.head(1)

Unnamed: 0,line number,contest name,choice name,party name,total votes,percent of votes,registered voters,ballots cast,num Precinct total,num Precinct rptg,over votes,under votes,pivot_col,VEST
0,1,US Senate - Rep,Gary W Black,REP,157370,13.35,0,0,159,159,0,100,Gary W Black -:- US Senate - Rep,P22USSRBLA


Manually adjust LTG D contest VEST naming, to account for two candidates with similar last names

In [49]:
#sos_totals[sos_totals['VEST'] == 'P22LTGDBRO']

In [50]:
# Fix the two Lieutenant Governor - Dem candidates whose surnames both abbreviate to "BRO".
# Rows are selected by candidate name (as the county-level fix in this notebook does)
# instead of hard-coded row labels, which silently break if the summary file is reordered.
ltg_collision = sos_totals['VEST'] == 'P22LTGDBRO'
is_tyrone_brooks = sos_totals['choice name'].str.startswith('Tyrone Brooks')
is_tony_brown = sos_totals['choice name'].str.startswith('Tony Brown')
sos_totals.loc[ltg_collision & is_tyrone_brooks, 'VEST'] = 'P22LTGDBTY'
sos_totals.loc[ltg_collision & is_tony_brown, 'VEST'] = 'P22LTGDBTO'

# Verify the end state (safe to re-run): both new names exist and the ambiguous one is gone
sos_vest_names = set(sos_totals['VEST'])
assert {'P22LTGDBTY', 'P22LTGDBTO'} <= sos_vest_names, "LTG Dem candidates were not found by name"
assert 'P22LTGDBRO' not in sos_vest_names, "Ambiguous P22LTGDBRO name is still present"

In [51]:
print('Number of contests of interest:')
print('Statewide Summary CSV: ',sos_totals['VEST'].nunique())
print('RDH precint file: ', len(precincts_cols))

Number of contests of interest:
Statewide Summary CSV:  655
RDH precint file:  655


Check that VEST names match up 

In [52]:
print('Number of contests in RDH file only:', len(set(precincts_cols) - set(sos_totals['VEST'].unique())))
print('Number of contests in SOS file only:', len(set(sos_totals['VEST'].unique()) - set(precincts_cols)))

Number of contests in RDH file only: 0
Number of contests in SOS file only: 0


In [53]:
# Looking at contests present only in the precinct file, to spot naming errors.
# For each such field name, print it, the pivot label it came from, and one sample row
# of the precinct data. Prints nothing when the two sets of names already agree.
vest_issues_list = set(precincts_cols) - set(sos_totals['VEST'].unique())
for item in vest_issues_list:
    print(item)
    print(contest_updates_reversed[item])
    #display(sos_totals[sos_totals['VEST'] == item])
    display(ga_22_primary_sw[ga_22_primary_sw['pivot'] == contest_updates_reversed[item]].head(1))

In [54]:
#Final check to make sure columsn to compare are the same across precinctlevel df and statewide df
set(precincts_cols) == set(sos_totals['VEST'].unique())

True

Check rdh data against state summary

In [55]:
# Compare each candidate's precinct-level total with the official statewide total.
statewide_check_list = []  # field names whose totals differ
doesnt_check = []          # field names with no row in the statewide summary
for item in contest_updates_dict.values():
    official_ls = list(sos_totals.loc[sos_totals["VEST"] == item, "total votes"])
    if len(official_ls) < 1:
        # No official figure to compare against. Skip the comparison; previously the
        # loop fell through and compared against the PREVIOUS candidate's total
        # (or raised NameError on the first iteration).
        doesnt_check.append(item)
        continue
    official = official_ls[0]
    rdh = ga_22_primary_sw_pvt[item].sum()
    if official != rdh:
        statewide_check_list.append(item)
        print(contest_updates_reversed[item])
        print(f"{item}\n\tOfficial: {official}\n\tRDH: {rdh}")

Anne Elizabeth Barnes -:- Court of Appeals - Nonpartisan
G22COANBAR
	Official: 1626523
	RDH: 1629284
Carla McMillian -:- Supreme Court - Nonpartisan
G22SSCNMCM
	Official: 1626408
	RDH: 1629156
Chris McFadden -:- Court of Appeals - Nonpartisan
G22COANMCF
	Official: 1620325
	RDH: 1623087
Darlene Taylor -:- State House - District 173 - Rep
PSL173RTAY
	Official: 1730
	RDH: 6746
Keith L Jenkins Sr -:- State House - District 173 - Dem
PSL173DJEN
	Official: 650
	RDH: 2781
Sanford Bishop -:- US House - District 2 - Dem
PCON02DBIS
	Official: 54991
	RDH: 57396
Shawn Ellen LaGrua -:- Supreme Court - Nonpartisan
G22SSCNLAG
	Official: 1623542
	RDH: 1626330
Trea Pipkin -:- Court of Appeals - Nonpartisan
G22COANPIP
	Official: 1606449
	RDH: 1609183
Verda M Colvin -:- Supreme Court - Nonpartisan
G22SSCNCOL
	Official: 1168175
	RDH: 1170137
Veronica Brinson -:- Supreme Court - Nonpartisan
G22SSCNBRI
	Official: 541628
	RDH: 542561


### Checking against official County level results

Read in County level summary XML file from Georgia SOS office.

In [56]:
# Parse the SOS county-level detail XML file(s) into one long frame:
# one row per contest x choice x voting method x county.
county_check_dir = "./raw-from-source/summary/county_checks"
loaded_counties = sorted(os.listdir(county_check_dir))  # sorted for a reproducible row order
z = []
for locale in loaded_counties:
    if not locale.endswith('.xml'):
        continue
    xroot = et.parse(os.path.join(county_check_dir, locale)).getroot()

    # Text of the last <Region> element; None if the file has none
    state = None
    for region_el in xroot.findall(".//Region"):
        state = region_el.text

    # Election type depends only on the file name, so decide it once per file
    elec_type = "special" if locale == "detail 2.xml" else "primary"

    # Element variables get their own names so the `contests` DataFrame built earlier
    # in the notebook is not overwritten by a list of XML elements.
    for contest_el in xroot.findall(".//Contest"):
        contest = contest_el.attrib.get('text')
        for choice_el in contest_el.findall("./Choice"):
            choice = choice_el.attrib.get('text')
            for vote_type_el in choice_el.findall("./VoteType"):
                voting_method = vote_type_el.attrib.get('name')
                for county_el in vote_type_el.findall("./County"):
                    county_name = county_el.attrib.get('name')
                    num_votes = county_el.attrib.get('votes')
                    z.append([state, contest, choice, voting_method, county_name, num_votes, elec_type])
dfcols = ['state','contest','choice','voting_method','county','num_votes',"type"]
df_county = pd.DataFrame(z,columns=dfcols)

Clean County Level SOS results

In [57]:
# Distinct contests in the county-level file, and the subset matching keywords_list
soscnty_all_contests = df_county['contest'].unique().tolist()
soscnty_keep_contests = hlp.contests_to_keep(soscnty_all_contests, keywords_list)
print(f'All contests: {len(soscnty_all_contests)}')
print(f'Subset of contests to keep: {len(soscnty_keep_contests)}')

All contests: 515
Subset of contests to keep: 406


In [58]:
# Keep the contests of interest; votes arrive as text from the XML, so cast them to int
county_keep_mask = df_county['contest'].isin(soscnty_keep_contests)
filtered_county_totals = df_county.loc[county_keep_mask].copy()
filtered_county_totals["num_votes"] = filtered_county_totals["num_votes"].astype(int)

In [59]:
# add FIPS column for each county; create_fips_col modifies its input in place, so
# sos_county_fips and filtered_county_totals are the same object afterwards
sos_county_fips = create_fips_col("./FIPS/US_FIPS_Codes.csv", 'Georgia', filtered_county_totals, 'county')

In [60]:
# add 'pivot' col ("<candidate> -:- <contest>") after cleaning the 'choice' and 'contest' text
sos_county_pvt = create_pivot_col(sos_county_fips, 'choice', 'contest', 'pivot')
# use pivot col to add column with the short field names, for comparison with the precinct df
sos_county_pvt['VEST'] = sos_county_pvt['pivot'].apply(lambda x: get_VEST(str(x).strip()))

In [61]:
# Manually edit the LTG D field names for the two candidates whose surnames both
# abbreviate to "BRO". Rows are selected by the cleaned candidate name.
ty_mask = sos_county_pvt['choice'] == 'Tyrone Brooks Jr'
to_mask = sos_county_pvt['choice'] == 'Tony Brown'
sos_county_pvt.loc[ty_mask, 'VEST'] = 'P22LTGDBTY'
sos_county_pvt.loc[to_mask, 'VEST'] = 'P22LTGDBTO'

In [62]:
# Check if there are 655 unique candidates in the VEST column
sos_county_pvt['VEST'].nunique() == 655

True

In [63]:
#Check if VEST contest names are the same as precinct df
set(precincts_cols) == set(sos_county_pvt['VEST'])

True

In [64]:
# Pivot the SOS county data: one row per county, one column per short field name, summing
# votes across voting methods. aggfunc is the string "sum" because passing the builtin
# `sum` is deprecated in recent pandas; the aggregation itself is unchanged.
sos_county_totals_pvt = pd.pivot_table(
    sos_county_pvt,
    index=['county'],
    columns=['VEST'],
    values=['num_votes'],
    aggfunc="sum",
)
# Counties where a contest was not reported get 0 instead of NaN
sos_county_totals_pvt = sos_county_totals_pvt.fillna(0)
# Drop the outer 'num_votes' column level and restore 'county' as an ordinary column
sos_county_totals_pvt.columns = sos_county_totals_pvt.columns.droplevel(0)
sos_county_totals_pvt.reset_index(inplace = True)

#### Check precinct DF against SOS county level df

In [65]:
#check by contest: compare county-level sums of the precinct data (RDH) with the SOS
#county-level totals for every candidate column, using the project helper.
#Note: `rdh` is rebound here to the precinct frame (no copy is made); an earlier cell
#used the same name for a scalar total.
rdh = ga_22_primary_sw_pvt
sos = sos_county_totals_pvt
partner_name = 'SOS'
source_name = 'RDH'
county_col = 'county'
hlp.county_totals_check(sos,partner_name, rdh, source_name, precincts_cols, county_col,full_print=False, method='county')

***Countywide Totals Check***

Cook contains differences in these races:
	G22COANBAR has a difference of -2761.0 vote(s)
		SOS: 0.0 vote(s)
		RDH: 2761 vote(s)
	G22SSCNMCM has a difference of -2773.0 vote(s)
		SOS: 0.0 vote(s)
		RDH: 2773 vote(s)
	G22COANMCF has a difference of -2762.0 vote(s)
		SOS: 0.0 vote(s)
		RDH: 2762 vote(s)
	G22SSCNLAG has a difference of -2763.0 vote(s)
		SOS: 0.0 vote(s)
		RDH: 2763 vote(s)
	G22COANPIP has a difference of -2734.0 vote(s)
		SOS: 0.0 vote(s)
		RDH: 2734 vote(s)
	G22SSCNCOL has a difference of -1962.0 vote(s)
		SOS: 0.0 vote(s)
		RDH: 1962 vote(s)
	G22SSCNBRI has a difference of -933.0 vote(s)
		SOS: 0.0 vote(s)
		RDH: 933 vote(s)
Grady contains differences in these races:
	PCON02DBIS has a difference of -1040.0 vote(s)
		SOS: 0.0 vote(s)
		RDH: 1040 vote(s)
Mitchell contains differences in these races:
	PCON02DBIS has a difference of -1365.0 vote(s)
		SOS: 0.0 vote(s)
		RDH: 1365 vote(s)
Pickens contains differences in these races:
	G22SSCNMCM 

<p><a name="discrepancies"></a></p>

### Addressing Discrepancies

##### Missing Votes
In all instances where the SOS county-level file had no reported votes in a contest in a specific county, vote numbers from the precinct-level compiled dataset are presumed to be accurate. 

1. SOS had no reported votes for the following judicial races in Cook County
    - Court of Appeals (G22COANBAR, G22COANMCF, G22COANPIP)
    - State Supreme Court (G22SSCNMCM, G22SSCNLAG, G22SSCNCOL, G22SSCNBRI)
2. SOS had no reported votes for PCON02 Democratic candidates in Grady and Mitchell Counties
    - PCON02DBIS - missing votes in Grady and Mitchell
    - PCON02DOHA - no data reported in Grady and Mitchell county
3. SOS had no reported votes from Thomas County in PSL173 Democratic and Republican contests
    - PSL173RTAY - missing votes in this contest
    - PSL173DJEN - missing votes in this contest
    
##### Vote Swapping
    
The vote discrepancies in Pickens and Whitfield county seem to indicate unintentional swapping of 9 and 16 votes in each county respectively. We are unable to determine whether this error originated at the precinct, or county level.
Given the differences found in other contests in the Secretary of State's county-level file, we are unable to support its accuracy. We have chosen to leave the precinct-level vote totals unaltered in these contests.

1. Pickens County
    - 9 votes swapped between G22SSCNMCM and G22SSCNLAG contests
2. Whitfield County
    - 16 votes swapped between G22SSCNMCM and G22SSCNLAG contests

<p><a name="readme"></a></p>

### Create README

In [66]:
# Build the ordered list of standardized contest names for the non-districted offices:
# offices in the fixed order below, contests within an office sorted alphabetically.
statewide_office_order = [
    'US Senate', 'Governor', 'Lieutenant Governor', 'Attorney General', 'Secretary of State',
    'Commissioner of Agriculture', 'Commissioner of Insurance', 'Commissioner of Labor',
    'Public Service Commissioner', 'State School Superintendent', 'Supreme Court', 'Court of Appeals',
]
contests_order = []
for office in statewide_office_order:
    office_rows = contests_stnd.loc[contests_stnd['race'] == office]
    contests_order.extend(sorted(office_rows['new_race'].unique().tolist()))
#check
#len(contests_order) == contests_stnd['new_race'].nunique()

In [67]:
# Append the districted offices to contests_order, each ordered by district number.
# Note: this extends the list built in the previous cell, so re-running this cell on
# its own appends the same contests again.
for office in ['US House', 'State Senate', 'State House']:
    office_rows = contests_stnd.loc[contests_stnd['race'] == office].copy()
    # Compare districts as integers so that e.g. 10 sorts after 9
    office_rows['district'] = office_rows['district'].astype(int)
    contests_order += office_rows.sort_values('district')['new_race'].unique().tolist()

In [68]:
# Create df with total votes per candidate (summed over all precincts and voting methods)
rm_df = ga_22_primary_sw.groupby(['stnd_contest', 'choice', 'pivot'])['num_votes'].sum().reset_index()
#apply rename dictionary to get the short field names; any pivot label missing from the
#dictionary becomes the integer 0 rather than NaN
rm_df['VEST'] = rm_df['pivot'].map(contest_updates_dict).fillna(0)

In [69]:
rm_df.head(2)

Unnamed: 0,stnd_contest,choice,pivot,num_votes,VEST
0,Attorney General - Dem,Christian Wise Smith,Christian Wise Smith -:- Attorney General - Dem,153928,P22ATGDSMI
1,Attorney General - Dem,Jennifer Jen Jordan,Jennifer Jen Jordan -:- Attorney General - Dem,533266,P22ATGDJOR


In [70]:
# Order the field names for the README and final dataset: contests in contests_order,
# and within each contest candidates from most to fewest total votes
rm_order = []
for contest_name in contests_order:
    contest_rows = rm_df.loc[rm_df['stnd_contest'] == contest_name]
    ranked_rows = contest_rows.sort_values('num_votes', ascending=False)
    rm_order.extend(ranked_rows['VEST'].to_list())
# check that every candidate column of the precinct data is included
set(rm_order) == set(precincts_cols)

True

In [71]:
#Create order column, mapping ordering list to field names
rm_df['Order'] = rm_df['VEST'].map(lambda x: rm_order.index(x))
#order the readme DF
rm_df = rm_df.sort_values('Order')

def _surname_first(full_name):
    """Format "First Middle Last [Suffix]" as "Last [Suffix], First Middle".

    A trailing generational suffix (Jr, Sr, II, III, IV) stays attached to the surname;
    previously the suffix itself was treated as the surname ("Jr, Tyrone Brooks").
    """
    name_suffixes = {'JR', 'SR', 'II', 'III', 'IV'}
    tokens = str(full_name).split()
    if not tokens:
        return ', '
    suffix = []
    if len(tokens) > 1 and tokens[-1].rstrip('.').upper() in name_suffixes:
        suffix = [tokens.pop()]
    surname = ' '.join([tokens[-1]] + suffix)
    given_names = ' '.join(tokens[:-1])
    return surname + ', ' + given_names

#Create field name column
rm_df['candidate'] = rm_df['choice'].apply(_surname_first)
rm_df['description'] = rm_df['candidate'] + ' - ' + rm_df['stnd_contest']
#create fields_dict: short field name -> human-readable description for the README
fields_dict = dict(zip(rm_df['VEST'], rm_df['description']))

In [72]:
### Create README

# Describe the identifier columns alongside the candidate fields
fields_dict.update({
    'UNIQUE_ID': 'Unique ID for each precinct',
    'COUNTYFP': 'County FIP identifier',
    'county': 'County Name',
    'precinct': 'Precinct Name',
})

# README metadata
title = "Georgia 2022 Primary Election Precinct-Level Results"
retrieval_date = "09/13/23"
github_link = "https://github.com/nonpartisan-redistricting-datahub/pber_collection/tree/main/GA/2022"
file_folder = "./"
source = "Georgia Secretary of State"

In [73]:
# Make sure the output folder exists (also creates missing parents; no error if present)
os.makedirs(file_folder, exist_ok=True)

# Build the text BEFORE opening the file, so a failure while generating it cannot
# leave an empty, truncated README behind
readme_text = hlp.full_readme_text(title, retrieval_date, source, fields_dict, github_link)
with open(os.path.join(file_folder, "README.txt"), 'w') as tf:
    tf.write(readme_text)

<p><a name="exp"></a></p>

### Export Cleaned Precinct Level Dataset

In [74]:
# Put the identifier columns first in the export order. Identifier names already in
# rm_order are filtered out first, so re-running this cell does not prepend them twice.
id_columns = ['UNIQUE_ID', 'COUNTYFP', 'county', 'precinct']
rm_order = id_columns + [col for col in rm_order if col not in id_columns]

In [75]:
#checks: the export order must contain exactly the frame's columns, each once.
#Both conditions are combined so neither result is silently discarded (a cell only
#displays its last expression).
len(rm_order) == len(ga_22_primary_sw_pvt.columns) and set(rm_order) == set(ga_22_primary_sw_pvt.columns)

True

In [76]:
#reorder df columns: identifiers first, then candidates by contest order and vote total
ga_22_primary_sw_pvt = ga_22_primary_sw_pvt[rm_order]

In [77]:
ga_22_primary_sw_pvt.head(2)

pivot,UNIQUE_ID,COUNTYFP,county,precinct,P22USSDWAR,P22USSDJOH,P22USSRWAL,P22USSRBLA,P22USSRSAD,P22USSRCLA,...,PSL175RBLA,PSL176RBUR,PSL177DSHA,PSL178RMEE,PSL178RCAR,PSL179RTOW,PSL179RDUN,PSL179RKIL,PSL180RSAI,PSL180RSMI
0,001-1B,1,Appling,1B,32,1,411,78,3,24,...,0,0,0,0,0,0,0,0,0,0
1,001-1C,1,Appling,1C,18,2,317,67,4,9,...,0,0,0,36,6,0,0,0,0,0


In [78]:
# Create the export folder if needed (no error if it already exists), then write the
# cleaned precinct-level dataset without the row index
export_dir = "./ga_22_primary_prec/"
os.makedirs(export_dir, exist_ok=True)

ga_22_primary_sw_pvt.to_csv("./ga_22_primary_prec/ga_22_primary_prec.csv", index = False)