# Pre-processing Georgia XML County-level Election Data

In [1]:
%matplotlib inline
import math
import numpy as np
import scipy as sp
import scipy.optimize
from scipy.stats import hypergeom, binom, norm
from scipy import special
from cryptorandom.cryptorandom import SHA256
from cryptorandom import sample
from permute.utils import binom_conf_interval
import matplotlib.pyplot as plt
import pandas as pd
from lxml import etree
import csv

# Path (relative to this notebook) of the XML results file parsed below.
elec_fn = '../../Data/detail.xml'

In [2]:
# Parse the XML results file at `elec_fn` with lxml; `elec` is the parsed tree
# that the XPath queries in later cells run against.
elec = etree.parse(elec_fn)

In [3]:
# Root element of the parsed document; iterated over later to walk every Contest.
e_root = elec.getroot()

Example record:

  <ElectionResult>
    <Timestamp>11/9/2018 8:13:23 PM EST</Timestamp>
    <ElectionName>November 6, 2018 - General Election</ElectionName>    
    <ElectionDate>11/6/2018</ElectionDate>    
    <Region>Richmond</Region>    
    
    <VoterTurnout ballotsCast="70355" totalVoters="122747" voterTurnout="57.32">
      <Precincts>            
        <Precinct ballotsCast="536" name="101" percentReporting="4" totalVoters="830" voterTurnout="64.58"/>
      </Precincts>    
    </VoterTurnout>

<Contest isQuestion="false" key="7" precinctsReported="68" precinctsReporting="68" text="Governor">  
        
    <VoteType name="Number of Precincts for Race" votes="68">
    
    <Choice key="40" text="BRIAN KEMP  (REP)" totalVotes="22075">
                    
        <VoteType name="Election Day" votes="13178">
           <Precinct name="101" votes="160"/>                      
        </VoteType>
             
        <VoteType name="Absentee By Mail" votes="1174">
          <Precinct name="101" votes="67"/>                      
        </VoteType>
                  
        <VoteType name="Advance in Person" votes="7706">               
          <Precinct name="101" votes="67"/>          
        </VoteType>
        
            
        <VoteType name="Provisional" votes="17">     
          <Precinct name="101" votes="0"/>
        </VoteType>   
      </Choice>
</Contest>


<ElectionResult>
    <Contest key="20000">
        <Choice key="40">
            <VoteType name="Election Day" votes="">
                <County name="" votes="" />
                ...
            </VoteType>
            <VoteType name="Absentee by Mail" votes="">
                <County name="" votes="" />
                ...
            </VoteType>
            <VoteType name="Advance in Person" votes="">
                <County name="" votes="" />
                ...
            </VoteType>
        </Choice>
    </Contest>
</ElectionResult>

In [4]:
# Sanity check: statewide vote total for a single candidate.
# Contest key=20000 is Governor, Choice key=40 is Kemp.
# The XPath sum() adds up the "votes" attribute of every County element found
# anywhere beneath that Choice, i.e. across all of its VoteType children.

elec.xpath("sum(Contest[@key='20000']/Choice[@key='40']//County/@votes)")

# Alternative queries kept for reference (not executed):
# elec.xpath("Contest[@text='Governor']/Choice[@text='BRIAN KEMP  (REP)']//County[@votes]")
# elec.find('//Precinct')

1978408.0

## Create a CSV file with total ballots cast in each county

Verified matching between CSV, XML, and GA website on 1/4/2019 by KO:
* totals for Appling, Atkinson, Bacon, Baker, Baldwin match
* total ballots cast statewide 3,949,905 matches

In [5]:
# Per-county turnout records; the "name" and "ballotsCast" attributes of each
# County element are read below.
totals = elec.xpath("ElectionVoterTurnout/Counties/County")

# Running sum of the per-county ballot counts, checked against the reported
# statewide figure once the file has been written.
total = 0

# Write one CSV row per county. The context manager guarantees the file is
# flushed and closed even if a record is missing an attribute or holds a
# non-numeric count; newline='' is what the csv module asks for so that it
# controls line endings itself.
with open('../../Data/total_ballots_by_county.csv', 'w', newline='') as total_ballots_by_county:
    csvwriter = csv.writer(total_ballots_by_county)
    csvwriter.writerow(["County", "Ballots cast"])
    for v in totals:
        csvwriter.writerow([v.attrib["name"], v.attrib["ballotsCast"]])
        total += int(v.attrib["ballotsCast"])

# assert that the total by county, summed, equals the reported total

assert total == int(elec.xpath("ElectionVoterTurnout/@ballotsCast")[0]), \
    "sum of per-county ballots cast does not match the reported statewide total"

## Create a CSV file with reported votes by county by contest by candidate

Verified matching between CSV, XML, and GA website on 1/4/2019:

* Votes for Kemp in Bryan County, broken out by vote type, match in CSV and XML. The sum matches the website total 10,507.
* Votes for Geoff Duncan (Lt. Gov) in Ware County, broken out by vote type, match in CSV and XML. The sum matches the website total 7,619.

In [6]:
# Flatten the XML into one CSV row per (contest, candidate, vote type, county).
# The context manager closes the file even if an element lacks an expected
# attribute; newline='' lets the csv module control line endings itself.
with open('../../Data/votes_by_candidate_county.csv', 'w', newline='') as votes:
    csvwriter = csv.writer(votes)
    csvwriter.writerow(["Contest", "Candidate", "Vote type", "County", "Ballots cast"])

    for contest in e_root.iter("Contest"):
        contest_name = contest.attrib["text"]
        for choice in contest.iter("Choice"):
            candidate = choice.attrib["text"]
            for votetype in choice.iter("VoteType"):
                val = votetype.attrib["name"]
                # Each child element of a VoteType supplies one row: its
                # "name" goes in the County column, its "votes" in Ballots cast.
                for v in votetype:
                    csvwriter.writerow([contest_name, candidate, val, v.attrib["name"], v.attrib["votes"]])

## Create CSV file with undervotes in the down-ticket statewide contests

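Undervotes are counted *relative* to the number of ballots cast in the Governor's race. (We have total voter turnout, but it isn't broken out by vote type. Ballots cast for Governor are broken out by vote type.) In the code below, the "Total ballots" figure for each county and vote type is computed as the maximum of the reported vote totals across the statewide contests (Governor included), so it equals the Governor's-race total wherever that contest received the most votes.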

Checked on 1/4/19:
* No negative undervote counts
* For each Vote Type in each County, the Total Ballots is the same for every Contest

In [7]:
# Reload the two CSV files written by the earlier cells as DataFrames.
# NOTE(review): `total_ballots_cast` is not referenced by the later cells shown
# in this notebook — confirm whether it is still needed.
total_ballots_cast = pd.read_csv('../../Data/total_ballots_by_county.csv')
reported_votes = pd.read_csv('../../Data/votes_by_candidate_county.csv')

In [8]:
# Preview the first rows of the per-candidate vote table.
reported_votes.head()

Unnamed: 0,Contest,Candidate,Vote type,County,Ballots cast
0,Governor,BRIAN KEMP (REP),Election Day,Appling,2334
1,Governor,BRIAN KEMP (REP),Election Day,Atkinson,808
2,Governor,BRIAN KEMP (REP),Election Day,Bacon,609
3,Governor,BRIAN KEMP (REP),Election Day,Baker,409
4,Governor,BRIAN KEMP (REP),Election Day,Baldwin,3054


In [9]:
# Contests kept for the undervote analysis. "Governor" is listed first on
# purpose: a later cell slices it off with [1:] to get the down-ticket races.
statewide_contests = np.array([
    "Governor",
    "Lieutenant Governor",
    "Secretary Of State",
    "Attorney General",
    "Commissioner Of Agriculture",
    "Commissioner Of Insurance",
    "State School Superintendent",
    "Commissioner Of Labor",
    "Public Service Commission, District 3 - Metro-Atlanta",
    "Public Service Commission, District 5 - Western",
])

# Keep only rows belonging to the contests listed above.
reported_votes = reported_votes[reported_votes["Contest"].isin(statewide_contests)]

# Sum over candidates to get votes per contest, county and vote type, returned
# as a flat frame with the grouping keys as ordinary columns.
reported_votes_by_contest = (
    reported_votes
    .groupby(["Contest", "County", "Vote type"])["Ballots cast"]
    .sum()
    .reset_index()
)
reported_votes_by_contest.head()

Unnamed: 0,Contest,County,Vote type,Ballots cast
0,Attorney General,Appling,Absentee by Mail,519
1,Attorney General,Appling,Advance in Person,3180
2,Attorney General,Appling,Election Day,2860
3,Attorney General,Appling,Provisional,3
4,Attorney General,Atkinson,Absentee by Mail,88


In [10]:
# Governor's-race totals per county and vote type, with the vote count column
# relabelled "Total ballots" and the now-constant "Contest" column removed.
# NOTE(review): `gov_race` is not referenced by the later cells shown in this
# notebook — confirm whether it is still needed.
gov_race = (
    reported_votes_by_contest
    .loc[lambda frame: frame["Contest"] == "Governor"]
    .rename(columns={'Ballots cast': 'Total ballots'})
    .drop(columns=["Contest"])
)
gov_race.head()

Unnamed: 0,County,Vote type,Total ballots
2544,Appling,Absentee by Mail,530
2545,Appling,Advance in Person,3298
2546,Appling,Election Day,2978
2547,Appling,Provisional,3
2548,Atkinson,Absentee by Mail,88


In [11]:
# For every (county, vote type) pair, take the largest vote total reported in
# any of the statewide contests and label it "Total ballots".
# NOTE(review): the section text describes totals as ballots cast in the
# Governor's race; this takes the maximum across all statewide contests —
# confirm which is intended.
max_votes_estimated = (
    reported_votes_by_contest
    .groupby(["County", "Vote type"])["Ballots cast"]
    .max()
    .reset_index()
    .rename(columns={'Ballots cast': 'Total ballots'})
)

In [12]:
# Build the undervote table: one frame per down-ticket contest, concatenated
# once at the end (avoids re-copying a growing frame on every iteration and
# avoids concatenating onto an empty DataFrame).
undervote_frames = []
# statewide_contests[0] is "Governor"; only the down-ticket contests are used.
for contest in statewide_contests[1:]:
    this_race = reported_votes_by_contest["Contest"]==contest
    # Join the per-(county, vote type) totals to this contest's vote counts.
    # The keys are spelled out so an unrelated shared column name can never
    # silently become part of the join.
    merged_votes_contest = pd.merge(
        max_votes_estimated,
        reported_votes_by_contest[this_race],
        on=["County", "Vote type"],
    )
    merged_votes_contest["Undervotes"] = merged_votes_contest["Total ballots"] - merged_votes_contest["Ballots cast"]
    undervote_frames.append(merged_votes_contest)

merged_votes = pd.concat(undervote_frames)

# Automates the "no negative undervote counts" check described in the text.
assert (merged_votes["Undervotes"] >= 0).all(), "negative undervote count found"

merged_votes.head()

Unnamed: 0,County,Vote type,Total ballots,Contest,Ballots cast,Undervotes
0,Appling,Absentee by Mail,530,Lieutenant Governor,523,7
1,Appling,Advance in Person,3298,Lieutenant Governor,3092,206
2,Appling,Election Day,2978,Lieutenant Governor,2768,210
3,Appling,Provisional,3,Lieutenant Governor,3,0
4,Atkinson,Absentee by Mail,88,Lieutenant Governor,88,0


In [13]:
# Save the undervote table; index=False leaves the DataFrame index out of the file.
merged_votes.to_csv('../../Data/undervotes_by_county.csv', index=False)

In [14]:
# version information
# Records the interpreter and package versions used for this run; requires the
# `version_information` IPython extension to be available.
# NOTE(review): lxml, which parses the XML in this notebook, is not in the
# list below — consider adding it.
%load_ext version_information
%version_information scipy, numpy, csv, pandas, matplotlib, notebook, cryptorandom, permute

Loading extensions from ~/.ipython/extensions is deprecated. We recommend managing extensions like any other Python packages, in site-packages.




Software,Version
Python,3.6.7 64bit [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
IPython,7.2.0
OS,Darwin 18.2.0 x86_64 i386 64bit
scipy,1.1.0
numpy,1.15.4
csv,1.0
pandas,0.23.1
matplotlib,3.0.2
notebook,5.7.4
cryptorandom,0.2
