In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

In [2]:
def attrib_extractor(xml, attrib):
    """Look up a single attribute on an XML element.

    Parameters
    ----------
    xml : element object exposing an ``attrib`` mapping
    attrib : name of the attribute to read

    Returns
    -------
    The attribute's value, or ``None`` when the element does not carry it.
    """
    attributes = xml.attrib
    if attrib in attributes:
        return attributes[attrib]
    return None

In [3]:
# Parse the raw XML results file and keep its root element for traversal.
tree = ET.parse('raw_xml/20180213__ga__special.xml')
root = tree.getroot()

In [4]:
# Flatten the Contest > Choice > VoteType > child hierarchy into one dict per
# innermost child element, collecting them in `results`.
results = []
for contest_node in root.findall('Contest'):
    race_name = attrib_extractor(contest_node, 'text')
    for choice_node in contest_node.findall('Choice'):
        choice_label = attrib_extractor(choice_node, 'text')
        for vote_type_node in choice_node.findall('VoteType'):
            vote_type_name = attrib_extractor(vote_type_node, 'name')
            # Every child of a VoteType element supplies the 'county' name
            # and the 'votes' value for one output row.
            results.extend(
                {
                    'race': race_name,
                    'candidate_party': choice_label,
                    'vote_type': vote_type_name,
                    'county': attrib_extractor(county_node, 'name'),
                    'votes': attrib_extractor(county_node, 'votes'),
                }
                for county_node in vote_type_node
            )

In [5]:
# Build a DataFrame from the list of row dicts; each dict key becomes a column.
df = pd.DataFrame(results)

In [6]:
# Preview the first rows to check that the parsed columns look right.
df.head()

Unnamed: 0,candidate_party,county,race,vote_type,votes
0,TREVA GEAR (DEM),Brooks,"State Representative, District 175",Election Day,170
1,TREVA GEAR (DEM),Lowndes,"State Representative, District 175",Election Day,168
2,TREVA GEAR (DEM),Thomas,"State Representative, District 175",Election Day,139
3,TREVA GEAR (DEM),Brooks,"State Representative, District 175",Absentee by Mail,16
4,TREVA GEAR (DEM),Lowndes,"State Representative, District 175",Absentee by Mail,4


In [7]:
# (rows, columns) of the parsed frame.
df.shape

(48, 5)

In [8]:
# The vote counts were read from XML attributes; cast them to int so they
# can be summed numerically in the aggregations below.
df['votes'] = df['votes'].astype(int)

In [9]:
# Check dtypes and non-null counts after the cast.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 5 columns):
candidate_party    48 non-null object
county             48 non-null object
race               48 non-null object
vote_type          48 non-null object
votes              48 non-null int64
dtypes: int64(1), object(4)
memory usage: 2.0+ KB


In [10]:
# Sanity check: total votes per candidate_party label across all rows.
df.groupby(['candidate_party'])['votes'].sum()

candidate_party
BRUCE PHELPS (REP)      75
COY REAVES (REP)       117
JOHN LAHOOD (REP)     2355
TREVA GEAR (DEM)       784
Name: votes, dtype: int64

In [11]:
# Sanity check: total votes per race across all rows.
df.groupby(['race'])['votes'].sum()

race
State Representative, District 175    3331
Name: votes, dtype: int64

In [12]:
# Split labels such as "TREVA GEAR (DEM)" into a candidate name and a party.
#
# The trailing "(CODE)" is located with a regular expression rather than a
# fixed-width slice, so codes of any length are handled and the candidate name
# carries no trailing space. Labels without a parenthesised code keep the whole
# label as the candidate name.
party_names = {'REP': 'Republican', 'DEM': 'Democrat'}
name_party = df['candidate_party'].str.extract(r'^\s*(.*?)\s*\(([^()]*)\)\s*$', expand=True)
df['candidate'] = name_party[0].fillna(df['candidate_party']).str.strip()
party_code = name_party[1].str.strip()
# Codes missing from `party_names` fall back to the raw code, and labels with
# no code get an empty string. Leaving NaN here would make groupby() silently
# drop those candidates, because NaN group keys are excluded.
df['party'] = party_code.map(party_names).fillna(party_code).fillna('')

In [13]:
# Derive 'office' and 'district' from race labels such as
# "State Representative, District 175". The label is split on commas once and
# the pieces are reused for both columns.
race_parts = df['race'].str.split(',')
df['office'] = race_parts.str.get(0).str.strip()
# A race label with no comma has no second piece, so .str.get(1) yields NaN.
# Replace it with an empty string so those rows are not silently dropped by
# groupby(), which excludes NaN group keys. Whitespace is stripped so the
# district is just the number text.
df['district'] = (
    race_parts.str.get(1)
    .fillna('')
    .str.replace('District', '')
    .str.strip()
)

In [14]:
# Preview: votes summed for each county/office/district/candidate/party
# combination (collapsing the remaining columns such as vote_type).
df.groupby(['county', 'office', 'district', 'candidate', 'party'], as_index=False)['votes'].sum()

Unnamed: 0,county,office,district,candidate,party,votes
0,Brooks,State Representative,175,BRUCE PHELPS,Republican,27
1,Brooks,State Representative,175,COY REAVES,Republican,72
2,Brooks,State Representative,175,JOHN LAHOOD,Republican,901
3,Brooks,State Representative,175,TREVA GEAR,Democrat,262
4,Lowndes,State Representative,175,BRUCE PHELPS,Republican,35
5,Lowndes,State Representative,175,COY REAVES,Republican,23
6,Lowndes,State Representative,175,JOHN LAHOOD,Republican,1030
7,Lowndes,State Representative,175,TREVA GEAR,Democrat,303
8,Thomas,State Representative,175,BRUCE PHELPS,Republican,13
9,Thomas,State Representative,175,COY REAVES,Republican,22


In [15]:
# Aggregate votes per county/office/district/candidate/party.
# Note: this rebinds `results`, which until now held the list of parsed row
# dicts, to the aggregated DataFrame.
group_keys = ['county', 'office', 'district', 'candidate', 'party']
results = df.groupby(group_keys, as_index=False)['votes'].sum()

In [16]:
# Strip leading/trailing whitespace from the text columns of the aggregate.
for text_column in ('office', 'district', 'candidate'):
    results[text_column] = results[text_column].str.strip()

In [17]:
# Select the output columns in their final order, sort the rows, and copy the
# result to the clipboard as comma-separated text without the index.
sort_columns = ['office', 'district', 'candidate', 'county']
export_columns = ['county', 'office', 'district', 'party', 'candidate', 'votes']
export_frame = results[export_columns].sort_values(by=sort_columns)
export_frame.to_clipboard(index=False, sep=',')