In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

In [9]:
def attrib_extractor(xml, attrib):
    """Look up one attribute on an XML element.

    Parameters
    ----------
    xml : object exposing an ``attrib`` mapping (e.g. an ElementTree Element)
    attrib : name of the attribute to read

    Returns
    -------
    The attribute's value, or ``None`` when the element does not carry it.
    """
    attributes = xml.attrib
    if attrib in attributes:
        return attributes[attrib]
    return None

In [10]:
# Location of the raw results XML, relative to the notebook's working directory.
xml_path = 'raw_xml/20180109__ga__special.xml'

# Parse the whole document and keep a handle on its root element, which the
# extraction cell walks.
tree = ET.parse(xml_path)
root = tree.getroot()

In [11]:
# Flatten the XML hierarchy into one record per innermost element.
# Nesting walked here: root -> Contest -> Choice -> VoteType -> child element.
# Each child of a VoteType supplies the ``name`` (stored as county) and
# ``votes`` attributes; the enclosing levels supply race, candidate/party
# label and vote type. Missing attributes come through as None.
results = [
    {
        'race': attrib_extractor(contest_el, 'text'),
        'candidate_party': attrib_extractor(choice_el, 'text'),
        'vote_type': attrib_extractor(vote_type_el, 'name'),
        'county': attrib_extractor(county_el, 'name'),
        'votes': attrib_extractor(county_el, 'votes'),
    }
    for contest_el in root.findall('Contest')
    for choice_el in contest_el.findall('Choice')
    for vote_type_el in choice_el.findall('VoteType')
    for county_el in vote_type_el
]

In [12]:
# Build a DataFrame from the list of per-row dicts; each dict key becomes a column.
df = pd.DataFrame(results)

In [13]:
# Preview the first rows to sanity-check the extraction.
df.head()

Unnamed: 0,candidate_party,county,race,vote_type,votes
0,PHYLLIS HATCHER (DEM),Henry,"State Senator, District 17",Election Day,1472
1,PHYLLIS HATCHER (DEM),Newton,"State Senator, District 17",Election Day,313
2,PHYLLIS HATCHER (DEM),Rockdale,"State Senator, District 17",Election Day,333
3,PHYLLIS HATCHER (DEM),Henry,"State Senator, District 17",Absentee by Mail,101
4,PHYLLIS HATCHER (DEM),Newton,"State Senator, District 17",Absentee by Mail,2


In [14]:
# (rows, columns) of the extracted table.
df.shape

(64, 5)

In [15]:
# Vote counts were read from XML attributes, so they arrive as strings;
# convert to integers so they can be summed. This raises if any value is
# missing (None) or not a valid integer literal.
df['votes'] = df['votes'].astype(int)

In [16]:
# Confirm column dtypes and non-null counts after the integer conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 5 columns):
candidate_party    64 non-null object
county             64 non-null object
race               64 non-null object
vote_type          64 non-null object
votes              64 non-null int64
dtypes: int64(1), object(4)
memory usage: 2.6+ KB


In [17]:
# Total votes per candidate/party label, summed over every county and vote type.
df.groupby(['candidate_party'])['votes'].sum()

candidate_party
BRIAN STRICKLAND (REP)      5003
ED TONEY (REP)               198
EL-MAHDI HOLLY (DEM)        1122
GEOFFREY CAUBLE (REP)       1864
LARRY K. MOREY (REP)         215
NELVA LEE (REP)               97
PHYLLIS HATCHER (DEM)       2774
TARJI LEONARD DUNN (DEM)     451
Name: votes, dtype: int64

In [18]:
# Total votes cast in each race, summed over all candidates, counties and vote types.
df.groupby(['race'])['votes'].sum()

race
State Representative, District 111    3652
State Senator, District 17            8072
Name: votes, dtype: int64

In [19]:
# Split labels such as "JANE DOE (DEM)" into a candidate name and a party.
# A regex replaces the fixed ``[:-5]`` slice so the parenthesised suffix may
# be any length (or absent) and no trailing space is left on the name.
PARTY_NAMES = {'REP': 'Republican', 'DEM': 'Democrat'}
label_parts = df['candidate_party'].str.extract(
    r'^\s*(.*?)\s*\(([^()]+)\)\s*$', expand=True
)

# Labels without a "(XXX)" suffix do not match the pattern; keep their full
# (stripped) text as the candidate name instead of truncating it.
df['candidate'] = label_parts[0].fillna(df['candidate_party'].str.strip())

# Known abbreviations map to full party names. Unknown abbreviations keep the
# raw code rather than becoming NaN, because rows with a NaN grouping key are
# dropped by groupby and their votes would silently disappear from totals.
# Labels with no suffix at all still get NaN here.
party_code = label_parts[1].str.strip()
df['party'] = party_code.str.upper().map(PARTY_NAMES).fillna(party_code)

In [20]:
# Race labels look like "<office>, District <n>". Split on commas once and
# derive both columns from the pieces: the first piece is the office, the
# second piece (with the word "District" removed) is the district.
# Surrounding whitespace is not removed here.
race_parts = df['race'].str.split(',')
df['office'] = race_parts.str.get(0)
df['district'] = race_parts.str.get(1).str.replace('District', '')

In [22]:
# Preview: collapse vote types by summing votes per county/office/district/candidate/party.
df.groupby(['county', 'office', 'district', 'candidate', 'party'], as_index=False)['votes'].sum()

Unnamed: 0,county,office,district,candidate,party,votes
0,Henry,State Representative,111,EL-MAHDI HOLLY,Democrat,1122
1,Henry,State Representative,111,GEOFFREY CAUBLE,Republican,1864
2,Henry,State Representative,111,LARRY K. MOREY,Republican,215
3,Henry,State Representative,111,TARJI LEONARD DUNN,Democrat,451
4,Henry,State Senator,17,BRIAN STRICKLAND,Republican,3637
5,Henry,State Senator,17,ED TONEY,Republican,160
6,Henry,State Senator,17,NELVA LEE,Republican,82
7,Henry,State Senator,17,PHYLLIS HATCHER,Democrat,1973
8,Newton,State Senator,17,BRIAN STRICKLAND,Republican,895
9,Newton,State Senator,17,ED TONEY,Republican,27


In [23]:
# Sum votes across vote types so there is one row per
# county/office/district/candidate/party combination.
# NOTE: this rebinds ``results`` (previously the list of extracted row dicts)
# to a DataFrame, so re-running the DataFrame-construction cell afterwards
# would build ``df`` from the aggregated frame instead of the raw rows.
group_keys = ['county', 'office', 'district', 'candidate', 'party']
results = df.groupby(group_keys, as_index=False)['votes'].sum()

In [27]:
# Trim leading/trailing whitespace left over from the string splitting steps.
for text_col in ('office', 'district', 'candidate'):
    results[text_col] = results[text_col].str.strip()

In [30]:
# Select the output columns in their final order, sort the rows, and copy the
# result to the system clipboard as comma-separated text without the index.
# ``district`` is a string column, so it sorts lexicographically.
sort_columns = ['office', 'district', 'candidate', 'county']
output_columns = ['county', 'office', 'district', 'party', 'candidate', 'votes']
ordered = results[output_columns].sort_values(by=sort_columns)
ordered.to_clipboard(index=False, sep=',')