Datasource for Washington
First pass at implementing the datasource for Washington state.

This requires a url_paths.csv file and a custom fetch class
because some precinct-level results are archived in ZIP files.

The custom fields in url_paths.csv needed to work around the
data quirks are documented in openelex/us/wa/mappings/README.md.

PDF results still need to be converted, and there are some cases
with multiple precinct-level files where it is unclear which one
to use.

Addresses #145
ghing committed Jun 5, 2014
1 parent fbc7031 commit 57d87b6
Showing 7 changed files with 761 additions and 0 deletions.
17 changes: 17 additions & 0 deletions openelex/tests/test_wa_datasource.py
@@ -0,0 +1,17 @@
from unittest import TestCase

from openelex.us.wa.datasource import Datasource

class TestDatasource(TestCase):
def setUp(self):
self.datasource = Datasource()

def test_reporting_level_from_url(self):
urls = [
("https://wei.sos.wa.gov/agency/osos/en/press_and_research/PreviousElections/2007/Primary/Documents/2007Prim%20Statewide%20Results_FINAL.xls", 'state'),
("https://wei.sos.wa.gov/agency/osos/en/press_and_research/PreviousElections/2007/Primary/Documents/2007Prim%20County%20Results.xls", 'county'),
]

for url, expected in urls:
reporting_level = self.datasource._reporting_level_from_url(url)
self.assertEqual(reporting_level, expected)
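
For reference, one way to run just this test case (a sketch using the stdlib runner; the project's usual test command may differ):

from unittest import TestLoader, TextTestRunner

from openelex.tests import test_wa_datasource

# Load and run only the Washington datasource tests.
suite = TestLoader().loadTestsFromModule(test_wa_datasource)
TextTestRunner(verbosity=2).run(suite)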
Empty file added openelex/us/wa/__init__.py
Empty file.
245 changes: 245 additions & 0 deletions openelex/us/wa/datasource.py
@@ -0,0 +1,245 @@
from os.path import splitext
import urlparse

from openelex.base.datasource import BaseDatasource
from openelex.lib import build_github_url
from openelex.lib.text import ocd_type_id

class Datasource(BaseDatasource):
def mappings(self, year=None):
mappings = []
for yr, elecs in self.elections(year).items():
mappings.extend(self._build_metadata(yr, elecs))
return mappings

def mappings_for_url(self, url):
return [mapping for mapping in self.mappings() if mapping['raw_url'] == url]

def filename_url_pairs(self, year=None):
return [(mapping['generated_filename'], self._url_for_fetch(mapping))
for mapping in self.mappings(year)]

def unprocessed_filename_url_pairs(self, year=None):
return [(mapping['generated_filename'].replace(".csv", ".pdf"), mapping['raw_url'])
for mapping in self.mappings(year)
if 'pre_processed_url' in mapping]

def _url_for_fetch(self, mapping):
try:
return mapping['pre_processed_url']
except KeyError:
return mapping['raw_url']

def _build_metadata(self, year, elections):
meta_entries = []

for election in elections:
slug = election['slug']
year = int(election['start_date'].split('-')[0])

if year <= 2006:
meta_entries.extend(self._build_metadata_preprocessed(election))
elif slug == 'wa-2007-08-21-primary':
meta_entries.extend(self._build_metadata_direct_links(election))
elif (slug == 'wa-2007-11-06-general' or
(year >= 2008 and year <= 2011)):
if slug == 'wa-2011-08-16-primary':
# The 2011-08-16 election doesn't have any contests of interest for
# OpenElections
continue

meta_entries.extend(self._build_metadata_state_county(election))
meta_entries.extend(self._build_metadata_url_paths(election))

elif year >= 2012 and year <= 2013:
meta_entries.extend(self._build_metadata_url_paths(election))
else:
msg = ("Not sure how to define mappings for election {}. "
"Please update openelex.us.wa.datasource").format(slug)
            raise NotImplementedError(msg)

return meta_entries

def _build_metadata_preprocessed(self, election):
"""Return election metadata for an election with preprocessed results"""
generated_filename = self._standardized_filename(election,
extension=".csv")
return [
{
'generated_filename': generated_filename,
'raw_url': build_github_url('wa', generated_filename),
'ocd_id': 'ocd-division/country:us/state:wa',
'name': "Washington",
'election': election['slug'],
}
]

def _build_metadata_direct_links(self, election):
"""Return election metadata based on direct_links"""
meta_entries = []

for url in election['direct_links']:
filename_kwargs = {
'extension': self._filename_extension(url),
}
reporting_level = self._reporting_level_from_url(url)
if reporting_level != 'state':
filename_kwargs['reporting_level'] = reporting_level
generated_filename = self._standardized_filename(election,
**filename_kwargs)
meta_entries.append({
'generated_filename': generated_filename,
'raw_url': url,
'ocd_id': 'ocd-division/country:us/state:wa',
'name': "Washington",
'election': election['slug'],
})

return meta_entries

def _reporting_level_from_url(self, url):
parts = urlparse.urlparse(url)
root, ext = splitext(parts.path)
root_lower = root.lower()
if "county" in root_lower:
return 'county'
else:
return 'state'

def _state_county_csv_results_url(self, election, name):
url_tpl = "http://vote.wa.gov/results/{}/export/{}_{}.csv"
date_str = election['start_date'].replace('-', '')
return url_tpl.format(date_str, date_str, name.replace(' ', ''))
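    # For reference, a hypothetical call (election dict trimmed to the one
    # field this method reads):
    #   self._state_county_csv_results_url({'start_date': '2008-11-04'}, 'Walla Walla')
    # returns:
    #   http://vote.wa.gov/results/20081104/export/20081104_WallaWalla.csv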

def _build_metadata_state_county(self, election, extra_statewide=None,
office=None):
"""
Generate mappings for the statewide and county CSV files.
This method builds mappings for elections from 2007-2011 that
have URLs like
http://vote.wa.gov/results/YYYYMMDD/export/YYYYMMDD_CountyName.csv
Elections starting in 2012 have very similar results portals. They
also provide all county results in a single CSV. Finally, they
provide precinct-level CSV data for some counties. Unfortunately,
the URLs have a trailing numeric identifier, which doesn't seem to be
able to be predetermined. For example the "1451" in
http://vote.wa.gov/results/20121106/export/20121106_AllCounties_20121205_1451.csv
Just handle these in url_paths.csv.
Args:
election: Election dict as returned by the Metadata API.
extra_statewide: Array of extra names of statewide files.
office: Office slug if the results are for a single office, e.g. the
Presidential primary.
"""
meta_entries = []

for county in self._counties():
generated_filename = self._standardized_filename(election,
extension=".csv", reporting_level='county',
jurisdiction=county['name'], office=office)
meta_entries.append({
'generated_filename': generated_filename,
'raw_url': self._state_county_csv_results_url(election, county['name']),
'ocd_id': county['ocd_id'],
'name': county['name'],
'election': election['slug'],
})

# There's also a statewide results file that uses the same
# URL format, but uses "AllState" instead of the county name.
# Include it in the mappings also.
if extra_statewide is None:
extra_statewide = ["AllState"]
else:
extra_statewide.append("AllState")

for name in extra_statewide:
filename_kwargs = {
'extension': ".csv",
'office': office,
}
meta_entries.append({
'generated_filename': self._standardized_filename(election,
**filename_kwargs),
'raw_url': self._state_county_csv_results_url(election, name),
'ocd_id': 'ocd-division/country:us/state:wa',
'name': "Washington",
'election': election['slug'],
})

return meta_entries

def _parse_url_path(self, row):
clean_row = super(Datasource, self)._parse_url_path(row)
# Convert "TRUE" strings to boolean
clean_row['skip'] = clean_row['skip'].upper() == "TRUE"
return clean_row

def _build_metadata_url_paths(self, election):
"""Return mappings for result files from url_paths.csv"""
meta_entries = []
# Exclude paths with the ``skip`` flag set in the mappings
url_paths = [url_path for url_path in self._url_paths_for_election(election)
if not url_path['skip']]

for url_path in url_paths:
pdf_result = False
filename_ext = self._filename_extension_for_url_path(url_path)
# We'll eventually preprocess PDFs and convert them to CSVs.
# So, the downloaded file will be a CSV. Set the filename
# extension accordingly.
if filename_ext == ".pdf":
filename_ext = ".csv"
pdf_result = True

filename_kwargs = {
'extension': filename_ext,
'reporting_level': url_path['reporting_level'],
'jurisdiction': url_path['jurisdiction'],
'party': url_path['party'],
}
generated_filename = self._standardized_filename(election,
**filename_kwargs)

mapping = {
'generated_filename': generated_filename,
'raw_url': url_path['url'],
'ocd_id': self._ocd_id_for_url_path(url_path),
'name': url_path['jurisdiction'],
'election': election['slug'],
'raw_extracted_filename': url_path['raw_extracted_filename'],
'parent_zipfile': url_path['parent_zipfile'],
}

if pdf_result:
mapping['pre_processed_url'] = build_github_url(self.state,
generated_filename)

meta_entries.append(mapping)

return meta_entries

def _filename_extension_for_url_path(self, url_path):
# By default, just return an extension from the filename part of the
# URL
path = url_path['url']
# But if we have to extract the filename from a zip file, use the
# extracted filename's extension.
if url_path['raw_extracted_filename']:
path = url_path['raw_extracted_filename']
return self._filename_extension(path)

def _ocd_id_for_url_path(self, url_path):
# This method is needed because there can be a url path for either
# a single, statewide file or a file that contains results for only
# one county.
ocd_id = "ocd-division/country:us/state:wa"
if url_path['jurisdiction']:
# A jurisdiction is specified, which means that results are
# broken down per-county
ocd_id = "{}/county:{}".format(ocd_id, ocd_type_id(url_path['jurisdiction']))
return ocd_id
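
For context, a minimal sketch of how these mappings get consumed downstream (the year argument and the printing are illustrative, not part of this commit):

from openelex.us.wa.datasource import Datasource

datasource = Datasource()
# Each mapping is a dict with 'generated_filename', 'raw_url', 'ocd_id',
# 'name', and 'election' keys; PDF-backed results also carry a
# 'pre_processed_url' pointing at the converted CSV on GitHub.
for mapping in datasource.mappings(2012):
    print mapping['generated_filename'], mapping['raw_url']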
77 changes: 77 additions & 0 deletions openelex/us/wa/fetch.py
@@ -0,0 +1,77 @@
import os.path
import urlparse
from zipfile import ZipFile

from openelex.base.fetch import BaseFetcher
from openelex.us.wa.datasource import Datasource

class FetchResults(BaseFetcher):
def __init__(self):
super(FetchResults, self).__init__()
self._fetched = set()
# We need access to the state datasource to be able to retrieve
# mappings for a specific URL in the case of zip files since multiple
# extracted files will come from the same URL.
self._datasource = Datasource()

def fetch(self, url, fname=None, overwrite=False):
# We keep track of URLs we've already fetched in this run since
# there will be multiple output files mapped to a single zip
# file. If we've already fetched this URL, exit early.
if url in self._fetched:
return

if url.endswith('.zip'):
# Fetch the zip file, using the automatically generated filename
zip_fname = self._local_zip_file_name(url)
super(FetchResults, self).fetch(url, zip_fname, overwrite)
self._extract_zip(url, zip_fname, overwrite)
else:
super(FetchResults, self).fetch(url, fname, overwrite)

self._fetched.add(url)

def _local_zip_file_name(self, url):
"""
Return a normalized local file name for a results zip file.
We don't care too much about the format because we can delete the
zip file later.
"""
parsed = urlparse.urlsplit(url)
fname = parsed.path.split('/')[-1]
return os.path.join(self.cache.abspath, fname)

def _extract_zip(self, url, zip_fname=None, overwrite=False, remove=True):
if zip_fname is None:
zip_fname = self._local_zip_file_name(url)

with ZipFile(zip_fname, 'r') as zipf:
for mapping in self._datasource.mappings_for_url(url):
local_file_name = os.path.join(self.cache.abspath,
mapping['generated_filename'])
if overwrite or not os.path.exists(local_file_name):
if mapping['parent_zipfile']:
# The downloaded ZIP archive contains zip files. We
# need to extract the nested zip file.
zipf.extract(mapping['parent_zipfile'],
self.cache.abspath)
parent_zipfile_path = os.path.join(self.cache.abspath,
mapping['parent_zipfile'])
with ZipFile(parent_zipfile_path, 'r') as parent_zipf:
parent_zipf.extract(mapping['raw_extracted_filename'],
self.cache.abspath)
# TODO: Delete the nested zip file?

else:
zipf.extract(mapping['raw_extracted_filename'],
self.cache.abspath)
extracted_file_name = os.path.join(self.cache.abspath,
mapping['raw_extracted_filename'])
os.rename(extracted_file_name, local_file_name)
print "Added to cache: %s" % local_file_name
else:
print "File is cached: %s" % local_file_name

if remove:
os.remove(zip_fname)
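
A sketch of how the fetcher and datasource fit together (a hypothetical driver loop; in practice the project's fetch task wires these up):

from openelex.us.wa.datasource import Datasource
from openelex.us.wa.fetch import FetchResults

fetcher = FetchResults()
# Zip URLs are downloaded once, and every mapped member file is extracted
# into the cache; non-zip URLs are fetched directly under their
# standardized filenames.
for fname, url in Datasource().filename_url_pairs(2012):
    fetcher.fetch(url, fname)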
16 changes: 16 additions & 0 deletions openelex/us/wa/mappings/README.md
@@ -0,0 +1,16 @@
## url\_paths.csv

In addition to the fields found in other states, I added some extra columns.

The main reason for these fields is that some elections have precinct-level results files that, for many counties, don't cover any offices of interest. I wanted a record that the files exist, along with a mechanism to exclude them from the datasource mappings.

The additional columns are:

* filename: Raw filename. Added in case we need to regenerate the URLs, or to avoid parsing the filename out of the URL.
* has\_statwide\_results: Does this file contain statewide results that OpenElections is interested in?
* skip: Should this file be skipped when defining datasource mappings?
* needs\_preprocessing: File needs to be preprocessed before it can be loaded, usually because it's a PDF.
* raw\_extracted\_filename: File within an archive that will ultimately be extracted and saved to the cache.
* parent\_zipfile: Some results archives contain two levels of zip files. We need to know the extracted filename's parent zip archive to properly cache the file.
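
For illustration, here's a hypothetical row exercising the custom columns (the standard columns are omitted, and the URL and values below are made up; see the actual url\_paths.csv for the real layout):

    url,reporting_level,jurisdiction,filename,has_statwide_results,skip,needs_preprocessing,raw_extracted_filename,parent_zipfile
    http://example.com/2012Gen_Precincts.zip,precinct,Adams,2012Gen_Precincts.zip,FALSE,TRUE,FALSE,Adams_Precincts.txt,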

