-
Notifications
You must be signed in to change notification settings - Fork 95
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
First pass at implementing the datasource for Washington state. This requires a url_paths.csv file and a custom fetch class because some precinct-level results are archived in ZIP files. The custom fields in url_paths.csv needed to work around the data quirks are documented in openelex/us/wa/mappings/README.md. PDF results still need to be converted and there are some cases where there are multiple precinct-level files and it is unclear which one to use. Addresses #145
- Loading branch information
Showing
7 changed files
with
761 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from unittest import TestCase | ||
|
||
from openelex.us.wa.datasource import Datasource | ||
|
||
class TestDatasource(TestCase):
    """Unit tests for the Washington state Datasource."""

    def setUp(self):
        self.datasource = Datasource()

    def test_reporting_level_from_url(self):
        """URLs whose filename mentions "County" map to 'county'; all others to 'state'."""
        cases = [
            ("https://wei.sos.wa.gov/agency/osos/en/press_and_research/PreviousElections/2007/Primary/Documents/2007Prim%20Statewide%20Results_FINAL.xls", 'state'),
            ("https://wei.sos.wa.gov/agency/osos/en/press_and_research/PreviousElections/2007/Primary/Documents/2007Prim%20County%20Results.xls", 'county'),
        ]
        for url, expected_level in cases:
            self.assertEqual(
                self.datasource._reporting_level_from_url(url),
                expected_level)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,245 @@ | ||
from os.path import splitext | ||
import urlparse | ||
|
||
from openelex.base.datasource import BaseDatasource | ||
from openelex.lib import build_github_url | ||
from openelex.lib.text import ocd_type_id | ||
|
||
class Datasource(BaseDatasource):
    """Datasource for Washington state election results.

    Mappings are built from three different sources depending on the
    election year:

    * 2006 and earlier: preprocessed CSV results hosted on GitHub.
    * 2007-2011: predictable vote.wa.gov CSV URLs (per-county plus a
      statewide "AllState" file), supplemented by url_paths.csv entries.
    * 2012-2013: url_paths.csv entries only, because those result URLs
      contain an unpredictable numeric suffix.
    """

    def mappings(self, year=None):
        """Return a list of mapping dicts for all elections, or for one year.

        Args:
            year: Optional integer year used to filter elections.
        """
        mappings = []
        for yr, elecs in self.elections(year).items():
            mappings.extend(self._build_metadata(yr, elecs))
        return mappings

    def mappings_for_url(self, url):
        """Return every mapping whose ``raw_url`` equals ``url``.

        Multiple extracted files can come from a single ZIP archive URL,
        so this may return more than one mapping.
        """
        return [mapping for mapping in self.mappings() if mapping['raw_url'] == url]

    def filename_url_pairs(self, year=None):
        """Return (generated_filename, fetchable_url) pairs for all mappings."""
        return [(mapping['generated_filename'], self._url_for_fetch(mapping))
                for mapping in self.mappings(year)]

    def unprocessed_filename_url_pairs(self, year=None):
        """Return (filename, raw_url) pairs for results that still need preprocessing.

        These are the PDF results; the generated filename's ".csv"
        extension is swapped back to ".pdf" to reflect the raw download.
        """
        return [(mapping['generated_filename'].replace(".csv", ".pdf"), mapping['raw_url'])
                for mapping in self.mappings(year)
                if 'pre_processed_url' in mapping]

    def _url_for_fetch(self, mapping):
        """Return the URL to download: the preprocessed one when available."""
        try:
            return mapping['pre_processed_url']
        except KeyError:
            return mapping['raw_url']

    def _build_metadata(self, year, elections):
        """Dispatch to the correct mapping builder for each election.

        Raises:
            NotImplementedError: If an election doesn't match any of the
                known year/slug cases.
        """
        meta_entries = []

        for election in elections:
            slug = election['slug']
            year = int(election['start_date'].split('-')[0])

            if year <= 2006:
                meta_entries.extend(self._build_metadata_preprocessed(election))
            elif slug == 'wa-2007-08-21-primary':
                meta_entries.extend(self._build_metadata_direct_links(election))
            elif (slug == 'wa-2007-11-06-general' or
                    (year >= 2008 and year <= 2011)):
                if slug == 'wa-2011-08-16-primary':
                    # The 2011-08-16 election doesn't have any contests of interest for
                    # OpenElections
                    continue

                meta_entries.extend(self._build_metadata_state_county(election))
                meta_entries.extend(self._build_metadata_url_paths(election))

            elif year >= 2012 and year <= 2013:
                meta_entries.extend(self._build_metadata_url_paths(election))
            else:
                msg = ("Not sure how to define mappings for election {}. "
                       "Please update openelex.us.wa.datasource").format(slug)
                # BUGFIX: ``NotImplemented`` is a comparison sentinel, not an
                # exception class; raising it would produce a TypeError.
                raise NotImplementedError(msg)

        return meta_entries

    def _build_metadata_preprocessed(self, election):
        """Return election metadata for an election with preprocessed results"""
        generated_filename = self._standardized_filename(election,
            extension=".csv")
        return [
            {
                'generated_filename': generated_filename,
                'raw_url': build_github_url('wa', generated_filename),
                'ocd_id': 'ocd-division/country:us/state:wa',
                'name': "Washington",
                'election': election['slug'],
            }
        ]

    def _build_metadata_direct_links(self, election):
        """Return election metadata based on direct_links"""
        meta_entries = []

        for url in election['direct_links']:
            filename_kwargs = {
                'extension': self._filename_extension(url),
            }
            # Only include the reporting level in the filename when it
            # isn't the statewide default.
            reporting_level = self._reporting_level_from_url(url)
            if reporting_level != 'state':
                filename_kwargs['reporting_level'] = reporting_level
            generated_filename = self._standardized_filename(election,
                **filename_kwargs)
            meta_entries.append({
                'generated_filename': generated_filename,
                'raw_url': url,
                'ocd_id': 'ocd-division/country:us/state:wa',
                'name': "Washington",
                'election': election['slug'],
            })

        return meta_entries

    def _reporting_level_from_url(self, url):
        """Guess the reporting level ('county' or 'state') from a result URL's filename."""
        parts = urlparse.urlparse(url)
        root, ext = splitext(parts.path)
        root_lower = root.lower()
        if "county" in root_lower:
            return 'county'
        else:
            return 'state'

    def _state_county_csv_results_url(self, election, name):
        """Build a vote.wa.gov CSV results URL for a county (or "AllState")."""
        url_tpl = "http://vote.wa.gov/results/{}/export/{}_{}.csv"
        date_str = election['start_date'].replace('-', '')
        # County names in URLs have spaces removed, e.g. "Walla Walla" ->
        # "WallaWalla".
        return url_tpl.format(date_str, date_str, name.replace(' ', ''))

    def _build_metadata_state_county(self, election, extra_statewide=None,
            office=None):
        """
        Generate mappings for the statewide and county CSV files.

        This method builds mappings for elections from 2007-2011 that
        have URLs like

        http://vote.wa.gov/results/YYYYMMDD/export/YYYYMMDD_CountyName.csv

        Elections starting in 2012 have very similar results portals. They
        also provide all county results in a single CSV. Finally, they
        provide precinct-level CSV data for some counties. Unfortunately,
        the URLs have a trailing numeric identifier, which doesn't seem to be
        able to be predetermined. For example the "1451" in

        http://vote.wa.gov/results/20121106/export/20121106_AllCounties_20121205_1451.csv

        Just handle these in url_paths.csv.

        Args:
            election: Election dict as returned by the Metadata API.
            extra_statewide: Array of extra names of statewide files.
            office: Office slug if the results are for a single office, e.g. the
                Presidential primary.
        """
        meta_entries = []

        for county in self._counties():
            generated_filename = self._standardized_filename(election,
                extension=".csv", reporting_level='county',
                jurisdiction=county['name'], office=office)
            meta_entries.append({
                'generated_filename': generated_filename,
                'raw_url': self._state_county_csv_results_url(election, county['name']),
                'ocd_id': county['ocd_id'],
                'name': county['name'],
                'election': election['slug'],
            })

        # There's also a statewide results file that uses the same
        # URL format, but uses "AllState" instead of the county name.
        # Include it in the mappings also.
        if extra_statewide is None:
            extra_statewide = ["AllState"]
        else:
            extra_statewide.append("AllState")

        for name in extra_statewide:
            filename_kwargs = {
                'extension': ".csv",
                'office': office,
            }
            meta_entries.append({
                'generated_filename': self._standardized_filename(election,
                    **filename_kwargs),
                'raw_url': self._state_county_csv_results_url(election, name),
                'ocd_id': 'ocd-division/country:us/state:wa',
                'name': "Washington",
                'election': election['slug'],
            })

        return meta_entries

    def _parse_url_path(self, row):
        """Parse a url_paths.csv row, converting the ``skip`` flag to a boolean."""
        clean_row = super(Datasource, self)._parse_url_path(row)
        # Convert "TRUE" strings to boolean.  Use .get() so a missing
        # ``skip`` column is treated as "don't skip" rather than raising
        # a KeyError.
        clean_row['skip'] = clean_row.get('skip', '').upper() == "TRUE"
        return clean_row

    def _build_metadata_url_paths(self, election):
        """Return mappings for result files from url_paths.csv"""
        meta_entries = []
        # Exclude paths with the ``skip`` flag set in the mappings
        url_paths = [url_path for url_path in self._url_paths_for_election(election)
                     if not url_path['skip']]

        for url_path in url_paths:
            pdf_result = False
            filename_ext = self._filename_extension_for_url_path(url_path)
            # We'll eventually preprocess PDFs and convert them to CSVs.
            # So, the downloaded file will be a CSV.  Set the filename
            # extension accordingly.
            if filename_ext == ".pdf":
                filename_ext = ".csv"
                pdf_result = True

            filename_kwargs = {
                'extension': filename_ext,
                'reporting_level': url_path['reporting_level'],
                'jurisdiction': url_path['jurisdiction'],
                'party': url_path['party'],
            }
            generated_filename = self._standardized_filename(election,
                **filename_kwargs)

            mapping = {
                'generated_filename': generated_filename,
                'raw_url': url_path['url'],
                'ocd_id': self._ocd_id_for_url_path(url_path),
                'name': url_path['jurisdiction'],
                'election': election['slug'],
                'raw_extracted_filename': url_path['raw_extracted_filename'],
                'parent_zipfile': url_path['parent_zipfile'],
            }

            # PDF results are preprocessed into CSVs hosted on GitHub;
            # record that URL so fetching prefers the converted file.
            if pdf_result:
                mapping['pre_processed_url'] = build_github_url(self.state,
                    generated_filename)

            meta_entries.append(mapping)

        return meta_entries

    def _filename_extension_for_url_path(self, url_path):
        """Return the filename extension (e.g. ".csv") for a url_paths.csv row."""
        # By default, just return an extension from the filename part of the
        # URL
        path = url_path['url']
        # But if we have to extract the filename from a zip file, use the
        # extracted filename's extension.
        if url_path['raw_extracted_filename']:
            path = url_path['raw_extracted_filename']
        return self._filename_extension(path)

    def _ocd_id_for_url_path(self, url_path):
        """Return the OCD division ID for a url_paths.csv row.

        This method is needed because there can be a url path for either
        a single, statewide file or a file that contains results for only
        one county.
        """
        ocd_id = "ocd-division/country:us/state:wa"
        if url_path['jurisdiction']:
            # A jurisdiction is specified, which means that results are
            # broken down per-county
            ocd_id = "{}/county:{}".format(ocd_id, ocd_type_id(url_path['jurisdiction']))
        return ocd_id
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
import os.path | ||
import urlparse | ||
from zipfile import ZipFile | ||
|
||
from openelex.base.fetch import BaseFetcher | ||
from openelex.us.wa.datasource import Datasource | ||
|
||
class FetchResults(BaseFetcher):
    """Fetcher for Washington state results.

    Extends the base fetcher to handle results distributed as ZIP
    archives, including archives that themselves contain nested ZIP
    files.
    """

    def __init__(self):
        super(FetchResults, self).__init__()
        # URLs already downloaded during this run; lets us skip repeat
        # downloads when several mappings share one archive URL.
        self._fetched = set()
        # We need access to the state datasource to be able to retrieve
        # mappings for a specific URL in the case of zip files since multiple
        # extracted files will come from the same URL.
        self._datasource = Datasource()

    def fetch(self, url, fname=None, overwrite=False):
        """Download ``url`` into the cache.

        ZIP URLs are downloaded once and then have their relevant members
        extracted into the cache; other URLs are delegated to the base
        fetcher unchanged.

        Args:
            url: Result file URL to download.
            fname: Local filename for non-ZIP downloads (passed through to
                the base fetcher; unused for ZIP URLs).
            overwrite: If True, re-download/re-extract even when a cached
                copy exists.
        """
        # We keep track of URLs we've already fetched in this run since
        # there will be multiple output files mapped to a single zip
        # file.  If we've already fetched this URL, exit early.
        if url in self._fetched:
            return

        if url.endswith('.zip'):
            # Fetch the zip file, using the automatically generated filename
            zip_fname = self._local_zip_file_name(url)
            super(FetchResults, self).fetch(url, zip_fname, overwrite)
            self._extract_zip(url, zip_fname, overwrite)
        else:
            super(FetchResults, self).fetch(url, fname, overwrite)

        self._fetched.add(url)

    def _local_zip_file_name(self, url):
        """
        Return a normalized local file name for a results zip file.

        We don't care too much about the format because we can delete the
        zip file later.
        """
        parsed = urlparse.urlsplit(url)
        fname = parsed.path.split('/')[-1]
        return os.path.join(self.cache.abspath, fname)

    def _extract_zip(self, url, zip_fname=None, overwrite=False, remove=True):
        """Extract the cached result files from a downloaded ZIP archive.

        For each datasource mapping tied to ``url``, extracts the mapped
        member (descending into a nested ZIP when the mapping names a
        ``parent_zipfile``) and renames it to the mapping's generated
        filename in the cache directory.

        Args:
            url: Archive URL; used to look up mappings.
            zip_fname: Local path of the downloaded archive.  Derived from
                ``url`` when None.
            overwrite: If True, re-extract even when the target file exists.
            remove: If True, delete the downloaded archive afterward.
        """
        if zip_fname is None:
            zip_fname = self._local_zip_file_name(url)

        with ZipFile(zip_fname, 'r') as zipf:
            for mapping in self._datasource.mappings_for_url(url):
                local_file_name = os.path.join(self.cache.abspath,
                    mapping['generated_filename'])
                if overwrite or not os.path.exists(local_file_name):
                    if mapping['parent_zipfile']:
                        # The downloaded ZIP archive contains zip files.  We
                        # need to extract the nested zip file.
                        zipf.extract(mapping['parent_zipfile'],
                            self.cache.abspath)
                        parent_zipfile_path = os.path.join(self.cache.abspath,
                            mapping['parent_zipfile'])
                        with ZipFile(parent_zipfile_path, 'r') as parent_zipf:
                            parent_zipf.extract(mapping['raw_extracted_filename'],
                                self.cache.abspath)
                            # TODO: Delete the nested zip file?

                    else:
                        zipf.extract(mapping['raw_extracted_filename'],
                            self.cache.abspath)
                    # Rename the extracted member to the standardized
                    # generated filename.
                    extracted_file_name = os.path.join(self.cache.abspath,
                        mapping['raw_extracted_filename'])
                    os.rename(extracted_file_name, local_file_name)
                    print "Added to cache: %s" % local_file_name
                else:
                    print "File is cached: %s" % local_file_name

        if remove:
            os.remove(zip_fname)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
## url\_paths.csv | ||
|
||
In addition to the fields found in other states, I added some extra columns. | ||
|
||
The main reason for these fields is that there are some elections where there are precinct-level results, but for many counties, there aren't any offices of interest. I wanted to have a record that the files existed, but also a mechanism to exclude them from the datasource mappings. | ||
|
||
The additional columns are: | ||
|
||
* filename: Raw filename. This was added in case we need to regenerate the URLs somehow or to avoid URL parsing. | ||
* has\_statwide\_results: Does this file contain statewide results that OpenElections is interested in? | ||
* skip: Should this file be skipped when defining datasource mappings? | ||
* needs\_preprocessing: File needs to be preprocessed before it can be loaded, usually because it's a PDF. | ||
* raw\_extracted\_filename: File within an archive that will ultimately be extracted and saved to the cache. | ||
* parent\_zipfile: Some results zips have two levels of zip files. We need to know the extracted filename's parent zip archive to be able to properly cache the file. | ||
|
||
|
Oops, something went wrong.