Datasource for Washington
First pass at implementing the datasource for Washington state.

This requires a url_paths.csv file and a custom fetch class
because some precinct-level results are archived in ZIP files.

The custom fields in url_paths.csv needed to work around the
data quirks are documented in openelex/us/wa/mappings/README.md.

PDF results still need to be converted, and there are some cases
with multiple precinct-level files where it is unclear which one
to use.

Addresses #145
ghing committed Jun 5, 2014
1 parent fbc7031 commit 57d87b6
Showing 7 changed files with 761 additions and 0 deletions.
17 changes: 17 additions & 0 deletions openelex/tests/test_wa_datasource.py
@@ -0,0 +1,17 @@
from unittest import TestCase

from openelex.us.wa.datasource import Datasource

class TestDatasource(TestCase):
def setUp(self):
self.datasource = Datasource()

def test_reporting_level_from_url(self):
urls = [
("https://wei.sos.wa.gov/agency/osos/en/press_and_research/PreviousElections/2007/Primary/Documents/2007Prim%20Statewide%20Results_FINAL.xls", 'state'),
("https://wei.sos.wa.gov/agency/osos/en/press_and_research/PreviousElections/2007/Primary/Documents/2007Prim%20County%20Results.xls", 'county'),
]

for url, expected in urls:
reporting_level = self.datasource._reporting_level_from_url(url)
self.assertEqual(reporting_level, expected)
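
For reference, one way to run just this test case (a sketch using the stdlib runner; the project's usual test command may differ):

from unittest import TestLoader, TextTestRunner

from openelex.tests import test_wa_datasource

# Load and run only the Washington datasource tests.
suite = TestLoader().loadTestsFromModule(test_wa_datasource)
TextTestRunner(verbosity=2).run(suite)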
Empty file added openelex/us/wa/__init__.py
Empty file.
245 changes: 245 additions & 0 deletions openelex/us/wa/datasource.py
@@ -0,0 +1,245 @@
from os.path import splitext
import urlparse

from openelex.base.datasource import BaseDatasource
from openelex.lib import build_github_url
from openelex.lib.text import ocd_type_id

class Datasource(BaseDatasource):
def mappings(self, year=None):
mappings = []
for yr, elecs in self.elections(year).items():
mappings.extend(self._build_metadata(yr, elecs))
return mappings

def mappings_for_url(self, url):
return [mapping for mapping in self.mappings() if mapping['raw_url'] == url]

def filename_url_pairs(self, year=None):
return [(mapping['generated_filename'], self._url_for_fetch(mapping))
for mapping in self.mappings(year)]

def unprocessed_filename_url_pairs(self, year=None):
return [(mapping['generated_filename'].replace(".csv", ".pdf"), mapping['raw_url'])
for mapping in self.mappings(year)
if 'pre_processed_url' in mapping]

def _url_for_fetch(self, mapping):
try:
return mapping['pre_processed_url']
except KeyError:
return mapping['raw_url']

def _build_metadata(self, year, elections):
meta_entries = []

for election in elections:
slug = election['slug']
year = int(election['start_date'].split('-')[0])

if year <= 2006:
meta_entries.extend(self._build_metadata_preprocessed(election))
elif slug == 'wa-2007-08-21-primary':
meta_entries.extend(self._build_metadata_direct_links(election))
elif (slug == 'wa-2007-11-06-general' or
(year >= 2008 and year <= 2011)):
if slug == 'wa-2011-08-16-primary':
# The 2011-08-16 election doesn't have any contests of interest for
# OpenElections
continue

meta_entries.extend(self._build_metadata_state_county(election))
meta_entries.extend(self._build_metadata_url_paths(election))

elif year >= 2012 and year <= 2013:
meta_entries.extend(self._build_metadata_url_paths(election))
else:
msg = ("Not sure how to define mappings for election {}. "
"Please update openelex.us.wa.datasource").format(slug)
            raise NotImplementedError(msg)

return meta_entries

def _build_metadata_preprocessed(self, election):
"""Return election metadata for an election with preprocessed results"""
generated_filename = self._standardized_filename(election,
extension=".csv")
return [
{
'generated_filename': generated_filename,
'raw_url': build_github_url('wa', generated_filename),
'ocd_id': 'ocd-division/country:us/state:wa',
'name': "Washington",
'election': election['slug'],
}
]

def _build_metadata_direct_links(self, election):
"""Return election metadata based on direct_links"""
meta_entries = []

for url in election['direct_links']:
filename_kwargs = {
'extension': self._filename_extension(url),
}
reporting_level = self._reporting_level_from_url(url)
if reporting_level != 'state':
filename_kwargs['reporting_level'] = reporting_level
generated_filename = self._standardized_filename(election,
**filename_kwargs)
meta_entries.append({
'generated_filename': generated_filename,
'raw_url': url,
'ocd_id': 'ocd-division/country:us/state:wa',
'name': "Washington",
'election': election['slug'],
})

return meta_entries

def _reporting_level_from_url(self, url):
parts = urlparse.urlparse(url)
root, ext = splitext(parts.path)
root_lower = root.lower()
if "county" in root_lower:
return 'county'
else:
return 'state'

def _state_county_csv_results_url(self, election, name):
url_tpl = "http://vote.wa.gov/results/{}/export/{}_{}.csv"
date_str = election['start_date'].replace('-', '')
return url_tpl.format(date_str, date_str, name.replace(' ', ''))
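    # For reference, a hypothetical call (election dict trimmed to the one
    # field this method reads):
    #   self._state_county_csv_results_url({'start_date': '2008-11-04'}, 'Walla Walla')
    # returns:
    #   http://vote.wa.gov/results/20081104/export/20081104_WallaWalla.csv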

def _build_metadata_state_county(self, election, extra_statewide=None,
office=None):
"""
Generate mappings for the statewide and county CSV files.
This method builds mappings for elections from 2007-2011 that
have URLs like
http://vote.wa.gov/results/YYYYMMDD/export/YYYYMMDD_CountyName.csv
Elections starting in 2012 have very similar results portals. They
also provide all county results in a single CSV. Finally, they
provide precinct-level CSV data for some counties. Unfortunately,
the URLs have a trailing numeric identifier, which doesn't seem to be
able to be predetermined. For example the "1451" in
http://vote.wa.gov/results/20121106/export/20121106_AllCounties_20121205_1451.csv
Just handle these in url_paths.csv.
Args:
election: Election dict as returned by the Metadata API.
extra_statewide: Array of extra names of statewide files.
office: Office slug if the results are for a single office, e.g. the
Presidential primary.
"""
meta_entries = []

for county in self._counties():
generated_filename = self._standardized_filename(election,
extension=".csv", reporting_level='county',
jurisdiction=county['name'], office=office)
meta_entries.append({
'generated_filename': generated_filename,
'raw_url': self._state_county_csv_results_url(election, county['name']),
'ocd_id': county['ocd_id'],
'name': county['name'],
'election': election['slug'],
})

# There's also a statewide results file that uses the same
# URL format, but uses "AllState" instead of the county name.
# Include it in the mappings also.
if extra_statewide is None:
extra_statewide = ["AllState"]
else:
extra_statewide.append("AllState")

for name in extra_statewide:
filename_kwargs = {
'extension': ".csv",
'office': office,
}
meta_entries.append({
'generated_filename': self._standardized_filename(election,
**filename_kwargs),
'raw_url': self._state_county_csv_results_url(election, name),
'ocd_id': 'ocd-division/country:us/state:wa',
'name': "Washington",
'election': election['slug'],
})

return meta_entries

def _parse_url_path(self, row):
clean_row = super(Datasource, self)._parse_url_path(row)
# Convert "TRUE" strings to boolean
clean_row['skip'] = clean_row['skip'].upper() == "TRUE"
return clean_row

def _build_metadata_url_paths(self, election):
"""Return mappings for result files from url_paths.csv"""
meta_entries = []
# Exclude paths with the ``skip`` flag set in the mappings
url_paths = [url_path for url_path in self._url_paths_for_election(election)
if not url_path['skip']]

for url_path in url_paths:
pdf_result = False
filename_ext = self._filename_extension_for_url_path(url_path)
# We'll eventually preprocess PDFs and convert them to CSVs.
# So, the downloaded file will be a CSV. Set the filename
# extension accordingly.
if filename_ext == ".pdf":
filename_ext = ".csv"
pdf_result = True

filename_kwargs = {
'extension': filename_ext,
'reporting_level': url_path['reporting_level'],
'jurisdiction': url_path['jurisdiction'],
'party': url_path['party'],
}
generated_filename = self._standardized_filename(election,
**filename_kwargs)

mapping = {
'generated_filename': generated_filename,
'raw_url': url_path['url'],
'ocd_id': self._ocd_id_for_url_path(url_path),
'name': url_path['jurisdiction'],
'election': election['slug'],
'raw_extracted_filename': url_path['raw_extracted_filename'],
'parent_zipfile': url_path['parent_zipfile'],
}

if pdf_result:
mapping['pre_processed_url'] = build_github_url(self.state,
generated_filename)

meta_entries.append(mapping)

return meta_entries

def _filename_extension_for_url_path(self, url_path):
# By default, just return an extension from the filename part of the
# URL
path = url_path['url']
# But if we have to extract the filename from a zip file, use the
# extracted filename's extension.
if url_path['raw_extracted_filename']:
path = url_path['raw_extracted_filename']
return self._filename_extension(path)

def _ocd_id_for_url_path(self, url_path):
# This method is needed because there can be a url path for either
# a single, statewide file or a file that contains results for only
# one county.
ocd_id = "ocd-division/country:us/state:wa"
if url_path['jurisdiction']:
# A jurisdiction is specified, which means that results are
# broken down per-county
ocd_id = "{}/county:{}".format(ocd_id, ocd_type_id(url_path['jurisdiction']))
return ocd_id
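
For context, a minimal sketch of how these mappings get consumed downstream (the year argument and the printing are illustrative, not part of this commit):

from openelex.us.wa.datasource import Datasource

datasource = Datasource()
# Each mapping is a dict with 'generated_filename', 'raw_url', 'ocd_id',
# 'name', and 'election' keys; PDF-backed results also carry a
# 'pre_processed_url' pointing at the converted CSV on GitHub.
for mapping in datasource.mappings(2012):
    print mapping['generated_filename'], mapping['raw_url']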
77 changes: 77 additions & 0 deletions openelex/us/wa/fetch.py
@@ -0,0 +1,77 @@
import os.path
import urlparse
from zipfile import ZipFile

from openelex.base.fetch import BaseFetcher
from openelex.us.wa.datasource import Datasource

class FetchResults(BaseFetcher):
def __init__(self):
super(FetchResults, self).__init__()
self._fetched = set()
# We need access to the state datasource to be able to retrieve
# mappings for a specific URL in the case of zip files since multiple
# extracted files will come from the same URL.
self._datasource = Datasource()

def fetch(self, url, fname=None, overwrite=False):
# We keep track of URLs we've already fetched in this run since
# there will be multiple output files mapped to a single zip
# file. If we've already fetched this URL, exit early.
if url in self._fetched:
return

if url.endswith('.zip'):
# Fetch the zip file, using the automatically generated filename
zip_fname = self._local_zip_file_name(url)
super(FetchResults, self).fetch(url, zip_fname, overwrite)
self._extract_zip(url, zip_fname, overwrite)
else:
super(FetchResults, self).fetch(url, fname, overwrite)

self._fetched.add(url)

def _local_zip_file_name(self, url):
"""
Return a normalized local file name for a results zip file.
We don't care too much about the format because we can delete the
zip file later.
"""
parsed = urlparse.urlsplit(url)
fname = parsed.path.split('/')[-1]
return os.path.join(self.cache.abspath, fname)

def _extract_zip(self, url, zip_fname=None, overwrite=False, remove=True):
if zip_fname is None:
zip_fname = self._local_zip_file_name(url)

with ZipFile(zip_fname, 'r') as zipf:
for mapping in self._datasource.mappings_for_url(url):
local_file_name = os.path.join(self.cache.abspath,
mapping['generated_filename'])
if overwrite or not os.path.exists(local_file_name):
if mapping['parent_zipfile']:
# The downloaded ZIP archive contains zip files. We
# need to extract the nested zip file.
zipf.extract(mapping['parent_zipfile'],
self.cache.abspath)
parent_zipfile_path = os.path.join(self.cache.abspath,
mapping['parent_zipfile'])
with ZipFile(parent_zipfile_path, 'r') as parent_zipf:
parent_zipf.extract(mapping['raw_extracted_filename'],
self.cache.abspath)
# TODO: Delete the nested zip file?

else:
zipf.extract(mapping['raw_extracted_filename'],
self.cache.abspath)
extracted_file_name = os.path.join(self.cache.abspath,
mapping['raw_extracted_filename'])
os.rename(extracted_file_name, local_file_name)
print "Added to cache: %s" % local_file_name
else:
print "File is cached: %s" % local_file_name

if remove:
os.remove(zip_fname)
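
A sketch of how the fetcher and datasource fit together (a hypothetical driver loop; in practice the project's fetch task wires these up):

from openelex.us.wa.datasource import Datasource
from openelex.us.wa.fetch import FetchResults

fetcher = FetchResults()
# Zip URLs are downloaded once, and every mapped member file is extracted
# into the cache; non-zip URLs are fetched directly under their
# standardized filenames.
for fname, url in Datasource().filename_url_pairs(2012):
    fetcher.fetch(url, fname)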
16 changes: 16 additions & 0 deletions openelex/us/wa/mappings/README.md
@@ -0,0 +1,16 @@
## url\_paths.csv

In addition to the fields found in other states, I added some extra columns.

The main reason for these fields is that some elections have precinct-level results files that, for many counties, don't cover any offices of interest. I wanted a record that the files exist, along with a mechanism to exclude them from the datasource mappings.

The additional columns are:

* filename: Raw filename. Added in case we need to regenerate the URLs, or to avoid parsing the filename out of the URL.
* has\_statwide\_results: Does this file contain statewide results that OpenElections is interested in?
* skip: Should this file be skipped when defining datasource mappings?
* needs\_preprocessing: File needs to be preprocessed before it can be loaded, usually because it's a PDF.
* raw\_extracted\_filename: File within an archive that will ultimately be extracted and saved to the cache.
* parent\_zipfile: Some results archives contain two levels of zip files. We need to know the extracted filename's parent zip archive to properly cache the file.
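
For illustration, here's a hypothetical row exercising the custom columns (the standard columns are omitted, and the URL and values below are made up; see the actual url\_paths.csv for the real layout):

    url,reporting_level,jurisdiction,filename,has_statwide_results,skip,needs_preprocessing,raw_extracted_filename,parent_zipfile
    http://example.com/2012Gen_Precincts.zip,precinct,Adams,2012Gen_Precincts.zip,FALSE,TRUE,FALSE,Adams_Precincts.txt,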

