Skip to content

Commit

Permalink
IN: Handle yet another hosed legislator page.
Browse files Browse the repository at this point in the history
  • Loading branch information
twneale committed Jul 28, 2014
1 parent 610f74e commit db1e42a
Showing 1 changed file with 26 additions and 8 deletions.
34 changes: 26 additions & 8 deletions openstates/in/legislators.py
Expand Up @@ -2,6 +2,7 @@
import datetime
import urlparse
import collections
import contextlib

import lxml.html

Expand Down Expand Up @@ -39,6 +40,19 @@ def scrape(self, chamber, term):
for option in doc.xpath('//optgroup[@id="%s"]/option' % optgroup):
self.scrape_legislator(chamber, term, option)

@contextlib.contextmanager
def scrapelib_settings(self, retry_attempts=0, timeout=0):
    """Temporarily override this scraper's retry and timeout settings.

    Intended for use as a context manager::

        with self.scrapelib_settings(retry_attempts=0, timeout=0):
            html = self.urlopen(url)

    On entry, ``self.retry_attempts`` and ``self.timeout`` are replaced
    with the supplied values. On exit they are set back to whatever they
    were before the block was entered.

    :param retry_attempts: value assigned to ``self.retry_attempts`` for
        the duration of the ``with`` block.
    :param timeout: value assigned to ``self.timeout`` for the duration
        of the ``with`` block.
    """
    # Store existing settings so they can be put back afterwards.
    saved_retry_attempts = self.retry_attempts
    saved_timeout = self.timeout
    # Override them.
    self.retry_attempts = retry_attempts
    self.timeout = timeout
    try:
        yield
    finally:
        # Restore in a ``finally`` clause: when the ``with`` body raises,
        # the exception is re-raised at the ``yield`` above, and without
        # this the overrides would leak into every later request made
        # by this scraper.
        self.retry_attempts = saved_retry_attempts
        self.timeout = saved_timeout

def scrape_legislator(self, chamber, term, option):
url = urlparse.urljoin(self.url, option.attrib['value'])
name, party, district = re.split(r'\s*,\s*', option.text.strip())
Expand All @@ -52,15 +66,19 @@ def scrape_legislator(self, chamber, term, option):
leg.add_source(self.url)

# Scrape leg page.
try:
html = self.urlopen(url, timeout=5)
except scrapelib.HTTPError as exc:
# As of July 2014, this only happens when a page has
# gone missing from their varnish server.
if exc.response.status_code is 503:
with self.scrapelib_settings(retry_attempts=0, timeout=0):
try:
html = self.urlopen(url)
except scrapelib.HTTPError as exc:
# As of July 2014, this only happens when a page has
# gone missing from their varnish server.
if exc.response.status_code is 503:
return
except:
# In addition, there's just no acceptable reason for this
# request to fail.
self.logger.warning('Skipping legislator at url: %s' % url)
return
else:
raise exc

doc = lxml.html.fromstring(html)
doc.make_links_absolute(self.url)
Expand Down

0 comments on commit db1e42a

Please sign in to comment.