
Merge remote-tracking branch 'origin/master'

2 parents 5fedb43 + 1140c39 · commit ce23541c9cc8719232c634d77cec0482960908b8 · @jamesturk committed Aug 31, 2012
Showing with 242 additions and 6 deletions.
  1. +4 −3 openstates/ia/__init__.py
  2. +0 −2 openstates/ia/bills.py
  3. +236 −0 openstates/ia/votes.py
  4. +2 −1 scripts/newsblogs/urls/ca.txt
@@ -26,8 +26,8 @@
'2011-2012': {'display_name': '2011-2012 Regular Session',
'_scraped_name': 'General Assembly: 84',
'number': '84',
- 'start_date': datetime.date(2011,1,10),
- 'end_date': datetime.date(2013,1,13),
+ 'start_date': datetime.date(2011, 1, 10),
+ 'end_date': datetime.date(2013, 1, 13),
},
},
feature_flags=['events', 'influenceexplorer'],
@@ -36,12 +36,13 @@
)
+
def session_list():
from billy.scrape.utils import url_xpath
import re
sessions = url_xpath(
'https://www.legis.iowa.gov/Legislation/Find/findLegislation.aspx',
- "//div[@id='ctl00_ctl00_ctl00_cphMainContent_cphCenterCol_cphCenterCol_ucGASelect_divLinks']/ul/li/a/text()" )
+ "//div[@id='ctl00_ctl00_ctl00_cphMainContent_cphCenterCol_cphCenterCol_ucGASelect_divLinks']/ul/li/a/text()")
sessions = [
re.findall(".*\(", session)[0][:-1].strip()
for session in sessions
@@ -46,7 +46,6 @@ def _build_subject_map(self, session):
for bill_id in bill_ids:
self._subjects[bill_id.replace(' ', '')].append(subject)
-
def scrape(self, chamber, session):
self._build_subject_map(session)
@@ -127,7 +126,6 @@ def scrape_bill(self, chamber, session, bill_id, url):
mimetype='application/pdf'
)
-
sponsors = page.xpath("string(//table[2]/tr[3])").strip()
sponsor_re = r'[\w-]+(?:, [A-Z]\.)?(?:,|(?: and)|\.$)'
for sponsor in re.findall(sponsor_re, sponsors):
@@ -0,0 +1,236 @@
+# -*- coding: utf8 -*-
+from datetime import datetime
+import re
+import collections
+
+import lxml.etree
+import lxml.html
+
+from billy.scrape.utils import convert_pdf
+from billy.scrape.votes import VoteScraper, Vote
+
+
+class IAVoteScraper(VoteScraper):
+ state = 'ia'
+
+ def scrape(self, chamber, session):
+
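+        # Dispatch to scrape_lower or scrape_upper based on chamber.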
+ getattr(self, 'scrape_%s' % chamber)(session)
+
+ def scrape_lower(self, session):
+ url = 'https://www.legis.iowa.gov/Legislation/journalIndex_House.aspx'
+ html = self.urlopen(url)
+ doc = lxml.html.fromstring(html)
+ doc.make_links_absolute(url)
+ urls = doc.xpath('//a[contains(@href, "DOCS")]/@href')[::-1]
+ for url in urls:
+ _, filename = url.rsplit('/', 1)
+ try:
+ date = datetime.strptime(filename, '%m-%d-%Y.pdf')
+            except ValueError:
+                msg = "%s doesn't smell like a date. Skipping."
+                self.logger.info(msg % filename)
+                continue
+            self.scrape_journal(url, 'lower', session, date)
+
+ def scrape_upper(self, session):
+ url = 'https://www.legis.iowa.gov/Legislation/journalIndex_Senate.aspx'
+ html = self.urlopen(url)
+ doc = lxml.html.fromstring(html)
+ doc.make_links_absolute(url)
+ urls = doc.xpath('//a[contains(@href, "DOCS")]/@href')[::-1]
+ for url in urls:
+ _, filename = url.rsplit('/', 1)
+ try:
+ date = datetime.strptime(filename, '%m-%d-%Y.pdf')
+            except ValueError:
+                msg = "%s doesn't smell like a date. Skipping."
+                self.logger.info(msg % filename)
+                continue
+            self.scrape_journal(url, 'upper', session, date)
+
+ def _journal_lines(self, etree):
+        '''A generator of text lines, skipping the first
+        three (header cruft) on each page.
+        '''
+ for page in etree:
+ for text in page.xpath('text')[3:]:
+ yield text
+
+ def scrape_journal(self, url, chamber, session, date):
+
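+        # Download the journal PDF and convert it to XML for parsing.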
+ filename, response = self.urlretrieve(url)
+ self.logger.info('Saved journal to %r' % filename)
+        # Assumes billy's convert_pdf supports pdftohtml's XML output mode.
+        xml = convert_pdf(filename, 'xml')
+ try:
+ et = lxml.etree.fromstring(xml)
+ except lxml.etree.XMLSyntaxError:
+ self.logger.warning('Skipping invalid pdf: %r' % filename)
+ return
+
+ lines = self._journal_lines(et)
+ while True:
+ try:
+ line = next(lines)
+ except StopIteration:
+ break
+
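+            # Flatten the XML element and its children into plain text.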
+ text = gettext(line)
+
+            # Only parse a vote if the question is about passing
+            # a bill, a resolution, or an amendment.
+            if 'Shall' not in text:
+                continue
+            if not any(phrase in text for phrase in
+                       ('bill pass?', 'resolution', 'amendment')):
+                continue
+
+ # Get the bill_id.
+ while True:
+ line = next(lines)
+ text += gettext(line)
+ m = re.search(r'\(\s*([A-Z\.]+\s+\d+)\s*\)', text)
+ if m:
+ bill_id = m.group(1)
+ break
+
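+            # Clean up the motion text: collapse whitespace, drop the
+            # trailing "(BILL ID)", and strip stray quotes.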
+ motion = text.strip()
+ motion = re.sub(r'\s+', ' ', motion)
+            motion, _ = motion.rsplit('(', 1)
+ motion = motion.replace('"', '')
+            motion = motion.replace(u'\u201c', '')
+ motion = motion.replace(u'\u201d', '')
+ motion = motion.replace(u' ,', ',')
+ motion = motion.strip()
+ motion = re.sub(r'On the question\s*', '', motion, flags=re.I)
+
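+            # Abbreviate chamber and file words in the bill id,
+            # e.g. "Senate" becomes "S".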
+ for word, letter in (('Senate', 'S'),
+ ('House', 'H'),
+ ('File', 'F')):
+ bill_id = bill_id.replace(word, letter)
+
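+            # Infer the bill's chamber from its id prefix and treat the
+            # motion as passed if yes votes are at least half the total.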
+ bill_chamber = dict(h='lower', s='upper')[bill_id.lower()[0]]
+ votes = self.parse_votes(lines)
+ totals = filter(lambda x: isinstance(x, int), votes.values())
+ passed = (1.0 * votes['yes_count'] / sum(totals)) >= 0.5
+ vote = Vote(motion=motion,
+ passed=passed,
+ chamber=chamber, date=date,
+ session=session, bill_id=bill_id,
+ bill_chamber=bill_chamber,
+ **votes)
+ vote.update(votes)
+ vote.add_source(url)
+ self.save_vote(vote)
+
+ def parse_votes(self, lines):
+
+ counts = collections.defaultdict(list)
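+        # DONE marks boundary phrases that end the whole vote block.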
+ DONE = 1
+ boundaries = [
+
+ # Senate journal.
+ ('Yeas', 'yes'),
+ ('Nays', 'no'),
+ ('Absent', 'other'),
+ ('Amendment', DONE),
+ ('Resolution', DONE),
+ ('Bill', DONE),
+
+ # House journal.
+ ('The ayes were', 'yes'),
+ ('The yeas were', 'yes'),
+ ('The nays were', 'no'),
+ ('Absent or not voting', 'other'),
+ ('The bill', DONE),
+ ('The joint resolution', DONE)]
+
+ def is_boundary(text):
+ for blurb, key in boundaries:
+ if text.strip().startswith(blurb):
+ return key
+
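+        # Skip ahead to the first boundary line of the vote block.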
+ while True:
+ line = next(lines)
+ text = gettext(line)
+ if is_boundary(text):
+ break
+
+ while True:
+ key = is_boundary(text)
+ if key is DONE:
+ break
+
+ # Get the vote count.
+ m = re.search(r'\d+', text)
+            if m:
+                votecount = int(m.group())
+            elif 'none' in text:
+                votecount = 0
+ counts['%s_count' % key] = votecount
+
+ # Get the voter names.
+ while True:
+ line = next(lines)
+ text = gettext(line)
+ if is_boundary(text):
+ break
+ elif not text.strip() or text.strip().isdigit():
+ continue
+ else:
+ for name in self.split_names(text):
+ counts['%s_votes' % key].append(name.strip())
+
+ return counts
+
+ def split_names(self, text):
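+        # Voter names are whitespace-separated; short chunks (initials)
+        # and particles like "Mr.", "Van", "De" are glued to their name.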
+ text = text.strip()
+ if ' ' not in text:
+ return [text]
+ else:
+ names = []
+ chunks = text.split()[::-1]
+ name = [chunks.pop()]
+ while chunks:
+ chunk = chunks.pop()
+ if len(chunk) < 3:
+ name.append(chunk)
+ elif name[-1] in ('Mr.', 'Van', 'De'):
+ name.append(chunk)
+ else:
+ names.append(' '.join(name))
+ name = [chunk]
+
+ name = ' '.join(name)
+ if name and (name not in names):
+ names.append(name)
+ return names
+
+
+def _get_chunks(el, buff=None, until=None):
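+    # Walk a pdftohtml XML element depth-first, yielding its text;
+    # <br> contributes a newline and each <text> element ends with one.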
+ tagmap = {'br': '\n'}
+ buff = buff or []
+
+ # Tag, text, tail, recur...
+ yield tagmap.get(el.tag, '')
+ yield el.text or ''
+    if el.text == until:
+        return
+ for kid in el:
+ for text in _get_chunks(kid, until=until):
+ yield text
+ if text == until:
+ return
+ if el.tail:
+ yield el.tail
+ if el.tail == until:
+ return
+ if el.tag == 'text':
+ yield '\n'
+
+
+def gettext(el):
+ return ''.join(_get_chunks(el))
@@ -379,4 +379,5 @@ http://asmdc.org/members/a78/rss
http://asmdc.org/members/a79/home-page/rss
http://asmdc.org/members/a80/home-page/rss
http://thomneale.com
-http://www.baycitizen.org/feeds/stories/
+http://www.baycitizen.org/feeds/stories/
+http://www.sanluisobispo.com/news/politics/index.rss
