Permalink
Browse files

MN: 2013 fixes

  • Loading branch information...
1 parent e69f91d commit f7516517c1beeb26bbaea17f57ca0869a1204caa @jamesturk jamesturk committed Jan 4, 2013
Showing with 72 additions and 65 deletions.
  1. +1 −1 openstates/mn/__init__.py
  2. +71 −64 openstates/mn/bills.py
@@ -18,7 +18,7 @@
'biennium': '86',
},
{'name': '2011-2012',
- 'sessions': ['2011-2012', '2011s1'],
+ 'sessions': ['2011-2012', '2011s1', '2012s1'],
'start_year': 2011,
'end_year': 2012,
'biennium': '87',
View
@@ -57,7 +57,8 @@ def get_bill_topics(self, chamber, session):
BILL_DETAIL_URL_BASE, search_chamber, search_session, value)
with self.urlopen(opt_url) as opt_html:
opt_doc = lxml.html.fromstring(opt_html)
- for bill in opt_doc.xpath('//table[@width="80%"]/tr/td[2]/a/font/text()'):
+ for bill in opt_doc.xpath('//table/tr/td[2]/a/text()'):
+ bill = re.sub(r'(\w+?)0*(\d+)', r'\1\2', bill)
self._subject_mapping[bill].append(subject)
def extract_bill_actions(self, doc, current_chamber):
@@ -73,52 +74,52 @@ def extract_bill_actions(self, doc, current_chamber):
"""
bill_actions = list()
- action_tables = doc.xpath('//table[@summary="Actions"]')
+ action_tables = doc.xpath('//table[@class="actions"]')
for cur_table in action_tables:
- for row in cur_table.xpath('tr')[1:]:
+ for row in cur_table.xpath('.//tr'):
bill_action = dict()
# split up columns
- (date_col, action_col, desc_col, text_col,
- page_col, rc_col) = row.xpath('td')
+ date_col, the_rest = row.xpath('td')
action_date = date_col.text_content().strip()
- action_text = action_col.text_content().strip()
- description = desc_col.text_content().strip()
+ action_text = the_rest.text.strip()
+ extra = ''.join(the_rest.xpath('span[not(@style)]/text()'))
# skip non-actions (don't have date)
if action_text in ('Chapter number', 'See also', 'See',
'Effective date', 'Secretary of State'):
continue
- # dates are really inconsistent here
+ # dates are really inconsistent here, sometimes in action_text
try:
action_date = datetime.datetime.strptime(action_date,
'%m/%d/%Y')
except ValueError:
try:
- action_date = datetime.datetime.strptime(description,
+ action_date = datetime.datetime.strptime(extra,
'%m/%d/%y')
except ValueError:
try:
action_date = datetime.datetime.strptime(
- description, '%m/%d/%Y')
+ extra, '%m/%d/%Y')
except ValueError:
self.warning('ACTION without date: %s' %
action_text)
continue
+
# categorize actions
action_type = 'other'
for pattern, atype in self._categorizers:
if re.match(pattern, action_text):
action_type = atype
if 'committee:referred' in action_type:
- bill_action['committees'] = description
+ bill_action['committees'] = extra
break
- if description:
- action_text += ' ' + description
+ if extra:
+ action_text += ' ' + extra
bill_action['action_text'] = action_text
if isinstance(action_type, list):
for atype in action_type:
@@ -151,56 +152,62 @@ def get_bill_info(self, chamber, session, bill_detail_url, version_list_url):
"""
if chamber == "House":
chamber = 'lower'
+ other_chamber = 'upper'
else:
chamber = 'upper'
+ chamber = 'lower'
- with self.urlopen(bill_detail_url) as bill_html:
- doc = lxml.html.fromstring(bill_html)
-
- bill_id = doc.xpath('//title/text()')[0].split()[0]
- bill_title = doc.xpath('//font[@size=-1]/text()')[0]
- bill_type = {'F': 'bill', 'R':'resolution',
- 'C': 'concurrent resolution'}[bill_id[1]]
- bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
- bill['subjects'] = self._subject_mapping[bill_id]
- bill.add_source(bill_detail_url)
-
- # grab sponsors
- sponsors = doc.xpath('//table[@summary="Show Authors"]/descendant::a/text()')
- if sponsors:
- primary_sponsor = sponsors[0].strip()
- bill.add_sponsor('primary', primary_sponsor, chamber=chamber)
- cosponsors = sponsors[1:]
- for leg in cosponsors:
- bill.add_sponsor('cosponsor', leg.strip(), chamber=chamber)
-
- # Add Actions performed on the bill.
- bill_actions = self.extract_bill_actions(doc, chamber)
- for action in bill_actions:
- kwargs = {}
- if 'committee' in action:
- kwargs['committees'] = action['committees']
-
- bill.add_action(action['action_chamber'],
- action['action_text'],
- action['action_date'],
- type=action['action_type'],
- **kwargs)
+ bill_html = self.urlopen(bill_detail_url)
+ doc = lxml.html.fromstring(bill_html)
+
+ bill_id = doc.xpath('//h1/text()')[0]
+ bill_title = doc.xpath('//h2/following-sibling::p/text()')[0].strip()
+ bill_type = {'F': 'bill', 'R':'resolution',
+ 'C': 'concurrent resolution'}[bill_id[1]]
+ bill = Bill(session, chamber, bill_id, bill_title, type=bill_type)
+ bill['subjects'] = self._subject_mapping[bill_id]
+ bill.add_source(bill_detail_url)
+
+ # grab sponsors
+ sponsors = doc.xpath('//h2[text()="Authors"]/following-sibling::ul[1]/li/a/text()')
+ if sponsors:
+ primary_sponsor = sponsors[0].strip()
+ bill.add_sponsor('primary', primary_sponsor, chamber=chamber)
+ cosponsors = sponsors[1:]
+ for leg in cosponsors:
+ bill.add_sponsor('cosponsor', leg.strip(), chamber=chamber)
+
+ other_sponsors = doc.xpath('//h3[contains(text(), "Authors")]/following-sibling::ul[1]/li/a/text()')
+ for leg in other_sponsors:
+ bill.add_sponsor('cosponsor', leg.strip(), chamber=other_chamber)
+
+ # Add Actions performed on the bill.
+ bill_actions = self.extract_bill_actions(doc, chamber)
+ for action in bill_actions:
+ kwargs = {}
+ if 'committee' in action:
+ kwargs['committees'] = action['committees']
+
+ bill.add_action(action['action_chamber'],
+ action['action_text'],
+ action['action_date'],
+ type=action['action_type'],
+ **kwargs)
# Get all versions of the bill.
# Versions of a bill are on a separate page, linked to from the column
# labeled, "Bill Text", on the search results page.
- with self.urlopen(version_list_url) as version_html:
- if 'resolution' in version_html.response.url:
- bill.add_version('resolution text', version_html.response.url,
+ version_html = self.urlopen(version_list_url)
+ if 'resolution' in version_html.response.url:
+ bill.add_version('resolution text', version_html.response.url,
+ mimetype='text/html')
+ else:
+ version_doc = lxml.html.fromstring(version_html)
+ for v in version_doc.xpath('//a[starts-with(@href, "/bin/getbill.php")]'):
+ version_url = urlparse.urljoin(VERSION_URL_BASE,
+ v.get('href'))
+ bill.add_version(v.text.strip(), version_url,
mimetype='text/html')
- else:
- version_doc = lxml.html.fromstring(version_html)
- for v in version_doc.xpath('//a[starts-with(@href, "/bin/getbill.php")]'):
- version_url = urlparse.urljoin(VERSION_URL_BASE,
- v.get('href'),
- mimetype='text/html')
- bill.add_version(v.text.strip(), version_url)
self.save_bill(bill)
@@ -232,17 +239,17 @@ def scrape(self, chamber, session):
url = search_url % (search_chamber, search_session, start,
start+stride, bill_type)
- with self.urlopen(url) as html:
- doc = lxml.html.fromstring(html)
+ html = self.urlopen(url)
+ doc = lxml.html.fromstring(html)
- # get table containing bills
- rows = doc.xpath('//table[@width="80%"]/tr')[1:]
- total_rows.extend(rows)
+ # get table containing bills
+ rows = doc.xpath('//table/tr')[1:]
+ total_rows.extend(rows)
- # out of rows
- if len(rows) == 0:
- self.debug("Total Bills Found: %d" % len(total_rows))
- break
+ # out of rows
+ if len(rows) == 0:
+ self.debug("Total Bills Found: %d" % len(total_rows))
+ break
# process each row
for row in total_rows:

0 comments on commit f751651

Please sign in to comment.