Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

Update IL scraper for older sessions #105

Merged
merged 4 commits into from Nov 7, 2011
View
@@ -13,9 +13,25 @@
'terms': [
{'name': '97th', 'sessions': ['97th'],
'start_year': 2011, 'end_year': 2012},
+ {'name': '96th', 'sessions': ['96th'],
+ 'start_year': 2009, 'end_year': 2010},
+ {'name': '95th', 'sessions': ['95th', 'Special_95th'],
+ 'start_year': 2007, 'end_year': 2008},
+ {'name': '94th', 'sessions': ['94th'],
+ 'start_year': 2005, 'end_year': 2006},
+ {'name': '93rd', 'sessions': ['93rd'],
+ 'start_year': 2003, 'end_year': 2004},
],
'feature_flags': [],
'session_details': {
- '97th': {'display_name': '97th Regular Session', 'session_id': '84'},
+ '97th': {'display_name': '97th Regular Session', 'params': { 'GA': '97', 'SessionId': '84' }},
+ '96th': {'display_name': '96th Regular Session', 'params': { 'GA': '96', 'SessionId': '76' }},
+ 'Special_96th': {'display_name': '96th Special Session', 'params': { 'GA': '96', 'SessionId': '82', 'SpecSess': '1' }},
+ '95th': {'display_name': '95th Regular Session', 'params': { 'GA': '95', 'SessionId': '51' }},
+ 'Special_95th': {'display_name': '95th Special Session', 'params': { 'GA': '95', 'SessionId': '52', 'SpecSess': '1' }},
+ '94th': {'display_name': '94th Regular Session', 'params': { 'GA': '94', 'SessionId': '50' }},
+ '93rd': {'display_name': '93rd Regular Session', 'params': { 'GA': '93', 'SessionId': '3' }},
+ 'Special_93rd': {'display_name': '93rd Special Session', 'params': { 'GA': '93', 'SessionID': '14', 'SpecSess': '1' }},
}
}
+
View
@@ -3,6 +3,7 @@
import os
import datetime
import lxml.html
+from urllib import urlencode
from billy.scrape.bills import BillScraper, Bill
from billy.scrape.votes import Vote
@@ -16,10 +17,6 @@ def group(lst, n):
yield tuple(val)
-# chamber prefix, doc id, session_id
-LEGISLATION_URL = ('http://ilga.gov/legislation/grplist.asp?num1=1&num2=10000&'
- 'DocTypeID=%s%s&SessionID=%s')
-
TITLE_REMOVING_PATTERN = re.compile(".*(Rep|Sen). (.+)$")
SPONSOR_PATTERN = re.compile("^(Added |Removed )?(.+Sponsor) (Rep|Sen). (.+)$")
@@ -61,29 +58,38 @@ def _categorize_action(action):
return atype
return 'other'
+LEGISLATION_URL = ('http://ilga.gov/legislation/grplist.asp')
-class ILBillScraper(BillScraper):
def build_url_for_legislation_list(metadata, chamber, session, doc_type):
    """Build the legislation listing URL for one document type in a session.

    Args:
        metadata: state metadata dict; ``metadata['session_details'][session]``
            may carry a ``'params'`` dict of session-specific query arguments.
        chamber: chamber name, handed to ``chamber_slug`` to obtain the
            one-letter prefix.
        session: key into ``metadata['session_details']``.
        doc_type: document type suffix appended to the chamber prefix to
            form the ``DocTypeID`` query argument.

    Returns:
        ``LEGISLATION_URL`` followed by ``?`` and the urlencoded query string.

    Raises:
        KeyError: if ``session`` is not present in
            ``metadata['session_details']``.
    """
    # Copy *before* adding anything: the session's 'params' dict belongs to
    # the shared metadata and must not pick up num1/num2 as a side effect of
    # building a URL.
    params = dict(metadata['session_details'][session].get('params', {}))
    params['num1'] = '1'
    params['num2'] = '10000'
    params['DocTypeID'] = '%s%s' % (chamber_slug(chamber), doc_type)
    return '?'.join([LEGISLATION_URL, urlencode(params)])
- state = 'il'
def chamber_slug(chamber):
    """Return the one-letter chamber prefix used when building DocTypeID.

    ``'lower'`` maps to ``'H'``; every other value maps to ``'S'``.
    """
    return 'H' if chamber == 'lower' else 'S'
+class ILBillScraper(BillScraper):
+ state = 'il'
+ def get_bill_urls(self, chamber, session, doc_type):
+ url = build_url_for_legislation_list(self.metadata, chamber, session, doc_type)
+ html = self.urlopen(url)
+ doc = lxml.html.fromstring(html)
+ doc.make_links_absolute(url)
+ for bill_url in doc.xpath('//li/a/@href'):
+ yield bill_url
+
def scrape(self, chamber, session):
# For every document type in DOC_TYPES, walk the bill URLs yielded by
# get_bill_urls and pass each one to scrape_bill, using the chamber prefix
# plus the document type as the doc_type argument.
# Lines marked '-' are the previous inline fetch/parse that this change
# removes; lines marked '+' are its replacement.
- session_id = self.metadata['session_details'][session]['session_id']
- chamber_slug = 'H' if chamber == 'lower' else 'S'
-
-
for doc_type in DOC_TYPES:
- url = LEGISLATION_URL % (chamber_slug, doc_type, session_id)
- html = self.urlopen(url)
- doc = lxml.html.fromstring(html)
- doc.make_links_absolute(url)
-
- for bill_url in doc.xpath('//li/a/@href'):
- self.scrape_bill(chamber, session, chamber_slug+doc_type,
- bill_url)
-
-
+ for bill_url in self.get_bill_urls(chamber, session, doc_type):
+ self.scrape_bill(chamber, session, chamber_slug(chamber)+doc_type, bill_url)
+
def scrape_bill(self, chamber, session, doc_type, url):
html = self.urlopen(url)
doc = lxml.html.fromstring(html)
@@ -0,0 +1,33 @@
+#!/usr/bin/env python
+
+import unittest
+from openstates.il import metadata
+from openstates.il.bills import DOC_TYPES, ILBillScraper
+import logging
+
+log = logging.getLogger('openstates.il.tests.test_bill_metadata')
+
class TestBillMetadata(unittest.TestCase):
    """Sanity check that something would get scraped for each session.

    Every session listed under ``metadata['terms']`` is queried through
    ``ILBillScraper.get_bill_urls`` for both chambers and all ``DOC_TYPES``.
    """

    def setUp(self):
        self.scraper = ILBillScraper(metadata)

    def test_lists(self):
        """Each (session, chamber) pair must list at least one bill URL."""
        # get_bill_urls expects chamber *names*: chamber_slug maps anything
        # other than 'lower' to 'S', so passing 'H'/'S' here would query the
        # Senate listings twice and never check the House.
        chambers = ['lower', 'upper']
        sessions = []
        for term in metadata['terms']:
            sessions.extend(term['sessions'])
        self.assertTrue(len(sessions) > 0, "Expected non-zero list of sessions")

        for session in sessions:
            for chamber in chambers:
                session_chamber_count = 0
                for doc_type in DOC_TYPES:
                    count = len(list(self.scraper.get_bill_urls(chamber, session, doc_type)))
                    log.info("Session: %s Chamber: %s Doc Type: %s Count: %i",
                             session, chamber, doc_type, count)
                    session_chamber_count += count
                self.assertTrue(
                    session_chamber_count > 0,
                    "Expected non-zero bill count for Session %s, Chamber %s" % (session, chamber))
+if __name__ == '__main__':
+ unittest.main()
+