Permalink
Browse files

MD: work towards 2013 session

  • Loading branch information...
1 parent d7b81ec commit 8c1a33bd5490409d467240abafcf8cdefc1e3173 @jamesturk jamesturk committed Jan 4, 2013
Showing with 91 additions and 141 deletions.
  1. +35 −39 manual_data/districts/md.csv
  2. +5 −2 openstates/md/__init__.py
  3. +7 −12 openstates/md/bills.py
  4. +44 −88 openstates/md/legislators.py
@@ -1,12 +1,23 @@
abbr,chamber,name,num_seats,boundary_id
-md,lower,4,3,sldl/md-4
-md,lower,5,3,sldl/md-5
+md,lower,1A,1,sldl/md-1a
+md,lower,1B,1,sldl/md-1b
+md,lower,1C,1,sldl/md-1c
+md,lower,2A,1,sldl/md-2a
+md,lower,2B,1,sldl/md-2b
+md,lower,2C,1,sldl/md-2c
+md,lower,3A,2,sldl/md-3a
+md,lower,3B,1,sldl/md-3b
+md,lower,4A,2,sldl/md-4a
+md,lower,4B,1,sldl/md-4b
+md,lower,5A,2,sldl/md-5a
+md,lower,5B,1,sldl/md-5b
md,lower,6,3,sldl/md-6
md,lower,7,3,sldl/md-7
md,lower,8,3,sldl/md-8
md,lower,10,3,sldl/md-10
md,lower,11,3,sldl/md-11
-md,lower,12,3,sldl/md-12
+md,lower,12A,2,sldl/md-12a
+md,lower,12B,1,sldl/md-12b
md,lower,13,3,sldl/md-13
md,lower,14,3,sldl/md-14
md,lower,15,3,sldl/md-15
@@ -17,55 +28,40 @@ md,lower,19,3,sldl/md-19
md,lower,20,3,sldl/md-20
md,lower,21,3,sldl/md-21
md,lower,22,3,sldl/md-22
+md,lower,23A,2,sldl/md-23a
+md,lower,23B,1,sldl/md-23b
md,lower,24,3,sldl/md-24
md,lower,25,3,sldl/md-25
md,lower,26,3,sldl/md-26
-md,lower,28,3,sldl/md-28
-md,lower,32,3,sldl/md-32
-md,lower,33,3,sldl/md-33
-md,lower,36,3,sldl/md-36
-md,lower,39,3,sldl/md-39
-md,lower,40,3,sldl/md-40
-md,lower,41,3,sldl/md-41
-md,lower,43,3,sldl/md-43
-md,lower,45,3,sldl/md-45
-md,lower,46,3,sldl/md-46
-md,lower,1A,1,sldl/md-1a
-md,lower,1B,1,sldl/md-1b
-md,lower,1C,1,sldl/md-1c
-md,lower,23A,2,sldl/md-23a
-md,lower,23B,1,sldl/md-23b
md,lower,27A,2,sldl/md-27a
md,lower,27B,1,sldl/md-27b
-md,lower,27C,1,sldl/md-27c
+md,lower,28,3,sldl/md-28
md,lower,29A,1,sldl/md-29a
md,lower,29B,1,sldl/md-29b
md,lower,29C,1,sldl/md-29c
-md,lower,2A,2,sldl/md-2a
-md,lower,2B,1,sldl/md-2b
-md,lower,30A,2,sldl/md-30a
-md,lower,30B,1,sldl/md-30b
-md,lower,31A,2,sldl/md-31a
-md,lower,31B,1,sldl/md-31b
+md,lower,30,3,sldl/md-30
+md,lower,31,3,sldl/md-31
+md,lower,32,3,sldl/md-32
+md,lower,33A,2,sldl/md-33a
+md,lower,33B,1,sldl/md-33b
md,lower,34A,2,sldl/md-34a
md,lower,34B,1,sldl/md-34b
md,lower,35A,2,sldl/md-35a
md,lower,35B,1,sldl/md-35b
-md,lower,37A,2,sldl/md-37a
-md,lower,37B,1,sldl/md-37b
+md,lower,36,3,sldl/md-36
+md,lower,37A,1,sldl/md-37a
+md,lower,37B,2,sldl/md-37b
md,lower,38A,1,sldl/md-38a
-md,lower,38B,1,sldl/md-38b
-md,lower,38C,1,sldl/md-38c
-md,lower,3A,2,sldl/md-3a
-md,lower,3B,1,sldl/md-3b
-md,lower,42A,2,sldl/md-42a
-md,lower,42B,1,sldl/md-42b
-md,lower,44A,2,sldl/md-44a
-md,lower,44B,1,sldl/md-44b
-md,lower,47A,2,sldl/md-47a
-md,lower,47B,1,sldl/md-47b
-md,lower,9A,2,sldl/md-9a
-md,lower,9B,1,sldl/md-9b
+md,lower,38B,2,sldl/md-38b
+md,lower,39,3,sldl/md-39
+md,lower,40,3,sldl/md-40
+md,lower,41,3,sldl/md-41
+md,lower,42,3,sldl/md-42
+md,lower,43,3,sldl/md-43
+md,lower,44,3,sldl/md-44
+md,lower,45,3,sldl/md-45
+md,lower,46,3,sldl/md-46
+md,lower,47,3,sldl/md-47
md,upper,1,1,sldu/md-1
md,upper,2,1,sldu/md-2
md,upper,3,1,sldu/md-3
@@ -15,7 +15,7 @@
'2009', '2010'],
'start_year': 2007, 'end_year': 2010},
{'name': '2011-2014', 'sessions': ['2011', '2011s1', '2012',
- '2012s1', '2012s2'],
+ '2012s1', '2012s2', '2013'],
'start_year': 2011, 'end_year': 2014},
],
session_details={
@@ -82,14 +82,17 @@
'type': 'special',
'_scraped_name': '2012 Special Session 2',
},
+ # regular sessions are 'primary'; 'special' is only for special sessions
+ '2013': {'display_name': '2013 Regular Session',
+ 'type': 'primary',
+ '_scraped_name': '2013 Regular Session',
+ },
},
feature_flags=['subjects', 'events', 'influenceexplorer'],
_ignored_scraped_sessions=['1996 Regular Session',
'1997 Regular Session',
'1998 Regular Session',
'1999 Regular Session',
'2000 Regular Session',
- '2013 Regular Session',
'2001 Regular Session',
'2002 Regular Session',
'2003 Regular Session',
View
@@ -158,14 +158,8 @@ def parse_bill_votes(self, doc, bill):
if (href and "votes" in href and href.endswith('htm') and
href not in seen_votes):
seen_votes.add(href)
- vote_url = href
-
- if bill['session'] in ('2007', '2007s1', '2008', '2009',
- '2010', '2011'):
- vote = self.parse_old_vote_page(vote_url)
- else:
- vote = self.parse_vote_page(vote_url)
- vote.add_source(vote_url)
+ vote = self.parse_vote_page(href)
+ vote.add_source(href)
bill.add_vote(vote)
@@ -222,7 +216,7 @@ def parse_vote_page(self, vote_url):
return vote
- def scrape_bill_old(self, chamber, session, bill_id, url):
+ def scrape_bill_2012(self, chamber, session, bill_id, url):
""" Creates a bill object """
if len(session) == 4:
session_url = session+'rs'
@@ -262,8 +256,9 @@ def scrape_bill_old(self, chamber, session, bill_id, url):
def scrape(self, chamber, session):
+ session_slug = session if 's' in session else session + 'rs'
- main_page = 'http://mgaleg.maryland.gov/webmga/frmLegislation.aspx?pid=legisnpage&tab=subject3&ys=' + session
+ main_page = 'http://mgaleg.maryland.gov/webmga/frmLegislation.aspx?pid=legisnpage&tab=subject3&ys=' + session_slug
chamber_prefix = 'S' if chamber == 'upper' else 'H'
html = self.urlopen(main_page)
doc = lxml.html.fromstring(html)
@@ -277,5 +272,5 @@ def scrape(self, chamber, session):
self.info('scraping %ss %s-%s', prefix, begin, end)
for number in range(int(begin), int(end)+1):
bill_id = prefix + str(number)
- url = 'http://mgaleg.maryland.gov/webmga/frmMain.aspx?id=%s&stab=01&pid=billpage&tab=subject3&ys=%s' % (bill_id, session)
- self.scrape_bill_old(chamber, session, bill_id, url)
+ url = 'http://mgaleg.maryland.gov/webmga/frmMain.aspx?id=%s&stab=01&pid=billpage&tab=subject3&ys=%s' % (bill_id, session_slug)
+ self.scrape_bill_2012(chamber, session, bill_id, url)
@@ -2,107 +2,63 @@
from collections import defaultdict
import lxml.html
-from billy.scrape import NoDataForPeriod
from billy.scrape.legislators import LegislatorScraper, Legislator
-PARTY_DICT = {'D': 'Democratic', 'R': 'Republican', 'I': 'Independent'}
+def _get_table_item(doc, name):
+ """ Return the first <td> following a <th> whose text contains `name`.
+
+ Meant for tables that keep their labels in a left-hand <th> column.
+ `name` is interpolated into the XPath inside double quotes, so it must
+ not itself contain a double quote.
+ Raises IndexError when no matching <th>/<td> pair exists in `doc`.
+ """
+ return doc.xpath('//th[contains(text(), "%s")]/following-sibling::td' % name)[0]
-BASE_URL = 'http://www.msa.md.gov'
class MDLegislatorScraper(LegislatorScraper):
jurisdiction = 'md'
latest_term = True
- def scrape(self, chamber, term):
- urls = {'lower': "http://www.msa.md.gov/msa/mdmanual/06hse/html/hseal.html",
- 'upper': "http://www.msa.md.gov/msa/mdmanual/05sen/html/senal.html"}
- detail_re = re.compile('\((R|D)\), (?:Senate President, )?(?:House Speaker, )?District (\w+)')
+ def scrape(self, term, chambers):
+ url = 'http://mgaleg.maryland.gov/webmga/frmmain.aspx?pid=legisrpage&tab=subject6'
- with self.urlopen(urls[chamber]) as html:
- doc = lxml.html.fromstring(html)
+ html = self.urlopen(url)
+ doc = lxml.html.fromstring(html)
+ doc.make_links_absolute(url)
+ sen_tbl, house_tbl = doc.xpath('//div[@class="legislrlist"]//table[@class="grid"]')
- # rest of data on this page is <li>s that have anchor tags
- for a in doc.xpath('//li/a'):
- link = a.get('href')
- # tags don't close so we get the <li> and <a> content and diff them
- name_text = a.text_content()
- detail_text = a.getparent().text_content().replace(name_text, '')
+ if 'upper' in chambers:
+ self.scrape_table(term, 'upper', sen_tbl)
+ if 'lower' in chambers:
+ self.scrape_table(term, 'lower', house_tbl)
- # ignore if it is not a valid link
- if link:
- # handle names
- names = name_text.split(',')
- last_name = names[0]
- first_name = names[1].strip()
- # TODO: try to trim first name to remove middle initial
- if len(names) > 2:
- suffixes = names[2]
- else:
- suffixes = ''
+ def scrape_table(self, term, chamber, tbl):
+ # skip first
+ for row in tbl.xpath('tr')[1:]:
+ leg_a, district, _, _ = row.xpath('td')
+ district = district.text
+ name = leg_a.text_content().strip()
+ leg_url = leg_a.xpath('a/@href')[0]
- # handle details
- details = detail_text.strip()
- party, district = detail_re.match(details).groups()
- party = PARTY_DICT[party]
+ # get details
+ html = self.urlopen(leg_url)
+ ldoc = lxml.html.fromstring(html)
+ ldoc.make_links_absolute(leg_url)
- leg_url = BASE_URL+link
+ # the site says 'Democrat'; normalize to the party name 'Democratic'
+ party = _get_table_item(ldoc, 'Party Affiliation:').text
+ if party == 'Democrat':
+ party = 'Democratic'
+ # address cell holds several text nodes; join them one per line
+ address = '\n'.join(_get_table_item(ldoc, 'Annapolis Address:').xpath('text()'))
+ phone = _get_table_item(ldoc, 'Phone').text # first number
+ email = ldoc.xpath('//a[contains(@href, "mailto:")]/@href')
+ if email:
+ # str.strip('mailto:') treats its argument as a set of characters and
+ # would also eat leading/trailing m, a, i, l, t, o and ':' from the
+ # address itself, so remove just the scheme prefix instead
+ email = email[0].replace('mailto:', '', 1).strip()
+ else:
+ email = ''
- leg = Legislator(term, chamber, district,
- ' '.join((first_name, last_name)),
- first_name, last_name,
- party=party, suffixes=suffixes,
- url=leg_url)
- leg.add_source(url=leg_url)
+ leg = Legislator(term, chamber, district, name, party=party,
+ url=leg_url, email=email)
+ leg.add_source(url=leg_url)
- with self.urlopen(leg_url) as leg_html:
- leg_doc = lxml.html.fromstring(leg_html)
- img_src = leg_doc.xpath('//img[@align="left"]/@src')
- if img_src:
- leg['photo_url'] = BASE_URL + img_src[0]
+ # photo
+ img_src = ldoc.xpath('//img[@class="sponimg"]/@src')
+ if img_src:
+ leg['photo_url'] = img_src[0]
- # address extraction
- # this is pretty terrible, we get address in a format that looks
- # like:
- # James Senate Office Building, Room 322
- # 11 Bladen St., Annapolis, MD 21401
- # (410) 841-3565, (301) 858-3565; 1-800-492-7122, ext. 3565 (toll free)
- # e-mail: george.edwards@senate.state.md.us
- # fax: (410) 841-3552, (301) 858-3552
- #
- # Western Maryland Railway Station, 13 Canal St., Room 304, Cumberland, MD 21502
- # (301) 722-4780; 1-866-430-9553 (toll free)
- # e-mail: george.edwards.district@senate.state.md.us
- # fax: (301) 722-4790
- # usually first ul, sometimes first p
- try:
- addr_lines = leg_doc.xpath('//ul')[0].text_content().strip().splitlines()
- except IndexError:
- addr_lines = leg_doc.xpath('//p')[0].text_content().strip().splitlines()
- addr_pieces = {'capitol': defaultdict(str),
- 'district': defaultdict(str)}
- addr_type = 'capitol'
- for line in addr_lines:
- if '(401)' in line or '(301)' in line:
- addr_pieces[addr_type]['phone'] = line
- elif 'toll free' in line:
- pass # skip stand alone 1-800 numbers
- elif 'e-mail' in line:
- addr_pieces[addr_type]['email'] = line.replace('email: ',
- '')
- elif 'fax' in line:
- addr_pieces[addr_type]['fax'] = line.replace('fax: ', '')
- elif line == '':
- addr_type = 'district'
- else:
- addr_pieces[addr_type]['address'] += '{0}\n'.format(line)
- if addr_pieces['capitol']:
- leg.add_office('capitol', 'Capitol Office',
- **addr_pieces['capitol'])
- leg['email'] = (addr_pieces['capitol']['email'] or
- addr_pieces['district']['email'] or
- None)
- if addr_pieces['district']:
- leg.add_office('district', 'District Office',
- **addr_pieces['district'])
-
- self.save_legislator(leg)
+ leg.add_office('capitol', 'Capitol Office', address=address,
+ phone=phone)
+ self.save_legislator(leg)

0 comments on commit 8c1a33b

Please sign in to comment.