Permalink
Browse files

AR, DE, HI, ID extract_text

  • Loading branch information...
1 parent ad201d3 commit 8843c01e001b62e693b172c0095b590c855b37a2 @jamesturk jamesturk committed Apr 24, 2012
Showing with 44 additions and 4 deletions.
  1. +10 −1 openstates/ar/__init__.py
  2. +11 −1 openstates/de/__init__.py
  3. +13 −1 openstates/hi/__init__.py
  4. +10 −1 openstates/id/__init__.py
@@ -1,4 +1,6 @@
import datetime
+from billy.fulltext import (pdfdata_to_text, oyster_text,
+ text_after_line_numbers)
metadata = dict(
name='Arkansas',
@@ -73,8 +75,15 @@ def session_list():
'//div[@id="ctl00_ctl15_g_95338513_84cb_48ec_85d1_4e6a889e8035_panel"]//a')
return [s.text_content() for s in sessions if s.text_content()]
+
+@oyster_text
+def extract_text(oyster_doc, data):
+    """Return the extracted text for an Arkansas document.
+
+    ``data`` is converted with ``pdfdata_to_text`` and the result is passed
+    through ``text_after_line_numbers``.  ``oyster_doc`` is accepted to
+    satisfy the ``extract_text`` signature but is not read here.
+
+    NOTE(review): presumably ``data`` is the raw PDF payload and the second
+    helper drops the line-number gutter -- confirm against billy.fulltext.
+    """
+    pdf_text = pdfdata_to_text(data)
+    return text_after_line_numbers(pdf_text)
+
+
document_class = dict(
AWS_PREFIX = 'documents/ar/',
update_mins = 7*24*60,
+ extract_text = extract_text,
onchanged = []
-)
+)
@@ -1,4 +1,6 @@
import datetime
+import lxml.html
+from billy.fulltext import oyster_text
metadata = dict(
name='Delaware',
@@ -33,8 +35,16 @@ def session_list():
sessions.remove("Session")
return sessions
+@oyster_text
+def extract_text(oyster_doc, data):
+    """Return the paragraph text of a Delaware HTML document.
+
+    Only documents whose ``oyster_doc['metadata']['mimetype']`` equals
+    ``'text/html'`` are handled; for any other mimetype the function
+    returns ``None``.  For HTML, the text content of every ``<p>`` element
+    whose ``class`` attribute is exactly ``MsoNormal`` is joined with a
+    single space.
+    """
+    if oyster_doc['metadata']['mimetype'] != 'text/html':
+        return None
+    tree = lxml.html.fromstring(data)
+    paragraphs = tree.xpath('//p[@class="MsoNormal"]')
+    return ' '.join(para.text_content() for para in paragraphs)
+
document_class = dict(
AWS_PREFIX = 'documents/de/',
update_mins = 7*24*60,
+ extract_text = extract_text,
onchanged = []
-)
+)
@@ -1,3 +1,6 @@
+from billy.fulltext import oyster_text
+import lxml.html
+
metadata = dict(
name='Hawaii',
abbreviation='hi',
@@ -45,8 +48,17 @@ def session_list():
sessions.remove("Archives Main")
return sessions
+@oyster_text
+def extract_text(oyster_doc, data):
+    """Return the body text of a Hawaii HTML document.
+
+    Only documents whose ``oyster_doc['metadata']['mimetype']`` equals
+    ``'text/html'`` are handled; for any other mimetype the function
+    returns ``None`` without touching ``data``.  For HTML, the text content
+    of the first ``<div class="Section2">`` is concatenated (no separator)
+    with that of the first ``<div class="Section3">``.
+
+    Raises:
+        IndexError: if the HTML lacks a ``Section2`` or ``Section3`` div.
+    """
+    # Check the mimetype before parsing: handing a non-HTML payload to lxml
+    # is wasted work and can raise instead of cleanly returning None.
+    if oyster_doc['metadata']['mimetype'] != 'text/html':
+        return None
+    doc = lxml.html.fromstring(data)
+    # [0] deliberately keeps the hard failure when a section is missing.
+    content = doc.xpath('//div[@class="Section2"]')[0].text_content()
+    content += doc.xpath('//div[@class="Section3"]')[0].text_content()
+    return content
+
document_class = dict(
AWS_PREFIX = 'documents/hi/',
update_mins = None,
+ extract_text = extract_text,
onchanged = []
-)
+)
@@ -1,4 +1,7 @@
import datetime
+from billy.fulltext import (pdfdata_to_text, oyster_text,
+ text_after_line_numbers)
+
metadata = dict(
name='Idaho',
abbreviation='id',
@@ -175,8 +178,14 @@ def session_list():
return url_xpath('http://legislature.idaho.gov/priorsessions.htm',
'//td[@width="95%"]/ul/li/a/text()')[:-1]
+
+@oyster_text
+def extract_text(oyster_doc, data):
+    """Return the extracted text for an Idaho document.
+
+    ``data`` is converted with ``pdfdata_to_text`` and the result is passed
+    through ``text_after_line_numbers``.  ``oyster_doc`` is accepted to
+    satisfy the ``extract_text`` signature but is not read here.
+
+    NOTE(review): presumably ``data`` is the raw PDF payload and the second
+    helper drops the line-number gutter -- confirm against billy.fulltext.
+    """
+    pdf_text = pdfdata_to_text(data)
+    return text_after_line_numbers(pdf_text)
+
document_class = dict(
AWS_PREFIX = 'documents/id/',
update_mins = 7*24*60,
+ extract_text = extract_text,
onchanged = []
-)
+)

0 comments on commit 8843c01

Please sign in to comment.