Permalink
Browse files

merge conflicts fixed

  • Loading branch information...
2 parents 9ac3ced + d559b36 commit 8d4777ae1cd38db8e44b259090d0f1a3aad7f9be @jamesturk jamesturk committed Feb 6, 2013
Showing with 7,223 additions and 5,128 deletions.
  1. +54 −0 billy_settings.py
  2. +12 −5 experimental/legatron/entities.py
  3. +41 −19 experimental/legatron/models.py
  4. +15 −8 experimental/legatron/scrape.py
  5. +201 −201 manual_data/districts/ma.csv
  6. +5 −3 openstates/ar/bills.py
  7. +4 −1 openstates/md/bills.py
  8. +17 −0 openstates/wa/bills.py
  9. +1 −5 openstates/wv/__init__.py
  10. +1 −0 requirements-site.txt
  11. +27 −0 scripts/check_district_boundaries.py
  12. +133 −49 scripts/purge_old_committee_ids.py
  13. +67 −0 scripts/purge_old_events_bill_ids.py
  14. +19 −0 site/media/ie8.css
  15. BIN site/media/images/bg.jpg
  16. BIN site/media/images/bg_appBox.png
  17. BIN site/media/images/bg_bill1.png
  18. BIN site/media/images/bg_bill2.png
  19. BIN site/media/images/bg_bill3.png
  20. BIN site/media/images/bg_bill4.png
  21. BIN site/media/images/bg_bill5.png
  22. BIN site/media/images/bg_dataBox.png
  23. BIN site/media/images/bg_footer.jpg
  24. BIN site/media/images/bg_map.jpg
  25. BIN site/media/images/bg_resources.png
  26. BIN site/media/images/bg_resourcesTitle.png
  27. BIN site/media/images/bg_sunlightLogo.png
  28. BIN site/media/images/bg_tagline.jpg
  29. BIN site/media/images/bg_tagline_star.png
  30. BIN site/media/images/btn_calendar_prevNext.png
  31. BIN site/media/images/btn_locate.png
  32. BIN site/media/images/btn_searchAddress.png
  33. BIN site/media/images/hat.jpg
  34. +0 −1,989 site/media/images/openstatesmap.svg
  35. BIN site/media/images/placeholder.png
  36. BIN site/media/images/placeholder_calendar.png
  37. BIN site/media/images/productof.jpg
  38. BIN site/media/images/state/connecticut.jpg
  39. +4 −0 site/media/js/d3.v3.min.js
  40. +0 −1,833 site/media/js/davis.js
  41. +4 −0 site/media/js/jquery-1.9.0.min.js
  42. +99 −0 site/media/js/jquery.calendar-widget.js
  43. +79 −68 site/media/js/jquery.pjax.min.js
  44. +150 −18 site/media/js/main.js
  45. +921 −99 site/media/main.css
  46. +21 −57 site/media/select2-2.0/select2.css
  47. +12 −32 site/openstates_site/settings.py
  48. +5 −0 site/openstates_site/urls.py
  49. +71 −0 site/templates/billy/web/public/_event.html
  50. +2 −2 site/templates/billy/web/public/_favorite.html
  51. +24 −0 site/templates/billy/web/public/_favorite_short.html
  52. +13 −0 site/templates/billy/web/public/_legislator_vote_column.html
  53. +10 −0 site/templates/billy/web/public/_mimetype_tag.html
  54. +9 −8 site/templates/billy/web/public/_notification_preference.html
  55. +15 −4 site/templates/billy/web/public/_recently_introduced.html
  56. +21 −6 site/templates/billy/web/public/_recently_passed.html
  57. +15 −0 site/templates/billy/web/public/_vote_chart_table.html
  58. +51 −22 site/templates/billy/web/public/base.html
  59. +164 −115 site/templates/billy/web/public/bill.html
  60. +107 −24 site/templates/billy/web/public/bills_list.html
  61. +21 −10 site/templates/billy/web/public/bio_blurb.html
  62. +17 −8 site/templates/billy/web/public/committee.html
  63. +9 −1 site/templates/billy/web/public/committees-pjax.html
  64. +10 −4 site/templates/billy/web/public/committees.html
  65. +1 −1 site/templates/billy/web/public/committees_table.html
  66. +20 −11 site/templates/billy/web/public/document.html
  67. +45 −73 site/templates/billy/web/public/event.html
  68. +9 −0 site/templates/billy/web/public/events-pjax.html
  69. +164 −8 site/templates/billy/web/public/events.html
  70. +0 −6 site/templates/billy/web/public/events_list_row.html
  71. +1 −1 site/templates/billy/web/public/feed_entry.html
  72. +32 −27 site/templates/billy/web/public/find_your_legislator.html
  73. +1 −1 site/templates/billy/web/public/find_your_legislator_table.html
  74. +79 −43 site/templates/billy/web/public/homepage.html
  75. +147 −116 site/templates/billy/web/public/legislator.html
  76. +2 −2 site/templates/billy/web/public/legislator_table.html
  77. +6 −1 site/templates/billy/web/public/legislators-pjax.html
  78. +16 −15 site/templates/billy/web/public/legislators.html
  79. +1 −1 site/templates/billy/web/public/legislators_list_row.html
  80. +2 −2 site/templates/billy/web/public/legislators_list_row_with_abbr.html
  81. +0 −36 site/templates/billy/web/public/login.html
  82. +76 −66 site/templates/billy/web/public/region.html
  83. +1 −1 site/templates/billy/web/public/region_select_form.html
  84. +2 −6 site/templates/billy/web/public/search_results_bills_legislators.html
  85. +3 −2 site/templates/billy/web/public/sources.html
  86. +58 −54 site/templates/billy/web/public/user_favorites.html
  87. +50 −45 site/templates/billy/web/public/user_profile.html
  88. +10 −19 site/templates/billy/web/public/vote.html
  89. +3 −0 site/templates/flat/howto-make-svg-embed.txt
  90. +2,004 −0 site/templates/flat/openstatesmap-embed.svg
  91. +2,039 −0 site/templates/flat/openstatesmap.svg
View
@@ -27,6 +27,60 @@
ENABLE_ELASTICSEARCH = True
BOUNDARY_SERVICE_SETS = 'sldl,sldu'
+ENABLE_DOCUMENT_VIEW = {
+ 'ak': True,
+ 'al': True,
+ 'ar': False, # revisit
+ 'az': True,
+ 'ca': False,
+ 'co': False, # revisit
+ 'ct': True,
+ 'dc': True,
+ 'de': True,
+ 'fl': True,
+ 'ga': False,
+ 'hi': True,
+ 'ia': True,
+ 'id': True,
+ 'il': True,
+ 'in': True,
+ 'ks': True,
+ 'ky': True,
+ 'la': False, # revisit
+ 'ma': False,
+ 'md': True,
+ 'me': True,
+ 'mi': True,
+ 'mn': False,
+ 'mo': True,
+ 'ms': True,
+ 'mt': True,
+ 'nc': True,
+ 'nd': True,
+ 'ne': True,
+ 'nh': True,
+ 'nj': True,
+ 'nm': True,
+ 'nv': True,
+ 'ny': False,
+ 'oh': True,
+ 'ok': True,
+ 'or': True,
+ 'pa': False, # revisit
+ 'pr': True,
+ 'ri': False, # revisit
+ 'sc': True,
+ 'sd': False,
+ 'tn': True,
+ 'tx': False, # revisit
+ 'ut': True,
+ 'va': False,
+ 'vt': True,
+ 'wa': False, # revisit
+ 'wi': True,
+ 'wv': False,
+ 'wy': True
+}
try:
from billy_local import *
@@ -35,10 +35,11 @@ class BogusEntry(Exception):
'''Raised when an entry lacks a required attribute, like 'link'.'''
-def new_feed_id(entry, cache={}):
+def new_entry_id(entry, cache={}):
'''Generate an entry id using the hash value of the title and link.
'''
- return hashlib.md5(entry['link'] + entry['title']).hexdigest()
+ s = (entry['link'] + entry['title']).encode('ascii', 'ignore')
+ return hashlib.md5(s).hexdigest()
PATH = dirname(abspath(__file__))
@@ -275,7 +276,7 @@ def process_entry(self, entry):
entry['save_time'] = datetime.datetime.utcnow()
try:
- entry['_id'] = new_feed_id(entry)
+ entry['_id'] = new_entry_id(entry)
except BogusEntry:
# This entry appears to be malformed somehow. Skip.
msg = 'Skipping malformed feed: %s'
@@ -285,7 +286,10 @@ def process_entry(self, entry):
entry['_type'] = 'feedentry'
- entry['summary'] = clean_html(entry['summary'])
+ try:
+ entry['summary'] = clean_html(entry['summary'])
+ except KeyError:
+ return
try:
entry['summary_detail']['value'] = clean_html(
entry['summary_detail']['value'])
@@ -323,7 +327,10 @@ def process_entry(self, entry):
# Save
msg = 'Found %d related entities in %r'
- self.logger.info(msg % (len(ids), entry['title']))
+ if ids:
+ self.logger.info(msg % (len(ids), entry['title']))
+ else:
+ self.logger.debug(msg % (len(ids), entry['title']))
return entry
# feed_db.entries.save(entry)
@@ -112,7 +112,7 @@ def _initial_save(self):
'''
spec = dict(url=self.url)
update = {'$set': spec}
- self.logger.info('feed._initial_save %r' % self.url)
+ self.logger.debug('feed._initial_save %r' % self.url)
doc = feeds_db.feeds.find_and_modify(
spec, update, upsert=True, new=True)
self.mongo_id = doc['_id']
@@ -127,21 +127,27 @@ def _get_feed(self):
except Exception:
tb = traceback.format_exc()
self._handle_fetch_exception(tb)
+ self._update_report_after_fetch()
else:
self.succeeded = True
# XXX: This will fail if the text isn't a valid feed.
data = feedparser.parse(text)
self._data = data
-
- self._update_report_after_fetch()
+ self._update_report_after_fetch()
+ return data
@property
def data(self):
'''The parsed feed contents.
'''
data = getattr(self, '_data', None)
- return data or self._get_feed()
+ return data or self._get_feed() or {}
+
+ def is_valid(self):
+ '''Does this hot garbage contain the keys we expect?
+ '''
+ return 'title' in self.data.get('feed', {})
def _handle_fetch_exception(self, _traceback):
'''If the fetch fails, log the exception and store the traceback for
@@ -182,11 +188,13 @@ def finish_report(self):
'''
def save(self):
+ '''Update the feed record with the latest report.
'''
- '''
+ if not self.is_valid():
+ return
spec = dict(url=self.url)
update = {'$set': self.report}
- self.logger.info('feed.finish_report %r' % self.url)
+ self.logger.debug('feed.finish_report %r' % self.url)
feeds_db.feeds.find_and_modify(spec, update, upsert=True, new=True)
self.logger.info('feed.save: %r' % self.url)
@@ -205,12 +213,22 @@ class Entry(object):
def __init__(self, entry, feed):
self.entry = entry
self.feed = feed
- self.report = {}
+ self.report = {
+ 'entities': {
+ 'count' : 0,
+ }
+ }
# Whether a fetch of the full text was tried and succeeded.
self.tried = False
self.succeeded = None
+ def is_valid(self):
+ '''Does this hot garbage contain the keys we expect?
+ '''
+ valid = set(['summary', 'link', 'title'])
+ return valid < set(self.entry)
+
@staticmethod
def blast_cache(self):
'''Just in case you want to blast the entries cache.
@@ -221,6 +239,7 @@ def mongo_id(self):
'''Get a unique mongo id based on this entry's url and title.
'''
s = self.entry['link'] + self.entry['title']
+ s = s.encode('ascii', 'ignore')
return hashlib.md5(s).hexdigest()
def is_new(self):
@@ -232,14 +251,15 @@ def is_new(self):
is_new = True
else:
is_new = False
- self.logger.info('is_new? %r --> %r' % (mongo_id, is_new))
+ self.logger.debug('is_new? %r --> %r' % (mongo_id, is_new))
+ return is_new
def _get_full_text(self):
'''Just for experimenting at this point. Fetch the full text,
log any exception that occurs, and store the details regarding the
outcome of the fetch on the object.
'''
- self.logger.info('entry GET %r' % self.entry.link)
+ self.logger.debug('entry GET %r' % self.entry.link)
try:
html = self.session.get(self.entry.link).text
except Exception:
@@ -272,6 +292,7 @@ def _update_report_after_fetch(self):
'url': self.url,
'entity_count': len(self['entity_ids'])
}
+
if self.tried:
last_fetch = {
'succeeded': self.succeeded,
@@ -287,23 +308,29 @@ def serializable(self):
json serialized.
'''
# Add the feed's id to make the entry and its feed joinable.
- ret = dict(feed_id=self.feed.mongo_id)
+ ret = {}
+ ret['feed_id'] = self.feed.mongo_id
# Convert unserializable timestructs into datetimes.
for k, v in self.entry.items():
if isinstance(v, time.struct_time):
t = time.mktime(self.entry[k])
dt = datetime.datetime.fromtimestamp(t)
ret[k] = dt
+ elif '.' not in k:
+ ret[k] = v
+
return ret
def save_if_entities_found(self):
'''If the entry is previously unseen and the extractor finds entities
have been mentioned, save, otherwise do nothing.
'''
- if self.is_new() and self.entry['entity_ids']:
+ if self.is_valid() and self.is_new() and self.entry['entity_ids']:
feeds_db.entries.save(self.serializable())
- self.logger('entry.save_if_entities_found: %r' % self.entry.link)
+ msg = 'found %d entities: %r'
+ args = (len(self.entry['entity_ids']), self.entry.link)
+ self.logger.debug(msg % args)
def finish_report(self, abbr):
'''After attempting to extract entities, update the report and the
@@ -335,16 +362,11 @@ def finish_report(self, abbr):
report['entries']['count'] += 1
# If this is a new entry...
- if self.tried:
+ if self.is_new():
report['entries']['new'] += 1
if self.entry['entity_ids']:
report['entries']['relevant'] += 1
report['entities']['count'] += len(self.entry['entity_ids'])
+ self.report['entities']['count'] += len(self.entry['entity_ids'])
else:
report['entries']['old'] += 1
-
-
-
-
-
-
@@ -3,11 +3,19 @@
from os.path import dirname, abspath, join
import shutil
+from billy.core import logging
+
from models import Feed
from entities import Extractor
if __name__ == '__main__':
+ level = logging.DEBUG
+
+ logging.getLogger('billy.feed-model').setLevel(level)
+ logging.getLogger('billy.entry-model').setLevel(level)
+ logging.getLogger('billy.extractor').setLevel(level)
+
# The path where the news/blogs code and urls files are located.
PATH = dirname(abspath(__file__))
@@ -44,14 +52,13 @@
extractor = Extractor(abbr)
for url in urls:
feed = Feed(url, jurisdiction=abbr)
- import ipdb;ipdb.set_trace()
- et = list(feed.entries())
- if not et:
- import ipdb;ipdb.set_trace()
+ if not feed.is_valid():
+ continue
+
for entry in feed.entries():
- extractor.process_entry(entry.entry)
- entry.finish_report(abbr)
- entry.save_if_entities_found()
+ if entry.is_valid():
+ extractor.process_entry(entry.entry)
+ entry.finish_report(abbr)
+ entry.save_if_entities_found()
feed.finish_report()
feed.save()
-
Oops, something went wrong.

0 comments on commit 8d4777a

Please sign in to comment.