Permalink
Browse files

Make scraper more robust: handle a comment that ends in an HTML tag (the last element's tail is None)

  • Loading branch information...
1 parent 7157af7 commit 00b4296f6fe3723ce32a87ce071cc41518df9eb9 @cygri cygri committed Aug 2, 2012
Showing with 3 additions and 1 deletion.
  1. +3 −1 scrapers/schema_scraper.py
@@ -58,7 +58,9 @@ def get_type_details(url):
type['comment_plain'] = el.tail
while el.getnext().tag not in ['div', 'h3', 'table']:
type['comment'] += lxml.etree.tostring(el.getnext())
- type['comment_plain'] += el.getnext().text_content() + el.getnext().tail
+ type['comment_plain'] += el.getnext().text_content();
+ if el.getnext().tail != None:
+ type['comment_plain'] += el.getnext().tail
el = el.getnext()
if type['comment'] == None:
print >> sys.stderr, 'WARNING: No comment in type ' + id

0 comments on commit 00b4296

Please sign in to comment.