Skip to content

Commit

Permalink
Merge pull request #48 from dbs/json_comments
Browse files (browse the repository at this point in the history)
JSON: comments for types, and property URLs
  • Loading branch information
mhausenblas committed Dec 11, 2013
2 parents d102ce1 + 7a636fb commit f67e711
Showing 1 changed file with 5 additions and 14 deletions.
19 changes: 5 additions & 14 deletions scrapers/schema_scraper.py
Expand Up @@ -53,19 +53,9 @@ def get_type_details(url):
type['ancestors'] = []
for a in ancestor_links:
type['ancestors'].append(a.text_content())
- el = root.cssselect("h1.page-title")[0]
- type['comment'] = el.tail
- type['comment_plain'] = el.tail
- while el.getnext().tag not in ['div', 'h3', 'table']:
- type['comment'] += lxml.etree.tostring(el.getnext())
- type['comment_plain'] += el.getnext().text_content();
- if el.getnext().tail != None:
- type['comment_plain'] += el.getnext().tail
- el = el.getnext()
- if type['comment'] == None:
- print >> sys.stderr, 'WARNING: No comment in type ' + id
- type['comment'] = type['comment'].strip()
- type['comment_plain'] = type['comment_plain'].strip()
+ el = root.cssselect("div[property='rdfs:comment']")[0]
+ type['comment'] = get_inner_html(el)
+ type['comment_plain'] = el.text_content().strip()
type['instances'] = []
type['subtypes'] = []
for section in root.cssselect("h3"):
Expand Down Expand Up @@ -99,7 +89,8 @@ def get_type_details(url):
'domains': [id],
'ranges': re.sub('\s+', ' ', row.cssselect("td.prop-ect")[0].text_content()).strip().split(' or '),
'comment': get_inner_html(comment),
- 'comment_plain': comment.text_content()
+ 'comment_plain': comment.text_content(),
+ 'url': base_url + id
})
return type

Expand Down

0 comments on commit f67e711

Please sign in to comment.