From f0ca526838163a7e80acd700bf9191df0076c182 Mon Sep 17 00:00:00 2001 From: Dan Scott Date: Thu, 1 Aug 2013 21:01:24 -0400 Subject: [PATCH 1/2] Use the rdfs:comment property to fetch type comments It seems likely that, at some point over the past year, the HTML structure of the type pages changed to use a <div>
element to embed the comment, which broke the scraper. This enables us to greatly simplify the scraping process, and to make it more consistent with properties via the use of get_inner_html(). Signed-off-by: Dan Scott --- scrapers/schema_scraper.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/scrapers/schema_scraper.py b/scrapers/schema_scraper.py index f5d4c33..3811b5d 100644 --- a/scrapers/schema_scraper.py +++ b/scrapers/schema_scraper.py @@ -53,19 +53,9 @@ def get_type_details(url): type['ancestors'] = [] for a in ancestor_links: type['ancestors'].append(a.text_content()) - el = root.cssselect("h1.page-title")[0] - type['comment'] = el.tail - type['comment_plain'] = el.tail - while el.getnext().tag not in ['div', 'h3', 'table']: - type['comment'] += lxml.etree.tostring(el.getnext()) - type['comment_plain'] += el.getnext().text_content(); - if el.getnext().tail != None: - type['comment_plain'] += el.getnext().tail - el = el.getnext() - if type['comment'] == None: - print >> sys.stderr, 'WARNING: No comment in type ' + id - type['comment'] = type['comment'].strip() - type['comment_plain'] = type['comment_plain'].strip() + el = root.cssselect("div[property='rdfs:comment']")[0] + type['comment'] = get_inner_html(el) + type['comment_plain'] = el.text_content().strip() type['instances'] = [] type['subtypes'] = [] for section in root.cssselect("h3"): From 7a636fbc7097e27ff00dc93e7943480a09648e15 Mon Sep 17 00:00:00 2001 From: Dan Scott Date: Thu, 1 Aug 2013 21:16:34 -0400 Subject: [PATCH 2/2] Properties have URLs too schema.org recently added URLs that resolve for properties, so let's track those as well. 
Signed-off-by: Dan Scott --- scrapers/schema_scraper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scrapers/schema_scraper.py b/scrapers/schema_scraper.py index 3811b5d..d7e8d5c 100644 --- a/scrapers/schema_scraper.py +++ b/scrapers/schema_scraper.py @@ -89,7 +89,8 @@ def get_type_details(url): 'domains': [id], 'ranges': re.sub('\s+', ' ', row.cssselect("td.prop-ect")[0].text_content()).strip().split(' or '), 'comment': get_inner_html(comment), - 'comment_plain': comment.text_content() + 'comment_plain': comment.text_content(), + 'url': base_url + id }) return type