From f0ca526838163a7e80acd700bf9191df0076c182 Mon Sep 17 00:00:00 2001
From: Dan Scott <dan@coffeecode.net>
Date: Thu, 1 Aug 2013 21:01:24 -0400
Subject: [PATCH 1/2] Use the rdfs:comment property to fetch type comments

It seems likely that, at some point over the past year, the HTML
structure of the type pages changed to use a <div
property="rdfs:comment"> element to embed the comment, which broke the
scraper.

Selecting that element directly lets us greatly simplify the scraping
process and makes type comments consistent with property comments, which
are already extracted with get_inner_html().

Signed-off-by: Dan Scott <dan@coffeecode.net>
---
 scrapers/schema_scraper.py | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)
diff --git a/scrapers/schema_scraper.py b/scrapers/schema_scraper.py
index f5d4c33..3811b5d 100644
--- a/scrapers/schema_scraper.py
+++ b/scrapers/schema_scraper.py
@@ -53,19 +53,9 @@ def get_type_details(url):
     type['ancestors'] = []
     for a in ancestor_links:
         type['ancestors'].append(a.text_content())
-    el = root.cssselect("h1.page-title")[0]
-    type['comment'] = el.tail
-    type['comment_plain'] = el.tail
-    while el.getnext().tag not in ['div', 'h3', 'table']:
-        type['comment'] += lxml.etree.tostring(el.getnext())
-        type['comment_plain'] += el.getnext().text_content();
-        if el.getnext().tail != None:
-            type['comment_plain'] += el.getnext().tail
-        el = el.getnext()
-    if type['comment'] == None:
-        print >> sys.stderr, 'WARNING: No comment in type ' + id
-    type['comment'] = type['comment'].strip()
-    type['comment_plain'] = type['comment_plain'].strip()
+    el = root.cssselect("div[property='rdfs:comment']")[0]
+    type['comment'] = get_inner_html(el)
+    type['comment_plain'] = el.text_content().strip()
     type['instances'] = []
     type['subtypes'] = []
     for section in root.cssselect("h3"):

From 7a636fbc7097e27ff00dc93e7943480a09648e15 Mon Sep 17 00:00:00 2001
From: Dan Scott <dan@coffeecode.net>
Date: Thu, 1 Aug 2013 21:16:34 -0400
Subject: [PATCH 2/2] Properties have URLs too

schema.org recently added URLs that resolve for properties, so let's
track those as well.

Signed-off-by: Dan Scott <dan@coffeecode.net>
---
 scrapers/schema_scraper.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scrapers/schema_scraper.py b/scrapers/schema_scraper.py
index 3811b5d..d7e8d5c 100644
--- a/scrapers/schema_scraper.py
+++ b/scrapers/schema_scraper.py
@@ -89,7 +89,8 @@ def get_type_details(url):
                 'domains': [id],
                 'ranges': re.sub('\s+', ' ', row.cssselect("td.prop-ect")[0].text_content()).strip().split(' or '),
                 'comment': get_inner_html(comment),
-                'comment_plain': comment.text_content()
+                'comment_plain': comment.text_content(),
+                'url': base_url + id
         })
     return type