Merge pull request #708 from okfn/708-markdown-fixes

Markdown fixes
ckan · Apr 17, 2013 · cb9849f · cb9849f
2 parents 250233e + 85a3d3e
commit cb9849f
Show file tree

Hide file tree

Showing 16 changed files with 138 additions and 265 deletions.
diff --git a/ckan/controllers/api.py b/ckan/controllers/api.py
@@ -571,7 +571,7 @@ def _get_search_params(cls, request_params):
 
     def markdown(self, ver=None):
         raw_markdown = request.params.get('q', '')
-        results = ckan.misc.MarkdownFormat().to_html(raw_markdown)
+        results = h.render_markdown(raw_markdown)
 
         return self._finish_ok(results)
 

diff --git a/ckan/controllers/group.py b/ckan/controllers/group.py
@@ -201,14 +201,7 @@ def _read(self, id, limit):
         else:
             q += ' groups:"%s"' % c.group_dict.get('name')
 
-        try:
-            description_formatted = ckan.misc.MarkdownFormat().to_html(
-                c.group_dict.get('description', ''))
-            c.description_formatted = genshi.HTML(description_formatted)
-        except Exception, e:
-            error_msg = "<span class='inline-warning'>%s</span>" %\
-                        _("Cannot render description")
-            c.description_formatted = genshi.HTML(error_msg)
+        c.description_formatted = h.render_markdown(c.group_dict.get('description'))
 
         context['return_query'] = True
 

diff --git a/ckan/controllers/user.py b/ckan/controllers/user.py
@@ -2,11 +2,9 @@
 from urllib import quote
 
 from pylons import config
-import genshi
 
 import ckan.lib.i18n as i18n
 import ckan.lib.base as base
-import ckan.misc as misc
 import ckan.model as model
 import ckan.lib.helpers as h
 import ckan.new_authz as new_authz
@@ -73,7 +71,7 @@ def _setup_template_variables(self, context, data_dict):
             abort(401, _('Not authorized to see this page'))
         c.user_dict = user_dict
         c.is_myself = user_dict['name'] == c.user
-        c.about_formatted = self._format_about(user_dict['about'])
+        c.about_formatted = h.render_markdown(user_dict['about'])
 
     ## end hooks
 
@@ -622,13 +620,3 @@ def unfollow(self, id):
                              or e.error_dict)
             h.flash_error(error_message)
         h.redirect_to(controller='user', action='read', id=id)
-
-    def _format_about(self, about):
-        about_formatted = misc.MarkdownFormat().to_html(about)
-        try:
-            html = genshi.HTML(about_formatted)
-        except genshi.ParseError, e:
-            log.error('Could not print "about" field Field: %r Error: %r',
-                      about, e)
-            html = _('Error: Could not parse About text')
-        return html
diff --git a/ckan/lib/create_test_data.py b/ckan/lib/create_test_data.py
@@ -464,7 +464,7 @@ def create(cls, auth_profile="", package_type=None):
         model.Session.add_all([
             model.User(name=u'tester', apikey=u'tester', password=u'tester'),
             model.User(name=u'joeadmin', password=u'joeadmin'),
-            model.User(name=u'annafan', about=u'I love reading Annakarenina. My site: <a href="http://anna.com">anna.com</a>', password=u'annafan'),
+            model.User(name=u'annafan', about=u'I love reading Annakarenina. My site: http://anna.com', password=u'annafan'),
             model.User(name=u'russianfan', password=u'russianfan'),
             sysadmin,
             ])

diff --git a/ckan/lib/dictization/model_dictize.py b/ckan/lib/dictization/model_dictize.py
@@ -4,7 +4,6 @@
 from pylons import config
 from sqlalchemy.sql import select
 
-import ckan.misc as misc
 import ckan.logic as logic
 import ckan.plugins as plugins
 import ckan.lib.helpers as h
@@ -530,7 +529,7 @@ def package_to_api(pkg, context):
     dictized['license'] = pkg.license.title if pkg.license else None
     dictized['ratings_average'] = pkg.get_average_rating()
     dictized['ratings_count'] = len(pkg.ratings)
-    dictized['notes_rendered'] = misc.MarkdownFormat().to_html(pkg.notes)
+    dictized['notes_rendered'] = h.render_markdown(pkg.notes)
 
     site_url = config.get('ckan.site_url', None)
     if site_url:

diff --git a/ckan/lib/helpers.py b/ckan/lib/helpers.py
@@ -636,7 +636,7 @@ def markdown_extract(text, extract_length=190):
     will not be truncated.'''
     if (text is None) or (text.strip() == ''):
         return ''
-    plain = re.sub(r'<.*?>', '', markdown(text))
+    plain = RE_MD_HTML_TAGS.sub('', markdown(text))
     if not extract_length or len(plain) < extract_length:
         return literal(plain)
     return literal(unicode(truncate(plain, length=extract_length, indicator='...', whole_word=True)))
@@ -956,7 +956,7 @@ def related_item_link(related_item_dict):
 
 def tag_link(tag):
     url = url_for(controller='tag', action='read', id=tag['name'])
-    return link_to(tag['name'], url)
+    return link_to(tag.get('title', tag['name']), url)
 
 
 def group_link(group):
@@ -1367,12 +1367,77 @@ def get_request_param(parameter_name, default=None):
     return request.params.get(parameter_name, default)
 
 
-def render_markdown(data):
+# Match the inner text of HTML elements, e.g. `moo` in `<b>moo</b>`, but skip
+# text that directly follows an opening <a> tag: a URL there is already a
+# link and must not be linkified a second time.
+RE_MD_GET_INNER_HTML = re.compile(
+    r'(^|(?:<(?!a\b)[^>]*>))([^<]+)(?=<|$)',
+    flags=re.UNICODE
+)
+
+# find all `internal links` eg. tag:moo, dataset:1234, tag:"my tag"
+RE_MD_INTERNAL_LINK = re.compile(
+    r'\b(tag|package|dataset|group):((")?(?(3)[ \w\-.]+|[\w\-.]+)(?(3)"))',
+    flags=re.UNICODE
+)
+
+# find external links eg http://foo.com, https://bar.org/foobar.html
+RE_MD_EXTERNAL_LINK = re.compile(
+    r'(\bhttps?:\/\/[\w\-\.,@?^=%&;:\/~\\+#]*)',
+    flags=re.UNICODE
+)
+
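+# Match HTML tags: a `<`, then any characters other than `<` or `>`, then a
+# `>`. A `<` that has no closing `>` is left alone, so it can still be used
+# as a literal character in markdown text.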
+RE_MD_HTML_TAGS = re.compile('<[^><]*>')
+
+
+def html_auto_link(data):
+    '''Linkifies HTML
+
+    tag:... converted to a tag link
+    dataset:... converted to a dataset link
+    group:... converted to a group link
+    http://... converted to a link
+    '''
+
+    LINK_FNS = {
+        'tag': tag_link,
+        'group': group_link,
+        'dataset': dataset_link,
+        'package': dataset_link,
+    }
+
+    def makelink(matchobj):
+        obj = matchobj.group(1)
+        name = matchobj.group(2)
+        title = '%s:%s' % (obj, name)
+        return LINK_FNS[obj]({'name': name.strip('"'), 'title': title})
+
+    def link(matchobj):
+        return '<a href="%s" target="_blank" rel="nofollow">%s</a>' \
+            % (matchobj.group(1), matchobj.group(1))
+
+    def process(matchobj):
+        data = matchobj.group(2)
+        data = RE_MD_INTERNAL_LINK.sub(makelink, data)
+        data = RE_MD_EXTERNAL_LINK.sub(link, data)
+        return matchobj.group(1) + data
+
+    data = RE_MD_GET_INNER_HTML.sub(process, data)
+    return data
+
+
+def render_markdown(data, auto_link=True):
     ''' returns the data as rendered markdown '''
-    # cope with data == None
     if not data:
         return ''
-    return literal(ckan.misc.MarkdownFormat().to_html(data))
+    data = RE_MD_HTML_TAGS.sub('', data.strip())
+    data = markdown(data, safe_mode=True)
+    # references such as tag:..., tag:"...", dataset:..., group:... and bare
+    # http(s) URLs in the rendered HTML are turned into links
+    if auto_link:
+        data = html_auto_link(data)
+    return literal(data)
 
 
 def format_resource_items(items):

diff --git a/ckan/lib/package_saver.py b/ckan/lib/package_saver.py
@@ -1,4 +1,3 @@
-import genshi
 from sqlalchemy import orm
 import ckan.lib.helpers as h
 from ckan.lib.base import *
@@ -22,12 +21,8 @@ def render_package(cls, pkg, context):
         render. 
         Note that the actual calling of render('package/read') is left
         to the caller.'''
-        try:
-            notes_formatted = ckan.misc.MarkdownFormat().to_html(pkg.get('notes',''))
-            c.pkg_notes_formatted = genshi.HTML(notes_formatted)
-        except Exception, e:
-            error_msg = "<span class='inline-warning'>%s</span>" % _("Cannot render package description")
-            c.pkg_notes_formatted = genshi.HTML(error_msg)
+        c.pkg_notes_formatted = h.render_markdown(pkg.get('notes'))
+
         c.current_rating, c.num_ratings = ckan.rating.get_rating(context['package'])
         url = pkg.get('url', '')
         c.pkg_url_link = h.link_to(url, url, rel='foaf:homepage', target='_blank') \

diff --git a/ckan/misc.py b/ckan/misc.py
diff --git a/ckan/model/package.py b/ckan/model/package.py
@@ -17,7 +17,6 @@
 import activity
 import extension
 
-import ckan.misc
 import ckan.lib.dictization
 
 __all__ = ['Package', 'package_table', 'package_revision_table',
@@ -216,7 +215,8 @@ def as_dict(self, ref_package_by='name', ref_group_by='name'):
             if self.metadata_modified else None
         _dict['metadata_created'] = self.metadata_created.isoformat() \
             if self.metadata_created else None
-        _dict['notes_rendered'] = ckan.misc.MarkdownFormat().to_html(self.notes)
+        import ckan.lib.helpers as h
+        _dict['notes_rendered'] = h.render_markdown(self.notes)
         _dict['type'] = self.type or u'dataset'
         #tracking
         import ckan.model as model

diff --git a/ckan/templates_legacy/package/read_core.html b/ckan/templates_legacy/package/read_core.html
@@ -10,7 +10,7 @@
     <div id="dataset-overview">
 
       <!-- Description -->
-      <div class="notes" py:if="str(c.pkg_notes_formatted).strip()">
+      <div class="notes" py:if="c.pkg_notes_formatted">
         <div id="notes-extract">
           ${c.pkg_notes_formatted}
         </div>

diff --git a/ckan/tests/functional/test_package.py b/ckan/tests/functional/test_package.py
@@ -309,10 +309,6 @@ def test_read(self):
         assert anna.version in res
         assert anna.url in res
         assert 'Some test notes' in res
-        self.check_named_element(res, 'a',
-                                 'http://ckan.net/',
-                                 'target="_blank"',
-                                 'rel="nofollow"')
         assert '<strong>Some bolded text.</strong>' in res
         self.check_tag_and_data(res, 'left arrow', '&lt;')
         self.check_tag_and_data(res, 'umlaut', u'\xfc')
@@ -350,17 +346,17 @@ def test_read_internal_links(self):
         pkg_name = u'link-test',
         CreateTestData.create_arbitrary([
             {'name':pkg_name,
-             'notes':'Decoy link here: decoy:decoy, real links here: package:pkg-1, ' \
+             'notes':'Decoy link here: decoy:decoy, real links here: dataset:pkg-1, ' \
                    'tag:tag_1 group:test-group-1 and a multi-word tag: tag:"multi word with punctuation."',
              }
             ])
         offset = url_for(controller='package', action='read', id=pkg_name)
         res = self.app.get(offset)
         def check_link(res, controller, id):
             id_in_uri = id.strip('"').replace(' ', '%20') # remove quotes and percent-encode spaces
-            self.check_tag_and_data(res, 'a ', '/%s/%s' % (controller, id_in_uri),
-                                    '%s:%s' % (controller, id))
-        check_link(res, 'package', 'pkg-1')
+            self.check_tag_and_data(res, 'a ', '%s/%s' % (controller, id_in_uri),
+                                    '%s:%s' % (controller, id.replace('"', '&#34;')))
+        check_link(res, 'dataset', 'pkg-1')
         check_link(res, 'tag', 'tag_1')
         check_link(res, 'tag', '"multi word with punctuation."')
         check_link(res, 'group', 'test-group-1')
@@ -1557,10 +1553,10 @@ def teardown(self):
 
     def test_markdown_html_whitelist(self):
         self.body = str(self.res)
-        self.assert_fragment('<table width="100%" border="1">')
-        self.assert_fragment('<td rowspan="2"><b>Description</b></td>')
-        self.assert_fragment('<a href="http://www.nber.org/patents/subcategories.txt" target="_blank" rel="nofollow">subcategory.txt</a>')
-        self.assert_fragment('<td colspan="2"><center>--</center></td>')
+        self.fail_if_fragment('<table width="100%" border="1">')
+        self.fail_if_fragment('<td rowspan="2"><b>Description</b></td>')
+        self.fail_if_fragment('<a href="http://www.nber.org/patents/subcategories.txt" target="_blank" rel="nofollow">subcategory.txt</a>')
+        self.fail_if_fragment('<td colspan="2"><center>--</center></td>')
         self.fail_if_fragment('<script>')
 
     def assert_fragment(self, fragment):