diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py index adc3f450e..11da2958e 100644 --- a/src/lxml/html/clean.py +++ b/src/lxml/html/clean.py @@ -8,9 +8,10 @@ import copy try: from urlparse import urlsplit + from urllib import unquote_plus except ImportError: # Python 3 - from urllib.parse import urlsplit + from urllib.parse import urlsplit, unquote_plus from lxml import etree from lxml.html import defs from lxml.html import fromstring, XHTML_NAMESPACE @@ -482,7 +483,7 @@ def _kill_elements(self, doc, condition, iterate=None): def _remove_javascript_link(self, link): # links like "j a v a s c r i p t:" might be interpreted in IE - new = _substitute_whitespace('', link) + new = _substitute_whitespace('', unquote_plus(link)) if _is_javascript_scheme(new): # FIXME: should this be None to delete? return '' diff --git a/src/lxml/html/tests/test_clean.txt b/src/lxml/html/tests/test_clean.txt index c78ab4f13..2824f64ce 100644 --- a/src/lxml/html/tests/test_clean.txt +++ b/src/lxml/html/tests/test_clean.txt @@ -18,7 +18,7 @@ ... ... ... a link -... a control char link +... a control char link ... data ... another link ...

a paragraph

@@ -51,7 +51,7 @@ a link - a control char link + a control char link data another link

a paragraph

@@ -84,7 +84,7 @@ a link - a control char link + a control char link data another link

a paragraph