Skip to content

Commit 6be1d08

Browse files
committed
Fix: make the cleaner also remove javascript URLs that use escaping.
1 parent 1f534e2 commit 6be1d08

File tree

2 files changed

+6
-5
lines changed

2 files changed

+6
-5
lines changed

Diff for: src/lxml/html/clean.py

+3 -2
Original file line number | Diff line number | Diff line change
@@ -8,9 +8,10 @@
88
import copy
99
try:
1010
from urlparse import urlsplit
11+
from urllib import unquote_plus
1112
except ImportError:
1213
# Python 3
13-
from urllib.parse import urlsplit
14+
from urllib.parse import urlsplit, unquote_plus
1415
from lxml import etree
1516
from lxml.html import defs
1617
from lxml.html import fromstring, XHTML_NAMESPACE
@@ -482,7 +483,7 @@ def _kill_elements(self, doc, condition, iterate=None):
482483

483484
def _remove_javascript_link(self, link):
484485
# links like "j a v a s c r i p t:" might be interpreted in IE
485-
new = _substitute_whitespace('', link)
486+
new = _substitute_whitespace('', unquote_plus(link))
486487
if _is_javascript_scheme(new):
487488
# FIXME: should this be None to delete?
488489
return ''

Diff for: src/lxml/html/tests/test_clean.txt

+3 -3
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@
1818
... <body onload="evil_function()">
1919
... <!-- I am interpreted for EVIL! -->
2020
... <a href="javascript:evil_function()">a link</a>
21-
... <a href="j\x01a\x02v\x03a\x04s\x05c\x06r\x07i\x0Ep t:evil_function()">a control char link</a>
21+
... <a href="j\x01a\x02v\x03a\x04s\x05c\x06r\x07i\x0Ep t%20:evil_function()">a control char link</a>
2222
... <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
2323
... <a href="#" onclick="evil_function()">another link</a>
2424
... <p onclick="evil_function()">a paragraph</p>
@@ -51,7 +51,7 @@
5151
<body onload="evil_function()">
5252
<!-- I am interpreted for EVIL! -->
5353
<a href="javascript:evil_function()">a link</a>
54-
<a href="javascrip t:evil_function()">a control char link</a>
54+
<a href="javascrip t%20:evil_function()">a control char link</a>
5555
<a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
5656
<a href="#" onclick="evil_function()">another link</a>
5757
<p onclick="evil_function()">a paragraph</p>
@@ -84,7 +84,7 @@
8484
<body onload="evil_function()">
8585
<!-- I am interpreted for EVIL! -->
8686
<a href="javascript:evil_function()">a link</a>
87-
<a href="javascrip%20t:evil_function()">a control char link</a>
87+
<a href="javascrip%20t%20:evil_function()">a control char link</a>
8888
<a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
8989
<a href="#" onclick="evil_function()">another link</a>
9090
<p onclick="evil_function()">a paragraph</p>

0 commit comments

Comments (0)