From 6be1d081b49c97cfd7b3fbd934a193b668629109 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Sun, 9 Sep 2018 16:44:17 +0200 Subject: [PATCH 1/4] Fix: make the cleaner also remove javascript URLs that use escaping. --- src/lxml/html/clean.py | 5 +++-- src/lxml/html/tests/test_clean.txt | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py index adc3f450e..11da2958e 100644 --- a/src/lxml/html/clean.py +++ b/src/lxml/html/clean.py @@ -8,9 +8,10 @@ import copy try: from urlparse import urlsplit + from urllib import unquote_plus except ImportError: # Python 3 - from urllib.parse import urlsplit + from urllib.parse import urlsplit, unquote_plus from lxml import etree from lxml.html import defs from lxml.html import fromstring, XHTML_NAMESPACE @@ -482,7 +483,7 @@ def _kill_elements(self, doc, condition, iterate=None): def _remove_javascript_link(self, link): # links like "j a v a s c r i p t:" might be interpreted in IE - new = _substitute_whitespace('', link) + new = _substitute_whitespace('', unquote_plus(link)) if _is_javascript_scheme(new): # FIXME: should this be None to delete? return '' diff --git a/src/lxml/html/tests/test_clean.txt b/src/lxml/html/tests/test_clean.txt index c78ab4f13..2824f64ce 100644 --- a/src/lxml/html/tests/test_clean.txt +++ b/src/lxml/html/tests/test_clean.txt @@ -18,7 +18,7 @@ ... ... ... a link -... a control char link +... a control char link ... data ... another link ...

a paragraph

@@ -51,7 +51,7 @@ a link - a control char link + a control char link data another link

a paragraph

@@ -84,7 +84,7 @@ a link - a control char link + a control char link data another link

a paragraph

From 26dfc89c8f6e603487bac4f4476993a70ce695d3 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Sun, 9 Sep 2018 17:00:48 +0200 Subject: [PATCH 2/4] Prepare release of lxml 4.2.5. --- CHANGES.txt | 10 ++++++++++ doc/main.txt | 10 +++++++--- tools/manylinux/build-wheels.sh | 8 ++++++-- version.txt | 2 +- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 06ca52d75..7e2814b6f 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,16 @@ lxml changelog ============== +4.2.5 (2018-09-09) +================== + +Bugs fixed +---------- + +* Javascript URLs that used URL escaping were not removed by the HTML cleaner. + Security problem found by Omar Eissa. + + 4.2.4 (2018-08-03) ================== diff --git a/doc/main.txt b/doc/main.txt index ffc6539c2..0ca560d48 100644 --- a/doc/main.txt +++ b/doc/main.txt @@ -157,8 +157,8 @@ Index `_ (PyPI). It has the source that compiles on various platforms. The source distribution is signed with `this key `_. -The latest version is `lxml 4.2.4`_, released 2018-08-03 -(`changes for 4.2.4`_). `Older versions <#old-versions>`_ +The latest version is `lxml 4.2.5`_, released 2018-09-09 +(`changes for 4.2.5`_). `Older versions <#old-versions>`_ are listed below. Please take a look at the @@ -250,7 +250,9 @@ See the websites of lxml .. and the `latest in-development version `_. -.. _`PDF documentation`: lxmldoc-4.2.4.pdf +.. _`PDF documentation`: lxmldoc-4.2.5.pdf + +* `lxml 4.2.5`_, released 2018-09-09 (`changes for 4.2.5`_) * `lxml 4.2.4`_, released 2018-08-03 (`changes for 4.2.4`_) @@ -272,6 +274,7 @@ See the websites of lxml * `older releases `_ +.. _`lxml 4.2.5`: /files/lxml-4.2.5.tgz .. _`lxml 4.2.4`: /files/lxml-4.2.4.tgz .. _`lxml 4.2.3`: /files/lxml-4.2.3.tgz .. _`lxml 4.2.2`: /files/lxml-4.2.2.tgz @@ -282,6 +285,7 @@ See the websites of lxml .. _`lxml 4.0.0`: /files/lxml-4.0.0.tgz .. _`lxml 3.8.0`: /files/lxml-3.8.0.tgz +.. _`changes for 4.2.5`: /changes-4.2.5.html .. _`changes for 4.2.4`: /changes-4.2.4.html .. _`changes for 4.2.3`: /changes-4.2.3.html .. _`changes for 4.2.2`: /changes-4.2.2.html diff --git a/tools/manylinux/build-wheels.sh b/tools/manylinux/build-wheels.sh index da748fbc4..531091e65 100755 --- a/tools/manylinux/build-wheels.sh +++ b/tools/manylinux/build-wheels.sh @@ -24,12 +24,16 @@ build_wheel() { -w /io/$WHEELHOUSE } -assert_importable() { +run_tests() { # Install packages and test for PYBIN in /opt/python/*/bin/; do ${PYBIN}/pip install $PACKAGE --no-index -f /io/$WHEELHOUSE + # check import as a quick test (cd $HOME; ${PYBIN}/python -c 'import lxml.etree, lxml.objectify') + + # run tests + (cd $HOME; ${PYBIN}/python /io/test.py) done } @@ -76,5 +80,5 @@ show_wheels() { prepare_system build_wheels repair_wheels -assert_importable +run_tests show_wheels diff --git a/version.txt b/version.txt index cf78d5b6a..df0228dfa 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -4.2.4 +4.2.5 From 171eaaa30a0ac0f572c932ed04d5029af53b6bd1 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Sun, 26 Aug 2018 08:59:30 +0200 Subject: [PATCH 3/4] Fix typo in test file. --- src/lxml/html/tests/test_html5parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lxml/html/tests/test_html5parser.py b/src/lxml/html/tests/test_html5parser.py index 6a4eba577..241517ea3 100644 --- a/src/lxml/html/tests/test_html5parser.py +++ b/src/lxml/html/tests/test_html5parser.py @@ -328,7 +328,7 @@ def make_temp_file(self, contents=''): try: tmpfile.close() finally: - os.unlink(tempfile.name) + os.unlink(tmpfile.name) raise def test_with_file_object(self): From 1dee355e83b1f524de7a772a8da941a186036bc2 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Sun, 9 Sep 2018 17:16:33 +0200 Subject: [PATCH 4/4] Py3 syntax fix in helper script. --- doc/rest2html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/rest2html.py b/doc/rest2html.py index a645062bf..6438df32e 100755 --- a/doc/rest2html.py +++ b/doc/rest2html.py @@ -38,7 +38,7 @@ def pygments_directive(name, arguments, options, content, lineno, content_offset, block_text, state, state_machine): try: lexer = get_lexer_by_name(arguments[0]) - except ValueError, e: + except ValueError: # no lexer found - use the text one instead of an exception lexer = TextLexer() # take an arbitrary option if more than one is given