diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b75d2e66..a9f6de6e 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -10,7 +10,7 @@ jobs: matrix: python-version: [3.8] os: [ubuntu-latest] - mode: [lint, vendorverify, docs] + mode: [lint, vendorverify, docs, format-check, type-check] steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ee4ed870..baea09bd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -8,7 +8,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.5, 3.6, 3.7, 3.8, pypy3] + python-version: [3.6, 3.7, 3.8, pypy3] os: [ubuntu-18.04, ubuntu-16.04, macos-latest, windows-latest] steps: diff --git a/CHANGES b/CHANGES index 770918a2..8d1e7ab8 100644 --- a/CHANGES +++ b/CHANGES @@ -1,6 +1,23 @@ Bleach changes ============== +Version Next (Unreleased, 2021) +------------------------------- + +**Backwards incompatible changes** + +* Clean and linkify preserve the order of HTML attributes #566. Thank you @askoretskiy. +* Drop support for Python <3.6 #520. +* Add type annotations #477. + +**Security fixes** + +None + +**Features** + +None + Version 3.3.1 (July 14th, 2021) ------------------------------- @@ -55,6 +72,7 @@ None * fix clean and linkify raising ValueErrors for certain inputs. Thank you @Google-Autofuzz. + Version 3.2.2 (January 20th, 2021) ---------------------------------- diff --git a/CONTRIBUTORS b/CONTRIBUTORS index f3b90f97..88bf04bd 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -29,6 +29,7 @@ Contributors: - Antoine Leclair - Anton Backer - Anton Kovalyov +- askoretskiy - Benjamin Peterson - Chad Birch - Chris Beaven diff --git a/SECURITY.md b/SECURITY.md index 751dfdbb..83299538 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -7,8 +7,8 @@ currently being supported with security updates. 
| Version | Supported | | ------- | ------------------ | -| 3.3.x | :white_check_mark: | -| < 3.2 | :x: | +| 4.0.x | :white_check_mark: | +| < 4 | :x: | ## Reporting a Vulnerability diff --git a/bleach/__init__.py b/bleach/__init__.py index d096ad51..c2fe89e0 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals - import packaging.version from bleach.linkifier import ( @@ -18,9 +16,9 @@ # yyyymmdd -__releasedate__ = "20210714" +__releasedate__ = "20210803" # x.y.z or x.y.z.dev0 -- semver -__version__ = "3.3.1" +__version__ = "4.0.0" VERSION = packaging.version.Version(__version__) diff --git a/bleach/callbacks.py b/bleach/callbacks.py index 6ef4c259..43c74774 100644 --- a/bleach/callbacks.py +++ b/bleach/callbacks.py @@ -1,8 +1,15 @@ """A set of basic callbacks for bleach.linkify.""" -from __future__ import unicode_literals +from typing import Dict, Optional, Tuple -def nofollow(attrs, new=False): +def nofollow( + attrs: Dict[Tuple[Optional[str], str], str], new: Optional[bool] = False +) -> Dict[Tuple[Optional[str], str], str]: + """ + + ``attrs`` is a map of ``(namespace, name)`` -> ``value`` + + """ href_key = (None, "href") if href_key not in attrs: @@ -20,7 +27,9 @@ def nofollow(attrs, new=False): return attrs -def target_blank(attrs, new=False): +def target_blank( + attrs: Dict[Tuple[Optional[str], str], str], new: Optional[bool] = False +) -> Dict[Tuple[Optional[str], str], str]: href_key = (None, "href") if href_key not in attrs: diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index b886ca50..3c9c3306 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -4,14 +4,10 @@ html5lib library without having to change a lot of code.
""" -from __future__ import unicode_literals - import re import string import warnings -import six - # ignore html5lib deprecation warnings to use bleach; we are bleach # apply before we import submodules that import html5lib warnings.filterwarnings( @@ -194,7 +190,7 @@ ] -class InputStreamWithMemory(object): +class InputStreamWithMemory: """Wraps an HTMLInputStream to remember characters since last < This wraps existing HTMLInputStream classes to keep track of the stream @@ -245,7 +241,7 @@ def get_tag(self): is the "tag" that is being tokenized. """ - return six.text_type("").join(self._buffer) + return "".join(self._buffer) def start_tag(self): """Resets stream history to just '<' @@ -473,7 +469,7 @@ def convert_entity(value): code_point = int(int_as_string, base) if 0 < code_point < 0x110000: - return six.unichr(code_point) + return chr(code_point) else: return None diff --git a/bleach/linkifier.py b/bleach/linkifier.py index c7618e85..94f46a8b 100644 --- a/bleach/linkifier.py +++ b/bleach/linkifier.py @@ -1,10 +1,7 @@ -from __future__ import unicode_literals import re -import six from bleach import callbacks as linkify_callbacks from bleach import html5lib_shim -from bleach.utils import alphabetize_attributes, force_unicode #: List of default callbacks @@ -92,7 +89,7 @@ def build_email_re(tlds=TLDS): EMAIL_RE = build_email_re() -class Linker(object): +class Linker: """Convert URL-like strings in an HTML fragment to links This function converts strings that look like URLs, domain names and email @@ -157,7 +154,7 @@ def __init__( omit_optional_tags=False, # linkify does not sanitize sanitize=False, - # linkify alphabetizes + # linkify preserves attr order alphabetical_attributes=False, ) @@ -171,11 +168,9 @@ def linkify(self, text): :raises TypeError: if ``text`` is not a text type """ - if not isinstance(text, six.string_types): + if not isinstance(text, str): raise TypeError("argument must be of text type") - text = force_unicode(text) - if not text: return "" 
@@ -320,11 +315,10 @@ def handle_email_addresses(self, src_iter): else: # Add an "a" tag for the new link _text = attrs.pop("_text", "") - attrs = alphabetize_attributes(attrs) new_tokens.extend( [ {"type": "StartTag", "name": "a", "data": attrs}, - {"type": "Characters", "data": force_unicode(_text)}, + {"type": "Characters", "data": str(_text)}, {"type": "EndTag", "name": "a"}, ] ) @@ -443,12 +437,10 @@ def handle_links(self, src_iter): new_tokens.append({"type": "Characters", "data": prefix}) _text = attrs.pop("_text", "") - attrs = alphabetize_attributes(attrs) - new_tokens.extend( [ {"type": "StartTag", "name": "a", "data": attrs}, - {"type": "Characters", "data": force_unicode(_text)}, + {"type": "Characters", "data": str(_text)}, {"type": "EndTag", "name": "a"}, ] ) @@ -497,7 +489,7 @@ def handle_a_tag(self, token_buffer): else: new_text = attrs.pop("_text", "") - a_token["data"] = alphabetize_attributes(attrs) + a_token["data"] = attrs if text == new_text: # The callbacks didn't change the text, so we yield the new "a" @@ -511,7 +503,7 @@ def handle_a_tag(self, token_buffer): # all the tokens between the start and end "a" tags and replace # it with the new text yield a_token - yield {"type": "Characters", "data": force_unicode(new_text)} + yield {"type": "Characters", "data": str(new_text)} yield token_buffer[-1] def __iter__(self): diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index 0f5b7cc5..dbaa90d3 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -1,15 +1,12 @@ -from __future__ import unicode_literals - from itertools import chain import re +from typing import List import warnings -import six -from six.moves.urllib.parse import urlparse +from urllib.parse import urlparse from xml.sax.saxutils import unescape from bleach import html5lib_shim -from bleach.utils import alphabetize_attributes, force_unicode #: List of allowed tags @@ -37,7 +34,7 @@ } #: List of allowed styles -ALLOWED_STYLES = [] +ALLOWED_STYLES: List[str] = [] #: List 
of allowed protocols ALLOWED_PROTOCOLS = ["http", "https", "mailto"] @@ -55,7 +52,7 @@ INVISIBLE_REPLACEMENT_CHAR = "?" -class Cleaner(object): +class Cleaner: """Cleaner for cleaning HTML fragments of malicious content This cleaner is a security-focused function whose sole purpose is to remove @@ -146,7 +143,7 @@ def __init__( resolve_entities=False, # Bleach has its own sanitizer, so don't use the html5lib one sanitize=False, - # Bleach sanitizer alphabetizes already, so don't use the html5lib one + # clean preserves attr order alphabetical_attributes=False, ) @@ -160,7 +157,7 @@ def clean(self, text): :raises TypeError: if ``text`` is not a text type """ - if not isinstance(text, six.string_types): + if not isinstance(text, str): message = ( "argument cannot be of '{name}' type, must be of text type".format( name=text.__class__.__name__ @@ -171,8 +168,6 @@ def clean(self, text): if not text: return "" - text = force_unicode(text) - dom = self.parser.parseFragment(text) filtered = BleachSanitizerFilter( source=self.walker(dom), @@ -363,10 +358,6 @@ def sanitize_token(self, token): return None else: - if "data" in token: - # Alphabetize the attributes before calling .disallowed_token() - # so that the resulting string is stable - token["data"] = alphabetize_attributes(token["data"]) return self.disallowed_token(token) elif token_type == "Comment": @@ -557,7 +548,7 @@ def allow_token(self, token): # At this point, we want to keep the attribute, so add it in attrs[namespaced_name] = val - token["data"] = alphabetize_attributes(attrs) + token["data"] = attrs return token diff --git a/bleach/utils.py b/bleach/utils.py deleted file mode 100644 index ad780d52..00000000 --- a/bleach/utils.py +++ /dev/null @@ -1,42 +0,0 @@ -from collections import OrderedDict - -import six - - -def _attr_key(attr): - """Returns appropriate key for sorting attribute names - - Attribute names are a tuple of ``(namespace, name)`` where namespace can be - ``None`` or a string. 
These can't be compared in Python 3, so we conver the - ``None`` to an empty string. - - """ - key = (attr[0][0] or ""), attr[0][1] - return key - - -def alphabetize_attributes(attrs): - """Takes a dict of attributes (or None) and returns them alphabetized""" - if not attrs: - return attrs - - return OrderedDict([(k, v) for k, v in sorted(attrs.items(), key=_attr_key)]) - - -def force_unicode(text): - """Takes a text (Python 2: str/unicode; Python 3: unicode) and converts to unicode - - :arg str/unicode text: the text in question - - :returns: text as unicode - - :raises UnicodeDecodeError: if the text was a Python 2 str and isn't in - utf-8 - - """ - # If it's already unicode, then return it - if isinstance(text, six.text_type): - return text - - # If not, convert it - return six.text_type(text, "utf-8", "strict") diff --git a/docs/clean.rst b/docs/clean.rst index 9279765a..84cab3d5 100644 --- a/docs/clean.rst +++ b/docs/clean.rst @@ -173,7 +173,7 @@ attributes for specified tags: .. doctest:: - >>> from six.moves.urllib.parse import urlparse + >>> from urllib.parse import urlparse >>> import bleach >>> def allow_src(tag, name, value): @@ -371,7 +371,7 @@ Trivial Filter example: >>> cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter]) >>> dirty = 'this is cute! ' >>> cleaner.clean(dirty) - 'this is cute! ' + 'this is cute! ' .. Warning:: diff --git a/docs/linkify.rst b/docs/linkify.rst index 6d6a63ac..1804f794 100644 --- a/docs/linkify.rst +++ b/docs/linkify.rst @@ -109,7 +109,7 @@ an external link: .. doctest:: - >>> from six.moves.urllib.parse import urlparse + >>> from urllib.parse import urlparse >>> from bleach.linkifier import Linker >>> def set_target(attrs, new=False): @@ -123,7 +123,7 @@ an external link: ... 
>>> linker = Linker(callbacks=[set_target]) >>> linker.linkify('abc http://example.com def') - 'abc http://example.com def' + 'abc http://example.com def' Removing Attributes @@ -204,7 +204,7 @@ Example of switching all links to go through a bouncer first: .. doctest:: - >>> from six.moves.urllib.parse import quote, urlparse + >>> from urllib.parse import quote, urlparse >>> from bleach.linkifier import Linker >>> def outgoing_bouncer(attrs, new=False): diff --git a/requirements-dev.txt b/requirements-dev.txt index 73e89bdc..9a0b33e6 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -17,3 +17,9 @@ wheel # Requirements for updating the vendored html5lib hashin + +# Requirements for format-check (most recently black==20.8b1) +black; implementation_name == "cpython" + +# Requirements for type-check (most recently mypy==0.790 and mypy-extensions==0.4.3) +mypy; implementation_name == "cpython" diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index 8210f2f8..06348d21 100755 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -22,6 +22,8 @@ case "${MODE}" in black bleach/*.py tests/ tests_website/ ;; format-check) black --check --diff bleach/*.py tests/ tests_website/ ;; + type-check) + mypy bleach ;; # find config options in the mypy sections of setup.cfg *) echo "Unknown mode $MODE."
exit 1 diff --git a/setup.cfg b/setup.cfg index 7875aeb5..5784a311 100644 --- a/setup.cfg +++ b/setup.cfg @@ -12,6 +12,25 @@ ignore = W503 max-line-length = 100 +[mypy] +files = + bleach/, + tests/, + tests_website/, + +python_version = 3.6 +ignore_missing_imports = True +strict = False + +show_error_context = True +show_column_numbers = True +show_error_codes = True +pretty = True +error_summary = True + +[mypy-bleach._vendor.*] +ignore_errors = True + [tool:pytest] addopts = -W error:html5lib:DeprecationWarning diff --git a/setup.py b/setup.py index e53002fa..cf0f8525 100755 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ def get_version(): include_package_data=True, package_data={'': ['README.rst']}, zip_safe=False, - python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*', + python_requires='>=3.6', install_requires=install_requires, classifiers=[ 'Development Status :: 5 - Production/Stable', @@ -54,10 +54,7 @@ def get_version(): 'License :: OSI Approved :: Apache Software License', 'Operating System :: OS Independent', 'Programming Language :: Python', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3 :: Only', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', diff --git a/tests/test_callbacks.py b/tests/test_callbacks.py index 121d14bf..69441e75 100644 --- a/tests/test_callbacks.py +++ b/tests/test_callbacks.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from bleach.callbacks import nofollow, target_blank diff --git a/tests/test_clean.py b/tests/test_clean.py index 0b7570c7..606b4ea8 100644 --- a/tests/test_clean.py +++ b/tests/test_clean.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import os import pytest @@ -574,7 +572,7 @@ def test_svg_attr_val_allows_ref(): [ ( '', - '', + '', ), ( 
'', @@ -1065,6 +1063,13 @@ def test_regressions(test_case): assert clean(test_data) == expected +def test_preserves_attributes_order(): + html = """Link""" + cleaned_html = clean(html, tags=["a"], attributes={"a": ["href", "target"]}) + + assert cleaned_html == html + + class TestCleaner: def test_basics(self): TAGS = ["span", "br"] @@ -1094,4 +1099,4 @@ def __iter__(self): cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter]) dirty = 'this is cute! ' - assert cleaner.clean(dirty) == 'this is cute! ' + assert cleaner.clean(dirty) == 'this is cute! ' diff --git a/tests/test_css.py b/tests/test_css.py index 43bdc4b6..50b47701 100644 --- a/tests/test_css.py +++ b/tests/test_css.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from functools import partial from timeit import timeit diff --git a/tests/test_html5lib_shim.py b/tests/test_html5lib_shim.py index 5a836bcd..22de03bf 100644 --- a/tests/test_html5lib_shim.py +++ b/tests/test_html5lib_shim.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import pytest from bleach import html5lib_shim diff --git a/tests/test_linkify.py b/tests/test_linkify.py index b5704a1e..a215da51 100644 --- a/tests/test_linkify.py +++ b/tests/test_linkify.py @@ -1,9 +1,7 @@ -from __future__ import unicode_literals - import re import pytest -from six.moves.urllib_parse import quote_plus +from urllib.parse import quote_plus from bleach import linkify, DEFAULT_CALLBACKS as DC from bleach.linkifier import Linker, LinkifyFilter diff --git a/tests/test_unicode.py b/tests/test_unicode.py index 50538039..db3545e1 100644 --- a/tests/test_unicode.py +++ b/tests/test_unicode.py @@ -1,6 +1,4 @@ # -*- coding: utf-8 -*- -from __future__ import unicode_literals - import pytest from bleach import clean, linkify diff --git a/tests/test_utils.py b/tests/test_utils.py deleted file mode 100644 index 76d1c71f..00000000 --- a/tests/test_utils.py +++ /dev/null @@ -1,23 +0,0 @@ -from collections import OrderedDict - -from 
bleach.utils import alphabetize_attributes - - -class TestAlphabeticalAttributes: - def test_empty_cases(self): - assert alphabetize_attributes(None) is None - - assert alphabetize_attributes({}) == {} - - def test_ordering(self): - assert alphabetize_attributes({(None, "a"): 1, (None, "b"): 2}) == OrderedDict( - [((None, "a"), 1), ((None, "b"), 2)] - ) - assert alphabetize_attributes({(None, "b"): 1, (None, "a"): 2}) == OrderedDict( - [((None, "a"), 2), ((None, "b"), 1)] - ) - - def test_different_namespaces(self): - assert alphabetize_attributes( - {("xlink", "href"): "abc", (None, "alt"): "123"} - ) == OrderedDict([((None, "alt"), "123"), (("xlink", "href"), "abc")]) diff --git a/tests_website/index.html b/tests_website/index.html index b4a92f9d..95df20f0 100644 --- a/tests_website/index.html +++ b/tests_website/index.html @@ -2,7 +2,7 @@ - Python Bleach 3.3.0 + Python Bleach 4.0.0 -

Python Bleach 3.3.0

+

Python Bleach 4.0.0

pypi version Build Status diff --git a/tests_website/open_test_page.py b/tests_website/open_test_page.py index d1d8c127..4a47be8b 100755 --- a/tests_website/open_test_page.py +++ b/tests_website/open_test_page.py @@ -1,9 +1,10 @@ #!/usr/bin/env python +from typing import Set import webbrowser -TEST_BROWSERS = { +TEST_BROWSERS: Set[str] = { # 'mozilla', "firefox", # 'netscape', @@ -28,9 +29,12 @@ # 'chromium', # 'chromium-browser', } -REGISTERED_BROWSERS = set(webbrowser._browsers.keys()) if __name__ == "__main__": - for b in TEST_BROWSERS & REGISTERED_BROWSERS: - webbrowser.get(b).open_new_tab("http://localhost:8080") + for browser_name in TEST_BROWSERS: + try: + browser = webbrowser.get(browser_name) + browser.open_new_tab("http://localhost:8080") + except Exception as error: + print("error getting test browser %s: %s" % (browser_name, error)) diff --git a/tests_website/server.py b/tests_website/server.py index 834729f9..2d25ea25 100755 --- a/tests_website/server.py +++ b/tests_website/server.py @@ -10,7 +10,8 @@ """ -import six +import http.server +import socketserver import bleach @@ -18,17 +19,17 @@ PORT = 8080 -class BleachCleanHandler(six.moves.SimpleHTTPServer.SimpleHTTPRequestHandler): +class BleachCleanHandler(http.server.SimpleHTTPRequestHandler): + + # Prevent 'cannot bind to address' errors on restart + allow_reuse_address = True + def do_POST(self): - if six.PY2: - content_len = int(self.headers.getheader("content-length", 0)) - else: - content_len = int(self.headers.get("content-length", 0)) + content_len = int(self.headers.get("content-length", 0)) body = self.rfile.read(content_len) print("read %s bytes: %s" % (content_len, body)) - if six.PY3: - body = body.decode("utf-8") + body = body.decode("utf-8") print("input: %r" % body) cleaned = bleach.clean(body) @@ -37,16 +38,12 @@ def do_POST(self): self.send_header("Content-Type", "text/plain;charset=UTF-8") self.end_headers() - if six.PY3: - cleaned = bytes(cleaned, encoding="utf-8") + cleaned = 
bytes(cleaned, encoding="utf-8") print("cleaned: %r" % cleaned) self.wfile.write(cleaned) if __name__ == "__main__": - # Prevent 'cannot bind to address' errors on restart - six.moves.socketserver.TCPServer.allow_reuse_address = True - - httpd = six.moves.socketserver.TCPServer(("127.0.0.1", PORT), BleachCleanHandler) + httpd = socketserver.TCPServer(("127.0.0.1", PORT), BleachCleanHandler) print("listening on localhost port %d" % PORT) httpd.serve_forever() diff --git a/tox.ini b/tox.ini index 08f119ad..d047904d 100644 --- a/tox.ini +++ b/tox.ini @@ -2,8 +2,8 @@ [tox] envlist = - py{27,35,36,37,38,py,py3} - py{27,35,36,37,38}-build-no-lang + py{36,37,38,py3} + py{36,37,38}-build-no-lang docs format-check lint @@ -16,18 +16,6 @@ commands = pytest {posargs:-v} python setup.py build -[testenv:py27-build-no-lang] -setenv = - LANG= -commands = - python setup.py build - -[testenv:py35-build-no-lang] -setenv = - LANG= -commands = - python setup.py build - [testenv:py36-build-no-lang] setenv = LANG= @@ -61,10 +49,17 @@ basepython = python3.8 changedir = scripts deps = -rrequirements-dev.txt - black commands = ./run_tests.sh format-check +[testenv:type-check] +basepython = python3.8 +changedir = scripts +deps = + -rrequirements-dev.txt +commands = + ./run_tests.sh type-check + [testenv:docs] basepython = python3.8 changedir = docs