4.0.0 branch #575

Closed · wants to merge 15 commits
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
@@ -10,7 +10,7 @@ jobs:
matrix:
python-version: [3.8]
Contributor (review comment):

Suggested change:
-python-version: [3.8]
+python-version: [3.9]

os: [ubuntu-latest]
-mode: [lint, vendorverify, docs]
+mode: [lint, vendorverify, docs, format-check, type-check]

steps:
- uses: actions/checkout@v2
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -8,7 +8,7 @@ jobs:
strategy:
fail-fast: false
matrix:
-python-version: [3.5, 3.6, 3.7, 3.8, pypy3]
+python-version: [3.6, 3.7, 3.8, pypy3]
Contributor (review comment): 3.9 is also available

Suggested change:
-python-version: [3.6, 3.7, 3.8, pypy3]
+python-version: [3.6, 3.7, 3.8, 3.9, pypy3]

Collaborator Author @g-k (Aug 3, 2021): Yes, but tests still fail due to the urlparse changes mentioned in #536.

os: [ubuntu-18.04, ubuntu-16.04, macos-latest, windows-latest]

steps:
18 changes: 18 additions & 0 deletions CHANGES
@@ -1,6 +1,23 @@
Bleach changes
==============

Version Next (Unreleased, 2021)
-------------------------------

**Backwards incompatible changes**

* Clean and linkify preserve the order of HTML attributes #566. Thank you @askoretskiy.
* Drop support for Python <3.6 #520.
* Add type annotations #477.

**Security fixes**

None

**Features**

None

Version 3.3.1 (July 14th, 2021)
-------------------------------

@@ -55,6 +72,7 @@ None

* fix clean and linkify raising ValueErrors for certain inputs. Thank you @Google-Autofuzz.


Version 3.2.2 (January 20th, 2021)
----------------------------------

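A minimal sketch of the attribute-order change noted above (output lines are illustrative; assumes bleach 4.0.0 with the default allowed tags and attributes):

```python
import bleach

# 3.x alphabetized attributes on output; 4.0.0 keeps source order.
html = '<a title="demo" href="http://example.com">link</a>'
print(bleach.clean(html))
# 4.0.0: <a title="demo" href="http://example.com">link</a>
# 3.x:   <a href="http://example.com" title="demo">link</a>
```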
1 change: 1 addition & 0 deletions CONTRIBUTORS
@@ -29,6 +29,7 @@ Contributors:
- Antoine Leclair
- Anton Backer
- Anton Kovalyov
- askoretskiy
- Benjamin Peterson
- Chad Birch
- Chris Beaven
4 changes: 2 additions & 2 deletions SECURITY.md
@@ -7,8 +7,8 @@ currently being supported with security updates.

| Version | Supported |
| ------- | ------------------ |
-| 3.3.x | :white_check_mark: |
-| < 3.2 | :x: |
+| 4.0.x | :white_check_mark: |
+| < 4 | :x: |

## Reporting a Vulnerability

6 changes: 2 additions & 4 deletions bleach/__init__.py
@@ -1,7 +1,5 @@
-# -*- coding: utf-8 -*-

-from __future__ import unicode_literals

import packaging.version

from bleach.linkifier import (
@@ -18,9 +16,9 @@


# yyyymmdd
__releasedate__ = "20210714"
__releasedate__ = "20210803"
# x.y.z or x.y.z.dev0 -- semver
__version__ = "3.3.1"
__version__ = "4.0.0"
VERSION = packaging.version.Version(__version__)


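Since `VERSION` is a `packaging.version.Version` object, downstream code can gate on the new behavior with a semantic comparison instead of string checks. A small sketch:

```python
import packaging.version

import bleach

# Version objects compare semantically: 4.0.0 > 3.3.1 > 3.2.2.
if bleach.VERSION >= packaging.version.Version("4.0.0"):
    print("attribute order preserved; Python < 3.6 unsupported")
```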
15 changes: 12 additions & 3 deletions bleach/callbacks.py
@@ -1,8 +1,15 @@
"""A set of basic callbacks for bleach.linkify."""
-from __future__ import unicode_literals
+from typing import Dict, Optional, Tuple


-def nofollow(attrs, new=False):
+def nofollow(
+    attrs: Dict[Tuple[Optional[str], str], str], new: Optional[bool] = False
+) -> Dict[Tuple[Optional[str], str], str]:
+    """
+
+    map of ``(namespace, name)`` -> ``value``
+
+    """
href_key = (None, "href")

if href_key not in attrs:
@@ -20,7 +27,9 @@ def nofollow(attrs, new=False):
return attrs


-def target_blank(attrs, new=False):
+def target_blank(
+    attrs: Dict[Tuple[Optional[str], str], str], new: Optional[bool] = False
+) -> Dict[Tuple[Optional[str], str], str]:
href_key = (None, "href")

if href_key not in attrs:
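The new annotations make the callback contract explicit: `attrs` maps `(namespace, name)` tuples to string values. A minimal sketch of a custom `bleach.linkify` callback using that shape (`force_https` is a hypothetical helper, not part of bleach):

```python
import bleach

def force_https(attrs, new=False):
    # attrs maps (namespace, name) -> value, per the annotations above.
    href_key = (None, "href")
    href = attrs.get(href_key, "")
    if href.startswith("http://"):
        attrs[href_key] = "https://" + href[len("http://"):]
    return attrs

# Callbacks are applied to each link's attrs dict, in order.
print(bleach.linkify("see http://example.com", callbacks=[force_https]))
```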
10 changes: 3 additions & 7 deletions bleach/html5lib_shim.py
@@ -4,14 +4,10 @@
html5lib library without having to change a lot of code.
"""

-from __future__ import unicode_literals

import re
import string
import warnings

-import six

# ignore html5lib deprecation warnings to use bleach; we are bleach
# apply before we import submodules that import html5lib
warnings.filterwarnings(
@@ -194,7 +190,7 @@
]


-class InputStreamWithMemory(object):
+class InputStreamWithMemory:
"""Wraps an HTMLInputStream to remember characters since last <

This wraps existing HTMLInputStream classes to keep track of the stream
@@ -245,7 +241,7 @@ def get_tag(self):
is the "tag" that is being tokenized.

"""
-return six.text_type("").join(self._buffer)
+return "".join(self._buffer)

def start_tag(self):
"""Resets stream history to just '<'
@@ -473,7 +469,7 @@ def convert_entity(value):

code_point = int(int_as_string, base)
if 0 < code_point < 0x110000:
-return six.unichr(code_point)
+return chr(code_point)
else:
return None

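On Python 3 the built-in `chr` accepts the full Unicode range that `six.unichr` provided, so the swap is behavior-preserving. A sketch of the guard pattern shown in `convert_entity` (a standalone illustration, not the shim's exact code):

```python
def codepoint_to_char(int_as_string, base):
    # Valid Unicode scalar values are 1..0x10FFFF; everything else -> None,
    # mirroring the 0 < code_point < 0x110000 guard above.
    code_point = int(int_as_string, base)
    if 0 < code_point < 0x110000:
        return chr(code_point)
    return None

assert codepoint_to_char("1F600", 16) == "\U0001F600"  # &#x1F600;
assert codepoint_to_char("110000", 16) is None  # out of range
```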
22 changes: 7 additions & 15 deletions bleach/linkifier.py
@@ -1,10 +1,7 @@
-from __future__ import unicode_literals
import re
-import six

from bleach import callbacks as linkify_callbacks
from bleach import html5lib_shim
-from bleach.utils import alphabetize_attributes, force_unicode


#: List of default callbacks
@@ -92,7 +89,7 @@ def build_email_re(tlds=TLDS):
EMAIL_RE = build_email_re()


-class Linker(object):
+class Linker:
"""Convert URL-like strings in an HTML fragment to links

This function converts strings that look like URLs, domain names and email
@@ -157,7 +154,7 @@ def __init__(
omit_optional_tags=False,
# linkify does not sanitize
sanitize=False,
-# linkify alphabetizes
+# linkify preserves attr order
alphabetical_attributes=False,
)

@@ -171,11 +168,9 @@ def linkify(self, text):
:raises TypeError: if ``text`` is not a text type

"""
-if not isinstance(text, six.string_types):
+if not isinstance(text, str):
raise TypeError("argument must be of text type")

-text = force_unicode(text)

if not text:
return ""

@@ -320,11 +315,10 @@ def handle_email_addresses(self, src_iter):
else:
# Add an "a" tag for the new link
_text = attrs.pop("_text", "")
-attrs = alphabetize_attributes(attrs)
new_tokens.extend(
[
{"type": "StartTag", "name": "a", "data": attrs},
{"type": "Characters", "data": force_unicode(_text)},
{"type": "Characters", "data": str(_text)},
{"type": "EndTag", "name": "a"},
]
)
@@ -443,12 +437,10 @@ def handle_links(self, src_iter):
new_tokens.append({"type": "Characters", "data": prefix})

_text = attrs.pop("_text", "")
-attrs = alphabetize_attributes(attrs)

new_tokens.extend(
[
{"type": "StartTag", "name": "a", "data": attrs},
{"type": "Characters", "data": force_unicode(_text)},
{"type": "Characters", "data": str(_text)},
{"type": "EndTag", "name": "a"},
]
)
@@ -497,7 +489,7 @@ def handle_a_tag(self, token_buffer):

else:
new_text = attrs.pop("_text", "")
a_token["data"] = alphabetize_attributes(attrs)
a_token["data"] = attrs

if text == new_text:
# The callbacks didn't change the text, so we yield the new "a"
@@ -511,7 +503,7 @@
# all the tokens between the start and end "a" tags and replace
# it with the new text
yield a_token
yield {"type": "Characters", "data": force_unicode(new_text)}
yield {"type": "Characters", "data": str(new_text)}
yield token_buffer[-1]

def __iter__(self):
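With `six` removed, `Linker.linkify` accepts exactly `str`; bytes now fail the plain `isinstance` check. A quick usage sketch (the rendered link is illustrative output with the default callbacks):

```python
from bleach.linkifier import Linker

linker = Linker()
print(linker.linkify("visit example.com"))
# e.g. 'visit <a href="http://example.com" rel="nofollow">example.com</a>'

try:
    linker.linkify(b"visit example.com")  # bytes are rejected
except TypeError as exc:
    print(exc)  # argument must be of text type
```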
23 changes: 7 additions & 16 deletions bleach/sanitizer.py
@@ -1,15 +1,12 @@
-from __future__ import unicode_literals

from itertools import chain
import re
+from typing import List
import warnings

-import six
-from six.moves.urllib.parse import urlparse
+from urllib.parse import urlparse
from xml.sax.saxutils import unescape

from bleach import html5lib_shim
-from bleach.utils import alphabetize_attributes, force_unicode


#: List of allowed tags
@@ -37,7 +34,7 @@
}

#: List of allowed styles
-ALLOWED_STYLES = []
+ALLOWED_STYLES: List[str] = []

#: List of allowed protocols
ALLOWED_PROTOCOLS = ["http", "https", "mailto"]
@@ -55,7 +52,7 @@
INVISIBLE_REPLACEMENT_CHAR = "?"


-class Cleaner(object):
+class Cleaner:
"""Cleaner for cleaning HTML fragments of malicious content

This cleaner is a security-focused function whose sole purpose is to remove
@@ -146,7 +143,7 @@ def __init__(
resolve_entities=False,
# Bleach has its own sanitizer, so don't use the html5lib one
sanitize=False,
-# Bleach sanitizer alphabetizes already, so don't use the html5lib one
+# clean preserves attr order
alphabetical_attributes=False,
)

@@ -160,7 +157,7 @@ def clean(self, text):
:raises TypeError: if ``text`` is not a text type

"""
-if not isinstance(text, six.string_types):
+if not isinstance(text, str):
message = (
"argument cannot be of '{name}' type, must be of text type".format(
name=text.__class__.__name__
@@ -171,8 +168,6 @@
if not text:
return ""

-text = force_unicode(text)

dom = self.parser.parseFragment(text)
filtered = BleachSanitizerFilter(
source=self.walker(dom),
@@ -363,10 +358,6 @@ def sanitize_token(self, token):
return None

else:
if "data" in token:
# Alphabetize the attributes before calling .disallowed_token()
# so that the resulting string is stable
token["data"] = alphabetize_attributes(token["data"])
return self.disallowed_token(token)

elif token_type == "Comment":
@@ -557,7 +548,7 @@ def allow_token(self, token):
# At this point, we want to keep the attribute, so add it in
attrs[namespaced_name] = val

token["data"] = alphabetize_attributes(attrs)
token["data"] = attrs

return token

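The matching check in `Cleaner.clean` reports the offending type by name, as the diff above shows. A quick sketch:

```python
import bleach

print(bleach.clean("<em>fine</em>"))  # '<em>fine</em>'

try:
    bleach.clean(b"<em>bytes</em>")
except TypeError as exc:
    print(exc)
# argument cannot be of 'bytes' type, must be of text type
```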
42 changes: 0 additions & 42 deletions bleach/utils.py

This file was deleted.

4 changes: 2 additions & 2 deletions docs/clean.rst
@@ -173,7 +173,7 @@ attributes for specified tags:

.. doctest::

->>> from six.moves.urllib.parse import urlparse
+>>> from urllib.parse import urlparse
>>> import bleach

>>> def allow_src(tag, name, value):
@@ -371,7 +371,7 @@ Trivial Filter example:
>>> cleaner = Cleaner(tags=TAGS, attributes=ATTRS, filters=[MooFilter])
>>> dirty = 'this is cute! <img src="http://example.com/puppy.jpg" rel="nofollow">'
>>> cleaner.clean(dirty)
'this is cute! <img rel="moo" src="moo">'
'this is cute! <img src="moo" rel="moo">'


.. Warning::
Expand Down