Refactor imap_utf7 module to make it easier to understand and fix issue

mjs#187
mlorant · Oct 10, 2017 · 9b9bc02 · 9b9bc02
1 parent 6cc77d7
commit 9b9bc02
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 65 deletions.
diff --git a/doc/src/releases.rst b/doc/src/releases.rst
@@ -6,7 +6,7 @@
 
 Added
 -----
-- Connection and read/write operations timeout can now be distinct, 
+- Connection and read/write operations timeout can now be distinct,
   using `imapclient.SocketTimeout` namedtuple as `timeout` parameter.
 - A context manager is introduced to automatically close connections to remote
   servers.
@@ -20,6 +20,12 @@ Changed
 - More precise exceptions available in `imapclient.exceptions` are raised when
   an error happens
 
+Fixed
+-----
+- Modified UTF-7 encoding function had quirks in its original algorithm,
+  leading to misencoded bytes data in some cases. The algorithm, described
+  in RFC 3501, have been reimplemented to fix #187 and is better documented.
+
 Other
 -----
 - Drop support of OAUTH(1)

diff --git a/imapclient/imap_utf7.py b/imapclient/imap_utf7.py
@@ -1,37 +1,17 @@
-# The contents of this file has been derived code from the Twisted project
-# (http://twistedmatrix.com/). The original author is Jp Calderone.
-
-# Twisted project license follows:
-
-# Permission is hereby granted, free of charge, to any person obtaining
-# a copy of this software and associated documentation files (the
-# "Software"), to deal in the Software without restriction, including
-# without limitation the rights to use, copy, modify, merge, publish,
-# distribute, sublicense, and/or sell copies of the Software, and to
-# permit persons to whom the Software is furnished to do so, subject to
-# the following conditions:
+# This file contains two main methods used to encode and decode UTF-7
+# string, described in the RFC 3501. There are some variations specific
+# to IMAP4rev1, so the built-in Python UTF-7 codec can't be used instead.
 #
-# The above copyright notice and this permission notice shall be
-# included in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
+# The main difference is the shift character (used to switch from ASCII to
+# base64 encoding context), which is & in this modified UTF-7 convention,
+# since + is considered as mainly used in mailbox names. 
+# Other variations and examples can be found in the RFC 3501, section 5.1.3.
 from __future__ import unicode_literals
 
+import binascii
 from six import binary_type, text_type, byte2int, iterbytes, unichr
 
 
-PRINTABLE = set(range(0x20, 0x26)) | set(range(0x27, 0x7f))
-
-# TODO: module needs refactoring (e.g. variable names suck)
-
-
 def encode(s):
     """Encode a folder name using IMAP modified UTF-7 encoding.
 
@@ -41,27 +21,36 @@ def encode(s):
     if not isinstance(s, text_type):
         return s
 
-    r = []
-    _in = []
-
-    def extend_result_if_chars_buffered():
-        if _in:
-            r.extend([b'&', modified_utf7(''.join(_in)), b'-'])
-            del _in[:]
+    res = []
+    b64_buffer = []
+    def consume_b64_buffer(buf):
+        """
+        Consume the buffer by encoding it into a modified base 64 representation
+        and surround it with shift characters & and -
+        """
+        if b64_buffer:
+            res.extend([b'&', base64_utf7_encode(buf), b'-'])
+            del buf[:]
 
     for c in s:
-        if ord(c) in PRINTABLE:
-            extend_result_if_chars_buffered()
-            r.append(c.encode('latin-1'))
-        elif c == '&':
-            extend_result_if_chars_buffered()
-            r.append(b'&-')
+        # printable ascii case should not be modified
+        if 0x20 <= ord(c) <= 0x7e:
+            consume_b64_buffer(b64_buffer)
+            # Special case: & is used as shift character so we need to escape it in ASCII
+            if c == '&':
+                res.append(b'&-')
+            else:
+                res.append(c.encode('ascii'))
+
+        # Bufferize characters that will be encoded in base64 and append them later 
+        # in the result, when iterating over ASCII character or the end of string
         else:
-            _in.append(c)
+            b64_buffer.append(c)
 
-    extend_result_if_chars_buffered()
+    # Consume the remaining buffer if the string finish with non-ASCII characters
+    consume_b64_buffer(b64_buffer)
 
-    return b''.join(r)
+    return b''.join(res)
 
 
 AMPERSAND_ORD = byte2int(b'&')
@@ -75,35 +64,43 @@ def decode(s):
     unicode. If non-bytes/str input is provided, the input is returned
     unchanged.
     """
-
     if not isinstance(s, binary_type):
         return s
 
-    r = []
-    _in = bytearray()
+    res = []
+    # Store base64 substring that will be decoded once stepping on end shift character
+    b64_buffer = bytearray()
     for c in iterbytes(s):
-        if c == AMPERSAND_ORD and not _in:
-            _in.append(c)
-        elif c == DASH_ORD and _in:
-            if len(_in) == 1:
-                r.append('&')
+        # Shift character without anything in buffer -> starts storing base64 substring
+        if c == AMPERSAND_ORD and not b64_buffer:
+            b64_buffer.append(c)
+        # End shift char. -> append the decoded buffer to the result and reset it
+        elif c == DASH_ORD and b64_buffer:
+            # Special case &-, representing "&" escaped
+            if len(b64_buffer) == 1:
+                res.append('&')
             else:
-                r.append(modified_deutf7(_in[1:]))
-            _in = bytearray()
-        elif _in:
-            _in.append(c)
+                res.append(base64_utf7_decode(b64_buffer[1:]))
+            b64_buffer = bytearray()
+        # Still buffering between the shift character and the shift back to ASCII
+        elif b64_buffer:
+            b64_buffer.append(c)
+        # No buffer initialized yet, should be an ASCII printable char
         else:
-            r.append(unichr(c))
-    if _in:
-        r.append(modified_deutf7(_in[1:]))
-    return ''.join(r)
+            res.append(unichr(c))
+
+    # Decode the remaining buffer if any
+    if b64_buffer:
+        res.append(base64_utf7_decode(b64_buffer[1:]))
+
+    return ''.join(res)
 
 
-def modified_utf7(s):
-    s_utf7 = s.encode('utf-7')
-    return s_utf7[1:-1].replace(b'/', b',')
+def base64_utf7_encode(buffer):
+    s = ''.join(buffer).encode('utf-16be')
+    return binascii.b2a_base64(s).rstrip(b'\n=').replace(b'/', b',')
 
 
-def modified_deutf7(s):
+def base64_utf7_decode(s):
     s_utf7 = b'+' + s.replace(b',', b'/') + b'-'
     return s_utf7.decode('utf-7')
diff --git a/tests/test_imap_utf7.py b/tests/test_imap_utf7.py
@@ -22,6 +22,7 @@ class IMAP4UTF7TestCase(unittest.TestCase):
         ['~peter/mail/\u65e5\u672c\u8a9e/\u53f0\u5317',
          b'~peter/mail/&ZeVnLIqe-/&U,BTFw-'],  # example from RFC 2060
         ['\x00foo', b'&AAA-foo'],
+        ['foo\r\n\nbar\n', b'foo&AA0ACgAK-bar&AAo-'] # see imapclient/#187 issue
     ]
 
     def test_encode(self):