Skip to content

Commit

Permalink
pythongh-113028: Correctly memoize str in pickle when escapes added (p…
Browse files Browse the repository at this point in the history
…ythonGH-113436)

This fixes a divergence between the Python and C implementations of pickle
for protocol 0, such that pickle.py fails to re-use the first pickled
representation of strings involving characters that have to be escaped.
  • Loading branch information
jeff5 committed Dec 24, 2023
1 parent 894f0e5 commit 0839863
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 7 deletions.
14 changes: 7 additions & 7 deletions Lib/pickle.py
Expand Up @@ -857,13 +857,13 @@ def save_str(self, obj):
else:
self.write(BINUNICODE + pack("<I", n) + encoded)
else:
obj = obj.replace("\\", "\\u005c")
obj = obj.replace("\0", "\\u0000")
obj = obj.replace("\n", "\\u000a")
obj = obj.replace("\r", "\\u000d")
obj = obj.replace("\x1a", "\\u001a") # EOF on DOS
self.write(UNICODE + obj.encode('raw-unicode-escape') +
b'\n')
# Escape what raw-unicode-escape doesn't, but memoize the original.
tmp = obj.replace("\\", "\\u005c")
tmp = tmp.replace("\0", "\\u0000")
tmp = tmp.replace("\n", "\\u000a")
tmp = tmp.replace("\r", "\\u000d")
tmp = tmp.replace("\x1a", "\\u001a") # EOF on DOS
self.write(UNICODE + tmp.encode('raw-unicode-escape') + b'\n')
self.memoize(obj)
dispatch[str] = save_str

Expand Down
8 changes: 8 additions & 0 deletions Lib/test/pickletester.py
Expand Up @@ -1825,6 +1825,14 @@ def test_unicode_high_plane(self):
t2 = self.loads(p)
self.assert_is_copy(t, t2)

def test_unicode_memoization(self):
    # Pickling the same str twice in one container must yield a single
    # shared object on load, including for strings that contain
    # characters needing escapes (newline, backslash, carriage return).
    samples = ('', 'xyz', 'xyz\n', 'x\\yz', 'x\xa1yz\r')
    for proto in protocols:
        for text in samples:
            pickled = self.dumps((text, text), proto)
            first, second = self.loads(pickled)
            self.assertIs(first, second)

def test_bytes(self):
for proto in protocols:
for s in b'', b'xyz', b'xyz'*100:
Expand Down
@@ -0,0 +1,6 @@
When a second reference to a string appears in the input to :mod:`pickle`,
and the Python implementation is in use,
we are guaranteed that a single copy gets pickled
and a single object is shared when reloaded.
Previously, in protocol 0, when a string contained certain characters
(e.g. newline) it resulted in duplicate objects.

0 comments on commit 0839863

Please sign in to comment.