Permalink
Browse files

Do utf-8 encoding of \u1234 in pure Python.

We want to remove the dependency on Python unicode objects.

Co-authored-by: Emanuel Geromin <egeromin@users.noreply.github.com>
  • Loading branch information...
Andy Chu and egeromin committed Jun 5, 2018
1 parent 17cf4c9 commit 4f19662c0c930ea09e4e5fa79a7fcae258529434
Showing with 70 additions and 3 deletions.
  1. +34 −1 core/word_compile.py
  2. +29 −0 core/word_compile_test.py
  3. +1 −0 native/libc_test.py
  4. +6 −2 spec/builtin-io.test.sh
View
35 core/word_compile.py 100644 → 100755
@@ -30,6 +30,38 @@
'"': '"', # not sure why this is escaped within $''
}
def Utf8Encode(code):
    """Return the utf-8 encoding of a unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: Integer code point.

    Returns:
      A str holding one character per encoded byte (built with chr(), so it
      is a byte string under Python 2).  Code points outside the range
      0..0x10FFFF, including negative values, yield the encoding of U+FFFD,
      the unicode replacement character.
    """
    if code < 0 or code > 0x10FFFF:
        # Out of range in either direction.  Negative values must be caught
        # here; otherwise they would pass the `<= 0x7F` test below and be
        # emitted as an unrelated byte.
        bytes_ = [0xEF, 0xBF, 0xBD]  # unicode replacement character
    elif code <= 0x7F:
        bytes_ = [code]  # ASCII is encoded as itself
    else:
        # Pick the number of continuation bytes and the matching lead-byte
        # prefix (110xxxxx, 1110xxxx, 11110xxx).
        if code <= 0x7FF:
            num_cont_bytes, lead_prefix = 1, 0xC0
        elif code <= 0xFFFF:
            num_cont_bytes, lead_prefix = 2, 0xE0
        else:
            num_cont_bytes, lead_prefix = 3, 0xF0

        # Emit continuation bytes (10xxxxxx) from the low 6 bits upward, so
        # the list is built in reverse order.
        bytes_ = []
        for _ in range(num_cont_bytes):
            bytes_.append(0x80 | (code & 0x3F))
            code >>= 6

        # The range checks above guarantee the remaining bits fit in the
        # payload of the lead byte, so no extra masking is needed.
        bytes_.append(lead_prefix | code)
        bytes_.reverse()

    return "".join(chr(b) for b in bytes_)
# TODO: Strict mode syntax errors:
#
# \x is a syntax error -- needs two digits (It's like this in C)
@@ -79,7 +111,8 @@ def EvalCStringToken(id_, value):
elif id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
s = value[2:]
i = int(s, 16)
return unichr(i).encode('utf-8') # Stay in the realm of bytes
#util.log('i = %d', i)
return Utf8Encode(i)
else:
raise AssertionError
View
@@ -0,0 +1,29 @@
#!/usr/bin/python -S
"""
word_compile_test.py: Tests for word_compile.py
"""
import unittest
from core import word_compile # module under test
class WordCompileTest(unittest.TestCase):
    """Unit tests for functions in the word_compile module."""

    def testUtf8Encode(self):
        """Check Utf8Encode against known utf-8 encodings."""
        # Each entry pairs the expected encoded string with the code point
        # that should produce it.
        table = (
            (u'\u0065'.encode('utf-8'), 0x0065),
            (u'\u0100'.encode('utf-8'), 0x0100),
            (u'\u1234'.encode('utf-8'), 0x1234),
            (u'\U00020000'.encode('utf-8'), 0x00020000),
            # Out of range gives Unicode replacement character.
            ('\xef\xbf\xbd', 0x10020000),
        )

        for entry in table:
            want, cp = entry
            # Log the case first so a failure is easy to locate in the output.
            print('')
            print('Utf8Encode case %r %r' % (want, cp))
            got = word_compile.Utf8Encode(cp)
            self.assertEqual(want, got)
# Run the test suite when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()
View
@@ -71,6 +71,7 @@ def testRegex(self):
]
for pat, s, expected in cases:
print('CASE %s' % pat)
actual = libc.regex_match(pat, s)
self.assertEqual(expected, actual)
View
@@ -124,15 +124,19 @@ flags='-e'
case $SH in */dash) flags='' ;; esac
echo $flags 'abcd\u0065f'
# stdout-json: "abcdef\n"
## STDOUT:
abcdef
## END
# N-I dash/ash stdout-json: "abcd\\u0065f\n"
### echo -e with 8 digit unicode escape
flags='-e'
case $SH in */dash) flags='' ;; esac
echo $flags 'abcd\U00000065f'
# stdout-json: "abcdef\n"
## STDOUT:
abcdef
## END
# N-I dash/ash stdout-json: "abcd\\U00000065f\n"
### \0377 is the highest octal byte

0 comments on commit 4f19662

Please sign in to comment.