Permalink
Browse files

Move Utf8Encode to libstr, in preparation for decoding.

  • Loading branch information...
Andy Chu
Andy Chu committed Jun 24, 2018
1 parent 294f54f commit 5441874f19502195e8a602e74a322e0895a47664
Showing with 52 additions and 48 deletions.
  1. +34 −0 core/libstr.py
  2. +15 −0 core/libstr_test.py
  3. +2 −34 core/word_compile.py
  4. +1 −14 core/word_compile_test.py
View
@@ -22,6 +22,40 @@
log = util.log
e_die = util.e_die
def Utf8Encode(code):
  """Return utf-8 encoded bytes from a unicode code point.

  Based on https://stackoverflow.com/a/23502707

  Args:
    code: Integer code point to encode.

  Returns:
    A str holding one character per encoded byte (a byte string under
    Python 2).  Code points outside 0..0x10FFFF, including negative values,
    yield the encoding of U+FFFD, the unicode replacement character.

  Surrogate code points (0xD800-0xDFFF) are not special-cased; they are
  encoded as three bytes like any other value in that range.
  """
  # Check both bounds up front.  A negative value would otherwise pass the
  # ASCII test below and be masked into an unrelated 7-bit character.
  if code < 0 or code > 0x10FFFF:
    return '\xEF\xBF\xBD'  # unicode replacement character

  if code <= 0x7F:
    return chr(code)  # ASCII
  elif code <= 0x7FF:
    num_cont_bytes = 1
  elif code <= 0xFFFF:
    num_cont_bytes = 2
  else:
    num_cont_bytes = 3

  # Continuation bytes are produced lowest-order first: each one carries six
  # payload bits under a 10xxxxxx marker.  The list is reversed at the end.
  bytes_ = []
  for _ in range(num_cont_bytes):
    bytes_.append(0x80 | (code & 0x3F))
    code >>= 6

  # Leading byte: (0x1E << (6-n)) & 0xFF gives the 110/1110/11110 length
  # prefix, and the second mask keeps whatever payload bits still fit.
  b = (0x1E << (6-num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
  bytes_.append(b)
  bytes_.reverse()

  # mod 256 because Python ints don't wrap around!
  return ''.join(chr(b & 0xFF) for b in bytes_)
# Implementation without Python regex:
#
# (1) PatSub: I think we fill in GlobToExtendedRegex, then use regcomp and
View
@@ -11,6 +11,21 @@
class LibStrTest(unittest.TestCase):
def testUtf8Encode(self):
  """Check libstr.Utf8Encode against Python's own utf-8 encoder."""
  # Each entry is (expected encoded string, code point passed to Utf8Encode).
  expectations = (
      (u'\u0065'.encode('utf-8'), 0x0065),
      (u'\u0100'.encode('utf-8'), 0x0100),
      (u'\u1234'.encode('utf-8'), 0x1234),
      (u'\U00020000'.encode('utf-8'), 0x00020000),
      # Out of range gives Unicode replacement character.
      ('\xef\xbf\xbd', 0x10020000),
  )

  for want, code_point in expectations:
    print('')
    print('Utf8Encode case %r %r' % (want, code_point))
    got = libstr.Utf8Encode(code_point)
    self.assertEqual(want, got)
def testUnarySuffixOpDemo(self):
print(libstr)
View
@@ -7,6 +7,7 @@
"""
from core import util
from core import libstr
from osh.meta import Id
from osh.meta import runtime
@@ -31,39 +32,6 @@
}
def Utf8Encode(code):
  """Return utf-8 encoded bytes from a unicode code point.

  Based on https://stackoverflow.com/a/23502707

  Args:
    code: Integer code point to encode.

  Returns:
    A str holding one character per encoded byte (a byte string under
    Python 2).  Code points outside 0..0x10FFFF, including negative values,
    yield the encoding of U+FFFD, the unicode replacement character.

  Surrogate code points (0xD800-0xDFFF) are not special-cased; they are
  encoded as three bytes like any other value in that range.
  """
  # Check both bounds up front.  A negative value would otherwise pass the
  # ASCII test below and be masked into an unrelated 7-bit character.
  if code < 0 or code > 0x10FFFF:
    return '\xEF\xBF\xBD'  # unicode replacement character

  if code <= 0x7F:
    return chr(code)  # ASCII
  elif code <= 0x7FF:
    num_cont_bytes = 1
  elif code <= 0xFFFF:
    num_cont_bytes = 2
  else:
    num_cont_bytes = 3

  # Continuation bytes are produced lowest-order first: each one carries six
  # payload bits under a 10xxxxxx marker.  The list is reversed at the end.
  bytes_ = []
  for _ in range(num_cont_bytes):
    bytes_.append(0x80 | (code & 0x3F))
    code >>= 6

  # Leading byte: (0x1E << (6-n)) & 0xFF gives the 110/1110/11110 length
  # prefix, and the second mask keeps whatever payload bits still fit.
  b = (0x1E << (6-num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
  bytes_.append(b)
  bytes_.reverse()

  # mod 256 because Python ints don't wrap around!
  return ''.join(chr(b & 0xFF) for b in bytes_)
# TODO: Strict mode syntax errors:
#
# \x is a syntax error -- needs two digits (It's like this in C)
@@ -114,7 +82,7 @@ def EvalCStringToken(id_, value):
s = value[2:]
i = int(s, 16)
#util.log('i = %d', i)
return Utf8Encode(i)
return libstr.Utf8Encode(i)
else:
raise AssertionError
View
@@ -9,21 +9,8 @@
class WordCompileTest(unittest.TestCase):
pass
def testUtf8Encode(self):
  """Check word_compile.Utf8Encode against Python's own utf-8 encoder."""
  # Each entry is (expected encoded string, code point passed to Utf8Encode).
  expectations = (
      (u'\u0065'.encode('utf-8'), 0x0065),
      (u'\u0100'.encode('utf-8'), 0x0100),
      (u'\u1234'.encode('utf-8'), 0x1234),
      (u'\U00020000'.encode('utf-8'), 0x00020000),
      # Out of range gives Unicode replacement character.
      ('\xef\xbf\xbd', 0x10020000),
  )

  for want, code_point in expectations:
    print('')
    print('Utf8Encode case %r %r' % (want, code_point))
    got = word_compile.Utf8Encode(code_point)
    self.assertEqual(want, got)
if __name__ == '__main__':
unittest.main()

0 comments on commit 5441874

Please sign in to comment.